admin 管理员组

文章数量: 1086866

某网站二次元美女图片爬取加破解(plus版) python

import sys
import time
import os
import requests
import re  # 正则表达式,进行文字匹配
from bs4 import BeautifulSoup  # (网页解析,获取数据)
import urllib.request, urllib.error  # 制定URL,获取网页数据,urllib.request urllib.error
import sqlite3
import random# UA_LIST = ['Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)', 'Mozilla/5.0 (compatible; U; ABrowse 0.6;  Syllable) AppleWebKit/420+ (KHTML, like Gecko)', 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)', 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR   3.5.30729)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0;   Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;   SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1;   .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)', 'Mozilla/4.0 (compatible; Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729); Windows NT 5.1; Trident/4.0)', 'Mozilla/4.0 (compatible; Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727); Windows NT 5.1; Trident/4.0; Maxthon; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.2)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB6; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)']
'''测试通过,解码方式为gbk'''
'''k = int(input('输入页面数'))
for i in range(2, k+1):ip = random.randint(0, 12)headers = {'user-agent': '自己的ua'}url = '{}.html'.format(i)response = requests.get(url, headers=headers)response.encoding = 'gbk'html = response.texturl_2 = re.findall('<li><a href="(.*?)" target="_blank"><img src=".*?" alt=".*?" /><b>.*?</b></a>', html)print(url_2)'''
#
# url = '.html'#自己增加format会吧,线性的,记得加sleep
# Wallpaper crawler script.
#
# Flow: ask how many list pages to crawl, collect the detail-page links found on
# list pages 2..k, then visit every detail page and save each image it
# references into SAVE_DIR.

# Number of list pages to crawl, read from stdin (raises ValueError if the
# input is not an integer).
k = int(input("请输入爬取页面数:"))

# Detail-page links collected from all list pages.
uum = []

# User-agent strings collected from the web; one is picked at random per
# request so that consecutive requests do not all look identical.
UA_LIST = [
    'Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
    'Mozilla/5.0 (compatible; U; ABrowse 0.6;  Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR   3.5.30729)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0;   Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;   SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1;   .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
    'Mozilla/4.0 (compatible; Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729); Windows NT 5.1; Trident/4.0)',
    'Mozilla/4.0 (compatible; Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727); Windows NT 5.1; Trident/4.0; Maxthon; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.2)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB6; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
]

# NOTE(review): the three URL pieces below carry no scheme or host, so
# requests will reject the resulting URLs as-is. The site address appears to
# have been removed from the published listing -- fill in the real prefixes
# before running.
LIST_PAGE_TEMPLATE = '{}.html'
DETAIL_URL_PREFIX = ''
IMAGE_URL_PREFIX = '/'

# Output directory. The backslashes are escaped explicitly: a single trailing
# backslash would escape the closing quote and leave the literal unterminated.
SAVE_DIR = 'D:\\点击获取资源壁纸破解\\'

# Seconds to wait for a server response; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

# Patterns are compiled once instead of on every loop iteration.
DETAIL_LINK_RE = re.compile(
    '<li><a href="(.*?)" target="_blank"><img src=".*?" alt=".*?" /><b>.*?</b></a>'
)
IMAGE_SRC_RE = re.compile(
    '<img src="(.*?)" data-pic=".*?" alt=".*?" title=".*?"></a>'
)

# Pass 1: walk list pages 2..k and gather the detail-page links.
for page_no in range(2, k + 1):
    headers = {'user-agent': random.choice(UA_LIST)}
    list_url = LIST_PAGE_TEMPLATE.format(page_no)
    list_response = requests.get(list_url, headers=headers, timeout=REQUEST_TIMEOUT)
    list_response.encoding = 'gbk'  # the pages are decoded as GBK
    uum += DETAIL_LINK_RE.findall(list_response.text)
    time.sleep(0.1)  # brief pause between requests

# Pass 2: open every detail page and download the images it references.
for index, detail_path in enumerate(uum):
    print('爬取中{}'.format(index))
    headers = {'user-agent': random.choice(UA_LIST)}
    detail_response = requests.get(
        url=DETAIL_URL_PREFIX + detail_path,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    detail_response.encoding = 'gbk'
    time.sleep(0.1)  # brief pause between requests

    urls = IMAGE_SRC_RE.findall(detail_response.text)
    print(urls)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(SAVE_DIR, exist_ok=True)

    for image_path in urls:
        image_url = IMAGE_URL_PREFIX + image_path
        image_name = image_url.split('/')[-1]  # last path segment is the file name
        image_response = requests.get(image_url, headers=headers, timeout=REQUEST_TIMEOUT)
        with open(os.path.join(SAVE_DIR, image_name), mode='wb') as f:
            f.write(image_response.content)

# Announce completion only when at least one detail page was processed.
if uum:
    print("爬取结束了")

这几天我偶然发现,之前一些网站的高清图地址其实有迹可循,本来想着用异步加 selenium 写个爬虫。结果今天一看,高清图地址就在一段 xpath 里,之前竟然没发现。所以今天赶紧写了一个爬取固定栏目页面的爬虫(比如 4k 风景图、4k 动漫等等),只会爬该栏目的第 1 到 n 页,n 由自己定义,想抓多少抓多少。不过我劝小伙伴们善良一点:网站已经加了验证码,难度上升了,我也加了应对反爬的手段;如果后面实在不行,我会手写一个图像识别。那么,高清大图爬取源代码奉上,如果有帮助,希望小伙伴们点个赞再走,感谢!

效果如下:

本文标签: 某网站二次元美女图片爬取加破解(plus版) python