Scraping and "unlocking" high-res 2D anime girl images from a certain site (plus version), in Python
import sys
import time
import os
import requests
import re  # regular expressions, for text matching
from bs4 import BeautifulSoup  # HTML parsing / data extraction
import urllib.request, urllib.error  # build URLs and fetch page data
import sqlite3
import random
'''Tested and working; the pages decode as gbk.'''
'''
k = int(input('Enter the number of pages: '))
for i in range(2, k + 1):
    ip = random.randint(0, 12)
    headers = {'user-agent': 'your own UA'}
    url = '{}.html'.format(i)
    response = requests.get(url, headers=headers)
    response.encoding = 'gbk'
    html = response.text
    url_2 = re.findall('<li><a href="(.*?)" target="_blank"><img src=".*?" alt=".*?" /><b>.*?</b></a>', html)
    print(url_2)
'''
#
# url = '.html'  # add your own .format() here; the list pages are sequential, and remember to add a sleep
k = int(input("Enter the number of pages to scrape: "))
uum = []  # detail-page URLs collected from the list pages
UA_LIST = [
    'Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
    'Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
    'Mozilla/4.0 (compatible; Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729); Windows NT 5.1; Trident/4.0)',
    'Mozilla/4.0 (compatible; Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727); Windows NT 5.1; Trident/4.0; Maxthon; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.2)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB6; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
]
# UA strings found online
for j in range(2, k + 1):
    ip = random.randint(0, len(UA_LIST) - 1)
    headers = {'user-agent': UA_LIST[ip]}
    # list-page URL pattern; the site's domain is omitted in the original post
    u_2 = '{}.html'.format(j)
    response = requests.get(u_2, headers=headers)
    response.encoding = 'gbk'
    html = response.text
    url_2 = re.findall('<li><a href="(.*?)" target="_blank"><img src=".*?" alt=".*?" /><b>.*?</b></a>', html)
    uum += url_2
    time.sleep(0.1)

for i in range(0, len(uum)):
    ip = random.randint(0, len(UA_LIST) - 1)
    print('Scraping item {}'.format(i))
    url = '' + uum[i]  # prepend the site's base URL, omitted in the original post
    headers = {'user-agent': UA_LIST[ip]}
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'
    time.sleep(0.1)  # leave a little buffer time
    html = response.text
    # html.encode('utf-8')
    # urls = re.findall('<img lazysrc="(.*?)" lazysrc2x=".*?" height="348px" alt=".*?" title=".*?" />', html)
    urls = re.findall('<img src="(.*?)" data-pic=".*?" alt=".*?" title=".*?"></a>', html)
    filename = 'D:\\点击获取资源壁纸破解\\'
    print(urls)
    if not os.path.exists(filename):
        os.mkdir(filename)
    if len(urls) != 0:
        for url in urls:
            url = '/' + url  # prepend the image host, omitted in the original post
            name = url.split('/')[-1]
            response = requests.get(url, headers=headers)
            with open(filename + name, mode='wb') as f:
                f.write(response.content)
    if i == len(uum) - 1:
        print("Scraping finished")
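As an aside that is not part of the original script: the download step above writes whatever the server returns straight to disk, so once the site's CAPTCHA kicks in you can end up saving error pages as "images". Below is a minimal, hypothetical sketch of a more defensive download helper, assuming the same UA_LIST; the function name, retry count, and timeout are my own placeholders rather than anything from the original post.

```python
import os
import time
import random
import requests

def download_image(img_url, out_dir, ua_list, retries=3, timeout=10):
    """Hypothetical helper: fetch one image with a random UA, retrying on failure."""
    os.makedirs(out_dir, exist_ok=True)  # creates nested directories, unlike os.mkdir
    name = img_url.split('/')[-1]
    for attempt in range(retries):
        try:
            headers = {'user-agent': random.choice(ua_list)}
            resp = requests.get(img_url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # raise on 4xx/5xx instead of saving an error page
            with open(os.path.join(out_dir, name), 'wb') as f:
                f.write(resp.content)
            return True
        except requests.RequestException:
            time.sleep(1 + attempt)  # back off a little before retrying
    return False
```

Calling raise_for_status() makes a blocked or missing image fail loudly instead of silently producing a corrupt file.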
Over the past few days I noticed by chance that the high-res image URLs on some sites I had looked at before follow a recognizable pattern. I had been planning to write a crawler with async code and Selenium, but when I looked again today I realized the high-res image URL was sitting right inside an XPath node that I had somehow missed. So today I quickly wrote a crawler for fixed category pages: 4K landscapes, 4K anime, and so on. It only crawls pages 1 through n of that category, where n is whatever you set, so you can grab as much as you want. Still, please be kind to the site: it has already added a CAPTCHA, which raises the difficulty, and I have added some anti-blocking measures of my own; if that eventually stops working I will hand-write an image-recognition step. With that, the source code for scraping the high-res images is above. If it helps, please give it a like before you go. Thanks!
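Since the post says the full-resolution URL turned out to be sitting inside an XPath node, here is a minimal sketch of that idea using lxml instead of regular expressions. The detail-page URL, the XPath expression, and the CSS class in it are placeholders, assumptions standing in for whatever the real page uses:

```python
import random
import requests
from lxml import etree

def extract_hires_url(detail_page_url, ua_list):
    """Hypothetical sketch: pull the full-size image URL out of a detail page via XPath."""
    headers = {'user-agent': random.choice(ua_list)}
    resp = requests.get(detail_page_url, headers=headers, timeout=10)
    resp.encoding = 'gbk'  # the post says the site is gbk-encoded
    tree = etree.HTML(resp.text)
    # Placeholder XPath: replace with the node that actually holds the high-res link.
    matches = tree.xpath('//div[@class="photo-pic"]//img/@src')
    return matches[0] if matches else None
```

Swapping re.findall for an XPath query like this tends to survive small layout changes better, though the expression still has to be adjusted to the actual page.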
The result looks like this (screenshot in the original post):