Python爬取中国知网文献、参考文献、引证文献-Linux大棚

admin 管理员组

文章数量: 1184232

2024年3月7日发(作者：define是什么意思中文)

= ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '').replace('.', '').replace(',', ';') res = ('[(.*?)] (.*)', text_info) article_type, author = (1), (2) e_type = article_e('[', '').replace(']', '') = e(';', '; ') l = ('a[2]/text()')[0] = ('a[3]/text()')[0][:4] except IndexError: text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '') try: res = ('(.*?)[(.*?)]. (.*).(.*?).(d{4})', text_info) = (1) e_type = (2) = (3).replace(',', '; ') l = (4) = (5) except AttributeError: try: res = ('(.*?)[(.*?)]. (.*?).(d{4})', text_info) = (1) e_type = (2) l = (3) = (4) except AttributeError: continue elif db_code == 'CDFD' or db_code == 'CMFD': try: = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ''.join(('text()')).replace('n', '').replace(' ', '') # try: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) = (2) = (3) ution = ('a[2]/text()')[0] except IndexError: continue elif db_code == 'CPFD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '') res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) l = (3) = (2) = (4) elif db_code == 'SSJD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '').replace(' ', '') try: res = ('[(.*?)] . (.*).(.*?) .(d{4})', info) e_type = (1) l = (3) = (2).replace(',', '; ') = (4) except AttributeError: res = ('[(.*?)] . (.*?) .(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'CRLDENG': try: = ('a[1]/text()')[0] info = ('text()')[0].replace('n', '').replace(' ', '').replace(' ', '') try: res = ('. (.*?). (.*?). 
(d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError: try: res = ('. (.*). (.*?) (d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError: try: res = ('. (.*). (.*?).', info) = (1).replace(',', '; ') l = (2) except AttributeError: try: res = (' (.*). (.*?). (d{4})', info) = (1) = (2) = (3) except AttributeError: try: res = ('.(.*?). (d{4})', info) = (1) = (2) except AttributeError: try: = ('(d{4})', info).group(1) except AttributeError: = ('.') except IndexError:

def download_paper_page(row): index, url, dir_path= row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' response = (url) _html(, file_path) if _file_size(file_path=file_path) < 5: print(f'{file_path}t下载失败') _(f'{index},{url},{dir_path}n') else: print(f'{file_path}t下载完成') with ThreadPoolExecutor() as pool: (download_paper_page, refer_) with ThreadPoolExecutor() as pool: (download_paper_page, cited_) _() def parse_refer_cited_detail_info(self): _f = open('', mode='w') refer_data = read_excel('refer_') refer_data['dir_path'] = _paper_dir cited_data = read_excel('cited_') cited_data['dir_path'] = _paper_dir def parse_paper_page(row): index, url, dir_path = row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' try: text = _html(file_path) response = HTML(text) try: title = ('//div[@class="wxTitle"]/h2[@class="title"]/text()')[0] institution = '; '.join(('//div[@class="orgn"]/span/a/text()')) except IndexError: return try: summary = ('//span[@id="ChDivSummary"]/text()')[0] except IndexError: summary = '' keywords = ' '.join([() for word in ( '//label[@id="catalog_KEYWORD"]/following-sibling::a/text()')]).strip(';') try: cls_num = ('//label[@id="catalog_ZTCLS"]/parent::p/text()')[0] except IndexError: cls_num = '' if "refer" in dir_path: e("update reference_article set title=?, summary=?, institution=?, keywords=?, cls_num=? where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) elif "cited" in dir_path: e("update cited_article set title=?, summary=?, institution=?, keywords=?, cls_num=? 
where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) print(f'{file_path} 更新完毕') except Exception as e: print(f'{url} 更新失败', e) _exc() _(f'{index},{url},{file_path}n') refer_(parse_paper_page, axis=1) cited_(parse_paper_page, axis=1) def get_refer_cited_paper_page(self): def download_refer_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query if "refer" in dir_path: file_path = f'{_refer}/{index}.html' elif "cited" in dir_path: file_path = f'{_refer}/{index}.html' refer_url = f"/kcms/detail/frame/?{query}&RefType=1&vl=" try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (refer_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为：{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e) # _exc() def download_cited_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query cited_url = f"/kcms/detail/frame/?{query}&RefType=3&vl=" if "refer" in dir_path: file_path = f'{_cited}/{index}.html' elif "cited" in dir_path: file_path = f'{_cited}/{index}.html' try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (cited_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为：{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e) # _exc() refer_list = [] cited_list = [] with open('') as f: for line in f: row = ().split(',') if 'html/refer_paper' in line: refer_(row) if 'html/cited_paper' in line: cited_(row) if len(refer_list) > 0 or len(cited_list) > 0: _f = open('', mode='w')

= ('em[1]/text()')[0].strip().replace('[', '').replace(']', '') except IndexError: continue if db_code == 'CBBD': info = ('text()')[0].replace('n', '').replace(' ', '') try: res = ('(.*?)[(.*?)].(.*?),(.*?),(d{4})', info) = (1) e_type = (2) l = (3) = (4) = (5) except AttributeError as e: res = ('(.*?)[(.*?)].(.*?),(.*?),', info) = (1) e_type = (2) l = (3) = (4) elif db_code == 'CJFQ': try: = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '').replace('.', '').replace(',', ';') res = ('[(.*?)] (.*)', text_info) article_type, author = (1), (2) e_type = article_e('[', '').replace(']', '') = e(';', '; ') l = ('a[2]/text()')[0] = ('a[3]/text()')[0][:4] except IndexError: text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '') try: res = ('(.*?)[(.*?)]. (.*).(.*?).(d{4})', text_info) = (1) e_type = (2) = (3).replace(',', '; ') l = (4) = (5) except AttributeError: try: res = ('(.*?)[(.*?)]. 
(.*?).(d{4})', text_info) = (1) e_type = (2) l = (3) = (4) except AttributeError: continue elif db_code == 'CDFD' or db_code == 'CMFD': try: = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ''.join(('text()')).replace('n', '').replace(' ', '') # try: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) = (2) = (3) ution = ('a[2]/text()')[0] except IndexError: continue elif db_code == 'CPFD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '') res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) l = (3) = (2) = (4) elif db_code == 'SSJD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '').replace(' ', '') try: res = ('[(.*?)] . (.*).(.*?) .(d{4})', info) e_type = (1) l = (3) = (2).replace(',', '; ') = (4) except AttributeError: res = ('[(.*?)] . (.*?) .(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'CRLDENG': try: = ('a[1]/text()')[0] info = ('text()')[0].replace('n', '').replace(' ', '').replace(' ', '') try: res = ('. (.*?). (.*?). (d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError: try: res = ('. (.*). (.*?) (d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError:

try: res = ('. (.*). (.*?).', info) = (1).replace(',', '; ') l = (2) except AttributeError: try: res = (' (.*). (.*?). (d{4})', info) = (1) = (2) = (3) except AttributeError: try: res = ('.(.*?). (d{4})', info) = (1) = (2) except AttributeError: try: = ('(d{4})', info).group(1) except AttributeError: = ('.') except IndexError: info = ('text()')[0].replace('n', '').replace(' ', '').replace(' ', '') try: res = ('(.*). (.*?). (d{4})', info) = (1).replace(',', '; ') = (2) = (3) except AttributeError: try: res = ('(.*). (d{4})', info) = (1) = (2) except AttributeError: = ('.') elif db_code == 'CCND': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '') try: res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) = (2).replace(',', '; ') l = (3) = (4) except AttributeError: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'CYFD': # XNZS201112009 = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '') res = ('[(.*?)]. 
(.*?).', info) e_type = (1) l = (2) elif db_code == 'IPFD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '') try: res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) = (2).replace(',', '; ') l = (3) = (4) except AttributeError: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'SCPD': # TIRE20130201 = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( ' ', '') res = ('[(.*?)].(.*?).(.*?):(.*?),', info) e_type = (1) = (2).replace(',', '; ') l = (3) _num = (4) else: _(f'{db_code}t{filename}t类型异常n') print(f'{db_code}t{filename}t类型异常n') data_(item.__dict__) def spider_refer_cited_detail_page(self): _f = open('', mode='w') if ("refer_"): refer_data = read_excel('refer_') refer_data['dir_path'] = _paper_dir if len(r(_paper_dir)) > 0: refer_ids = {int(e('.html', '')) for file in r(_paper_dir)} refer_data = refer_data[~refer_data['index'].isin(refer_ids)] else: refer_data = read_sql("select distinct article_url from reference_article where article_url != '';", con=) refer__index(inplace=True) refer__excel('refer_', index=False) refer_data['dir_path'] = _paper_dir

if ('cited_'): cited_data = read_excel('cited_') cited_data['dir_path'] = _paper_dir if len(r(_paper_dir)) > 0: cited_ids = {int(e('.html', '')) for file in r(_paper_dir)} cited_data = cited_data[~cited_data['index'].isin(cited_ids)] else: cited_data = read_sql("select distinct article_url from cited_article where article_url != '';", con=) cited__index(inplace=True) cited__excel('cited_', index=False) cited_data['dir_path'] = _paper_dir def download_paper_page(row): index, url, dir_path= row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' response = (url) _html(, file_path) if _file_size(file_path=file_path) < 5: print(f'{file_path}t下载失败') _(f'{index},{url},{dir_path}n') else: print(f'{file_path}t下载完成') with ThreadPoolExecutor() as pool: (download_paper_page, refer_) with ThreadPoolExecutor() as pool: (download_paper_page, cited_) _() def parse_refer_cited_detail_info(self): _f = open('', mode='w') refer_data = read_excel('refer_') refer_data['dir_path'] = _paper_dir cited_data = read_excel('cited_') cited_data['dir_path'] = _paper_dir def parse_paper_page(row): index, url, dir_path = row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' try: text = _html(file_path) response = HTML(text) try: title = ('//div[@class="wxTitle"]/h2[@class="title"]/text()')[0] institution = '; '.join(('//div[@class="orgn"]/span/a/text()')) except IndexError: return try: summary = ('//span[@id="ChDivSummary"]/text()')[0] except IndexError: summary = '' keywords = ' '.join([() for word in ( '//label[@id="catalog_KEYWORD"]/following-sibling::a/text()')]).strip(';') try: cls_num = ('//label[@id="catalog_ZTCLS"]/parent::p/text()')[0] except IndexError: cls_num = '' if "refer" in dir_path: e("update reference_article set title=?, summary=?, institution=?, keywords=?, cls_num=? where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) elif "cited" in dir_path: e("update cited_article set title=?, summary=?, institution=?, keywords=?, cls_num=? 
where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) print(f'{file_path} 更新完毕') except Exception as e: print(f'{url} 更新失败', e) _exc() _(f'{index},{url},{file_path}n') refer_(parse_paper_page, axis=1) cited_(parse_paper_page, axis=1) def get_refer_cited_paper_page(self): def download_refer_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query if "refer" in dir_path: file_path = f'{_refer}/{index}.html' elif "cited" in dir_path: file_path = f'{_refer}/{index}.html' refer_url = f"/kcms/detail/frame/?{query}&RefType=1&vl=" try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (refer_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为：{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e) # _exc() def download_cited_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query cited_url = f"/kcms/detail/frame/?{query}&RefType=3&vl=" if "refer" in dir_path: file_path = f'{_cited}/{index}.html' elif "cited" in dir_path: file_path = f'{_cited}/{index}.html' try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (cited_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为：{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e)

本文标签：下载请求状态

版权声明：本文标题：《Python爬取中国知网文献、参考文献、引证文献》。本文内容由网友自发贡献，该文观点仅代表作者本人。转载请联系作者并注明出处：http://www.roclinux.cn/b/1709806968a547049.html 。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。

Linux大棚 – 不忘初心的技术博客，浮躁时代的安静角落

Python爬取中国知网文献、参考文献、引证文献

更多相关文章

Ubuntu下的微信登陆挑战与解决，苹果设备用户必读攻略！

快速教程：在文档中无缝添加和修改页眉的实用方法

Word新手必学：一键搞定插入页眉与清除横杠小教程，让你文稿整洁美观。

玩转Windows启动过程：揭秘并轻松处理logonui.exe相关难题

在Adobe Flash Player的世界里寻找答案：Img文件的命运是驻留在内部还是作为外部元素出现在舞台上？

一文精通Redis读写之道：避开穿透风险，提升性能的旁路技巧

360 CS墩不再是负担：卸载技巧分享

CSND用户必看：彻底卸载360的正确姿势

QQ音乐API入门到精通：打造专属音乐库，音乐爱好者必学的技能

QQ音乐API的实用技巧：从封装到调用的全流程

彻底掌握Microsoft Office 2016安装技巧，从入门到精通

安装Office 2016，从新手到专家只需几步

Windows系统下的绝密技巧：DMG文件快速转为ISO步骤

升级Win10卡顿难耐？这些技巧让你轻松飞快！

如何在双硬盘配置下优雅地安装Ubuntu

Windows 11安装全攻略：快速纯净系统搭建教程！

让你的电脑焕然一新：Win11系统安装全攻略，超实用教程

Java程序员新技能：在Windows 11下安装并配置JDK11

CPU 使用率100%的常见原因_开发中cpu100%可能的原因

如何用手机控制电脑或电脑控制手机？_minimouse安卓版

发表评论

推荐文章

即时更新：厂里Flash中心的最热文章，深度剖析Adobe Flash Player的前沿技术

面对Windows.Storage.dll错误，如何有效解决？攻略分享！

如何在移动硬盘上制作一个windows10系统_移动硬盘安装系统win10

如何在windows安装NET Framework 3.5_net framework 3.5百度云

当小米2S遇上Win78系统，数据线连电脑为何不灵？一文助你搞定！

热门文章

SWF文件安全删除法：一招让文件彻底消失

中毒的Adobe Flash Player？一招教你快速修复！

Mac玩家注意！当卸载软件遇到困难，试试这个简单步骤，轻松搞定！

双指滑动功能丢哪儿了？快速修复笔记本触摸板问题！

Win10自动关机设置失效？解决步骤一览！

CPU风扇智能调速软件实战应用指南

DM如何帮你定期自动清理备份文件_dm数据库 定时删除备份

如何解决CPU温度过高_流放2cpu高温

Windows系统合并磁盘分区_windows server2012合盘

Linux mint18通过禁用nouveau解决显示器出现“输入不支持”问题_linux mint 禁用nouveau

最新文章

一文教会你AIX系统备份：mksysb实用指南

SWF文件备份失败？这些步骤让你轻松搞定

Win10系统备份轻松搞定：掌握captureimage命令的关键技巧

Linux系统安全小贴士：掌握备份与恢复，安心每一天

省时省心！三步完成电脑系统高效备份！

Ubuntu系统维护秘籍：备份步骤详解，保护你的劳动成果！

Linux系统不哭：高效备份与快速恢复方案

Ubuntu系统安全大计，备份技巧大公开

GHOST教程：系统备份和还原，小白也能变成高手！

Linux备份与恢复必修课：SWF文件安全策略从入门到精通

Exploring the Finest Accommodations: A Comprehensive Guide to Ruston LA Hotels

The Enchanting Experience of ScaliniTella NYC: A Culinary Gem in the Heart of Manhattan

Exploring the Exquisite Aloft Chicago O'Hare: A Blend of Modern Luxury and Convenience

A Culinary Journey: Discovering the Finest Dining Experiences in Waco, TX

A Culinary Journey: Discovering the Finest Dining Experiences in Athens, GA

电脑设备管理器在哪里？一次让我抓狂又兴奋的寻找经历

与GWX的持久战：一段关于Windows10升级弹窗的私人记忆

以管理员身份运行：那些年我们追过的权限与踩过的坑

DM如何帮你定期自动清理备份文件_dm数据库定时删除备份