admin 管理员组文章数量: 1184232
2024年3月7日发(作者:define是什么意思中文)
= ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '').replace('.', '').replace(',', ';') res = ('[(.*?)] (.*)', text_info) article_type, author = (1), (2) e_type = article_e('[', '').replace(']', '') = e(';', '; ') l = ('a[2]/text()')[0] = ('a[3]/text()')[0][:4] except IndexError: text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '') try: res = ('(.*?)[(.*?)]. (.*).(.*?).(d{4})', text_info) = (1) e_type = (2) = (3).replace(',', '; ') l = (4) = (5) except AttributeError: try: res = ('(.*?)[(.*?)]. (.*?).(d{4})', text_info) = (1) e_type = (2) l = (3) = (4) except AttributeError: continue elif db_code == 'CDFD' or db_code == 'CMFD': try: = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ''.join(('text()')).replace('n', '').replace(' ', '') # try: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) = (2) = (3) ution = ('a[2]/text()')[0] except IndexError: continue elif db_code == 'CPFD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '') res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) l = (3) = (2) = (4) elif db_code == 'SSJD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '').replace('  ', '') try: res = ('[(.*?)] . (.*).(.*?) .(d{4})', info) e_type = (1) l = (3) = (2).replace(',', '; ') = (4) except AttributeError: res = ('[(.*?)] . (.*?) .(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'CRLDENG': try: = ('a[1]/text()')[0] info = ('text()')[0].replace('n', '').replace(' ', '').replace('  ', '') try: res = ('. (.*?). (.*?). (d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError: try: res = ('. (.*). (.*?) (d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError: try: res = ('. (.*). (.*?).', info) = (1).replace(',', '; ') l = (2) except AttributeError: try: res = (' (.*). (.*?). (d{4})', info) = (1) = (2) = (3) except AttributeError: try: res = ('.(.*?). (d{4})', info) = (1) = (2) except AttributeError: try: = ('(d{4})', info).group(1) except AttributeError: = ('.') except IndexError:
def download_paper_page(row): index, url, dir_path= row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' response = (url) _html(, file_path) if _file_size(file_path=file_path) < 5: print(f'{file_path}t下载失败') _(f'{index},{url},{dir_path}n') else: print(f'{file_path}t下载完成') with ThreadPoolExecutor() as pool: (download_paper_page, refer_) with ThreadPoolExecutor() as pool: (download_paper_page, cited_) _() def parse_refer_cited_detail_info(self): _f = open('', mode='w') refer_data = read_excel('refer_') refer_data['dir_path'] = _paper_dir cited_data = read_excel('cited_') cited_data['dir_path'] = _paper_dir def parse_paper_page(row): index, url, dir_path = row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' try: text = _html(file_path) response = HTML(text) try: title = ('//div[@class="wxTitle"]/h2[@class="title"]/text()')[0] institution = '; '.join(('//div[@class="orgn"]/span/a/text()')) except IndexError: return try: summary = ('//span[@id="ChDivSummary"]/text()')[0] except IndexError: summary = '' keywords = ' '.join([() for word in ( '//label[@id="catalog_KEYWORD"]/following-sibling::a/text()')]).strip(';') try: cls_num = ('//label[@id="catalog_ZTCLS"]/parent::p/text()')[0] except IndexError: cls_num = '' if "refer" in dir_path: e("update reference_article set title=?, summary=?, institution=?, keywords=?, cls_num=? where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) elif "cited" in dir_path: e("update cited_article set title=?, summary=?, institution=?, keywords=?, cls_num=? where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) print(f'{file_path} 更新完毕') except Exception as e: print(f'{url} 更新失败', e) _exc() _(f'{index},{url},{file_path}n') refer_(parse_paper_page, axis=1) cited_(parse_paper_page, axis=1) def get_refer_cited_paper_page(self): def download_refer_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query if "refer" in dir_path: file_path = f'{_refer}/{index}.html' elif "cited" in dir_path: file_path = f'{_refer}/{index}.html' refer_url = f"/kcms/detail/frame/?{query}&RefType=1&vl=" try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (refer_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为:{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e) # _exc() def download_cited_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query cited_url = f"/kcms/detail/frame/?{query}&RefType=3&vl=" if "refer" in dir_path: file_path = f'{_cited}/{index}.html' elif "cited" in dir_path: file_path = f'{_cited}/{index}.html' try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (cited_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为:{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e) # _exc() refer_list = [] cited_list = [] with open('') as f: for line in f: row = ().split(',') if 'html/refer_paper' in line: refer_(row) if 'html/cited_paper' in line: cited_(row) if len(refer_list) > 0 or len(cited_list) > 0: _f = open('', mode='w')
= ('em[1]/text()')[0].strip().replace('[', '').replace(']', '') except IndexError: continue if db_code == 'CBBD': info = ('text()')[0].replace('n', '').replace(' ', '') try: res = ('(.*?)[(.*?)].(.*?),(.*?),(d{4})', info) = (1) e_type = (2) l = (3) = (4) = (5) except AttributeError as e: res = ('(.*?)[(.*?)].(.*?),(.*?),', info) = (1) e_type = (2) l = (3) = (4) elif db_code == 'CJFQ': try: = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '').replace('.', '').replace(',', ';') res = ('[(.*?)] (.*)', text_info) article_type, author = (1), (2) e_type = article_e('[', '').replace(']', '') = e(';', '; ') l = ('a[2]/text()')[0] = ('a[3]/text()')[0][:4] except IndexError: text_info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '') try: res = ('(.*?)[(.*?)]. (.*).(.*?).(d{4})', text_info) = (1) e_type = (2) = (3).replace(',', '; ') l = (4) = (5) except AttributeError: try: res = ('(.*?)[(.*?)]. (.*?).(d{4})', text_info) = (1) e_type = (2) l = (3) = (4) except AttributeError: continue elif db_code == 'CDFD' or db_code == 'CMFD': try: = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ''.join(('text()')).replace('n', '').replace(' ', '') # try: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) = (2) = (3) ution = ('a[2]/text()')[0] except IndexError: continue elif db_code == 'CPFD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '') res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) l = (3) = (2) = (4) elif db_code == 'SSJD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = '' info = ('text()')[0].replace('n', '').replace(' ', '').replace('  ', '') try: res = ('[(.*?)] . (.*).(.*?) .(d{4})', info) e_type = (1) l = (3) = (2).replace(',', '; ') = (4) except AttributeError: res = ('[(.*?)] . (.*?) .(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'CRLDENG': try: = ('a[1]/text()')[0] info = ('text()')[0].replace('n', '').replace(' ', '').replace('  ', '') try: res = ('. (.*?). (.*?). (d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError: try: res = ('. (.*). (.*?) (d{4})', info) = (1).replace(',', '; ') l = (2) = (3) except AttributeError:
try: res = ('. (.*). (.*?).', info) = (1).replace(',', '; ') l = (2) except AttributeError: try: res = (' (.*). (.*?). (d{4})', info) = (1) = (2) = (3) except AttributeError: try: res = ('.(.*?). (d{4})', info) = (1) = (2) except AttributeError: try: = ('(d{4})', info).group(1) except AttributeError: = ('.') except IndexError: info = ('text()')[0].replace('n', '').replace(' ', '').replace('  ', '') try: res = ('(.*). (.*?). (d{4})', info) = (1).replace(',', '; ') = (2) = (3) except AttributeError: try: res = ('(.*). (d{4})', info) = (1) = (2) except AttributeError: = ('.') elif db_code == 'CCND': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '') try: res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) = (2).replace(',', '; ') l = (3) = (4) except AttributeError: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'CYFD': # XNZS201112009 = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '') res = ('[(.*?)]. (.*?).', info) e_type = (1) l = (2) elif db_code == 'IPFD': = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '') try: res = ('[(.*?)].(.*?).(.*?).(d{4})', info) e_type = (1) = (2).replace(',', '; ') l = (3) = (4) except AttributeError: res = ('[(.*?)].(.*?).(d{4})', info) e_type = (1) l = (2) = (3) elif db_code == 'SCPD': # TIRE20130201 = ('a[1]/text()')[0] article_url = ('a[1]/@href') if article_url: e_url = '/' + article_url[0] else: e_url = "" info = ''.join(('text()')).strip().replace('n', '').replace(' ', '').replace( '  ', '') res = ('[(.*?)].(.*?).(.*?):(.*?),', info) e_type = (1) = (2).replace(',', '; ') l = (3) _num = (4) else: _(f'{db_code}t{filename}t类型异常n') print(f'{db_code}t{filename}t类型异常n') data_(item.__dict__) def spider_refer_cited_detail_page(self): _f = open('', mode='w') if ("refer_"): refer_data = read_excel('refer_') refer_data['dir_path'] = _paper_dir if len(r(_paper_dir)) > 0: refer_ids = {int(e('.html', '')) for file in r(_paper_dir)} refer_data = refer_data[~refer_data['index'].isin(refer_ids)] else: refer_data = read_sql("select distinct article_url from reference_article where article_url != '';", con=) refer__index(inplace=True) refer__excel('refer_', index=False) refer_data['dir_path'] = _paper_dir
if ('cited_'): cited_data = read_excel('cited_') cited_data['dir_path'] = _paper_dir if len(r(_paper_dir)) > 0: cited_ids = {int(e('.html', '')) for file in r(_paper_dir)} cited_data = cited_data[~cited_data['index'].isin(cited_ids)] else: cited_data = read_sql("select distinct article_url from cited_article where article_url != '';", con=) cited__index(inplace=True) cited__excel('cited_', index=False) cited_data['dir_path'] = _paper_dir def download_paper_page(row): index, url, dir_path= row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' response = (url) _html(, file_path) if _file_size(file_path=file_path) < 5: print(f'{file_path}t下载失败') _(f'{index},{url},{dir_path}n') else: print(f'{file_path}t下载完成') with ThreadPoolExecutor() as pool: (download_paper_page, refer_) with ThreadPoolExecutor() as pool: (download_paper_page, cited_) _() def parse_refer_cited_detail_info(self): _f = open('', mode='w') refer_data = read_excel('refer_') refer_data['dir_path'] = _paper_dir cited_data = read_excel('cited_') cited_data['dir_path'] = _paper_dir def parse_paper_page(row): index, url, dir_path = row[0], row[1], row[2] file_path = f'{dir_path}/{index}.html' try: text = _html(file_path) response = HTML(text) try: title = ('//div[@class="wxTitle"]/h2[@class="title"]/text()')[0] institution = '; '.join(('//div[@class="orgn"]/span/a/text()')) except IndexError: return try: summary = ('//span[@id="ChDivSummary"]/text()')[0] except IndexError: summary = '' keywords = ' '.join([() for word in ( '//label[@id="catalog_KEYWORD"]/following-sibling::a/text()')]).strip(';') try: cls_num = ('//label[@id="catalog_ZTCLS"]/parent::p/text()')[0] except IndexError: cls_num = '' if "refer" in dir_path: e("update reference_article set title=?, summary=?, institution=?, keywords=?, cls_num=? where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) elif "cited" in dir_path: e("update cited_article set title=?, summary=?, institution=?, keywords=?, cls_num=? where article_url=?", params=(title, summary, institution, keywords, cls_num, url)) print(f'{file_path} 更新完毕') except Exception as e: print(f'{url} 更新失败', e) _exc() _(f'{index},{url},{file_path}n') refer_(parse_paper_page, axis=1) cited_(parse_paper_page, axis=1) def get_refer_cited_paper_page(self): def download_refer_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query if "refer" in dir_path: file_path = f'{_refer}/{index}.html' elif "cited" in dir_path: file_path = f'{_refer}/{index}.html' refer_url = f"/kcms/detail/frame/?{query}&RefType=1&vl=" try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (refer_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为:{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e) # _exc() def download_cited_num(row): index, url, dir_path = row[0], row[1], row[2] query = urlparse(url).query cited_url = f"/kcms/detail/frame/?{query}&RefType=3&vl=" if "refer" in dir_path: file_path = f'{_cited}/{index}.html' elif "cited" in dir_path: file_path = f'{_cited}/{index}.html' try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36', 'Referer': f'/KCMS/detail/?{query}', } response = (cited_url, headers=headers) if _code == 200: _html(, file_path) print(f'{file_path}t下载完成') else: raise Exception(f"请求异常, 状态码为:{_code}") except Exception as e: _(f'{index},{url},{dir_path}n') print(f'{url}t下载失败', e)
版权声明:本文标题:Python爬取中国知网文献、参考文献、引证文献 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.roclinux.cn/b/1709806968a547049.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论