

Scraping Boss Zhipin (boss直聘) job listings with the pyspider crawler framework

Requirements

Requirements:
1. Traverse every job category on the home page.
2. Follow each category into its listing page and, region by region, scrape the job title, monthly salary, experience requirement, education requirement, hiring company, industry, funding round, headcount (company size), and posting date.
3. Follow each listing into the job detail page and scrape its skill tags.

A minimal sketch of this three-step crawl chain follows; the full implementation is in the Code section below.
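Concretely, the chain works the way pyspider crawls usually do: each callback schedules the next level with self.crawl and threads the accumulated fields through save=, which the next callback reads back from response.save. This is only a schematic with placeholder URLs and selectors, not Boss Zhipin's real markup:

from pyspider.libs.base_handler import *

class SketchHandler(BaseHandler):
    def on_start(self):
        # Step 1: start from the home page and walk the category menu.
        self.crawl('https://example.com/', callback=self.index_page)

    def index_page(self, response):
        for link in response.doc('a.category').items():   # placeholder selector
            # Step 2: carry the category name into the listing callback.
            self.crawl(link.attr.href, callback=self.listing_page,
                       save={'category': link.text()})

    def listing_page(self, response):
        for job in response.doc('li.job').items():        # placeholder selector
            save = dict(response.save, position=job('h3').text())
            # Step 3: fetch the detail page, keeping all collected fields.
            self.crawl(job('a').attr.href, callback=self.job_page, save=save)

    def job_page(self, response):
        # Everything accumulated along the way arrives in response.save.
        return dict(response.save, skill=response.doc('.job-sec').text())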

Code

The code is commented throughout. If you do not have a proxy, use it with caution.

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-06 10:40:07
# Project: boss_recruit

from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient

# NOTE: get_proxy() is used throughout but its definition did not survive
# in the original post; a sketch of it follows after this listing.

# Connect to the offline database.
# The admin database holds the account: connect, authenticate, then switch DBs.
# The original connection line was lost; this URI is a placeholder to fill in.
client = MongoClient('mongodb://user:password@localhost:27017/admin')
DB_NAME = 'research'
DB_COL = 'boss_recruit'
db = client[DB_NAME]
col = db[DB_COL]


class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        },
        "proxy": "http://localhost:6666"
    }

    # The original post left the site root blank; Boss Zhipin's domain
    # (e.g. 'https://www.zhipin.com') has to be prepended for this to crawl.
    url = '/?ka=header-home'

    def format_date(self, date):
        return datetime.datetime.strptime(date, '%Y%m%d')

    @every(minutes=24 * 60)
    def on_start(self):
        print(get_proxy())
        self.crawl(self.url, callback=self.index_page, proxy=get_proxy())

    @config(age=60)
    def index_page(self, response):
        page = response.etree
        base_url = ''  # site root, left blank in the original post
        # All category entries in the home-page job menu
        vocation_list = page.xpath("//div[@class='job-menu']//div[@class='menu-sub']/ul/li")
        for each in vocation_list:
            belong = each.xpath("./h4/text()")[0]
            detail_list = each.xpath("./div[@class='text']/a")
            print(belong)
            for detail in detail_list:
                detail_title = detail.xpath("./text()")[0]
                detail_url = base_url + detail.xpath("./@href")[0]
                # save = {"belonging": [belong, detail_title]}
                save = {"belonging": detail_title}
                print(detail_title, detail_url)
                self.crawl(detail_url, callback=self.detail_page, save=save, proxy=get_proxy())

    @config(age=60)
    def detail_page(self, response):
        page = response.etree
        base_url = ''
        # City filter list; [1:] skips the "全国" (nationwide) entry
        city_list = page.xpath("//div[@class='condition-box']/dl[@class='condition-city show-condition-district']/dd/a[@ka]")[1:]
        for each in city_list:
            city_name = each.xpath("./text()")[0]
            city_url = base_url + each.xpath("./@href")[0]
            params = {"ka": each.xpath("./@ka")[0]}
            save = {"city": city_name, "belonging": response.save["belonging"]}
            self.crawl(city_url, callback=self.parse_city, params=params, save=save, proxy=get_proxy())

    @config(age=60)
    def parse_city(self, response):
        page = response.etree
        base_url = ''
        # Districts within this city
        district_list = page.xpath("//div[@class='condition-box']/dl[@class='condition-district show-condition-district']/dd/a[position()>1]")
        for each in district_list:
            district_name = each.xpath("./text()")[0]
            print(district_name)
            district_url = base_url + each.xpath("./@href")[0]
            params = {"ka": each.xpath("./@ka")[0]}
            save = {"district": district_name, "city": response.save["city"],
                    "belonging": response.save["belonging"]}
            self.crawl(district_url, callback=self.parse_district, params=params, save=save, proxy=get_proxy())

    @config(age=60)
    def parse_district(self, response):
        page = response.etree
        base_url = ''
        # Areas within this district
        area_list = page.xpath("//div[@class='condition-box']/dl[@class='condition-area show-condition-area']/dd/a[position()>1]")
        for each in area_list:
            area_name = each.xpath("./text()")[0]
            print(area_name)
            area_url = base_url + each.xpath("./@href")[0]
            params = {"ka": each.xpath("./@ka")[0]}
            save = {"area": area_name, "district": response.save["district"],
                    "city": response.save["city"], "belonging": response.save["belonging"],
                    "base_url": area_url, "page_num": 1}
            self.crawl(area_url, callback=self.parse_content, params=params, save=save, proxy=get_proxy())

    @config(age=60)
    def parse_page(self, response):
        # Pagination helper (unused; parse_content paginates incrementally instead)
        page = response.etree
        page_url = response.save.pop("base_url")
        page_num = 10
        print(page_url)
        for each in range(1, page_num + 1):
            ka = 'page-{}'.format(each)
            params = {"page": each, "ka": ka}
            self.crawl(page_url, callback=self.parse_content, params=params, save=response.save)

    @config(age=60)
    def parse_content(self, response):
        page = response.etree
        base_url = ''
        page_url = response.save.get("base_url")
        # Job listing entries
        content_list = page.xpath("//div[@class='job-list']/ul/li")
        # Bail out if the page has no listings
        if content_list == []:
            return
        for each in content_list:
            # Job title
            position_name = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/div[@class='job-title']/text()")[0]
            # Salary
            salary = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/span/text()")[0]
            # Experience requirement
            experience = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/p//text()")[1]
            # Education requirement
            education = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/p//text()")[2]
            # Company
            company = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/h3[@class='name']/a/text()")[0]
            # Funding round is only present when the company line has three fields
            company_info = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/p//text()")
            if len(company_info) == 3:
                rounds = company_info[1]
                # Company size
                scale = company_info[2]
            else:
                rounds = ''
                # Company size
                scale = company_info[1]
            # Posting date: "昨天" (yesterday), "HH:MM" (today) or "M月D日"
            public_time = each.xpath("./div[@class='job-primary']/div[@class='info-publis']/p/text()")[0]
            if ''.join(re.findall(u'昨天', public_time)):
                public_time = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y%m%d')
            elif ''.join(re.findall(r'\d+:\d+', public_time)):
                public_time = datetime.datetime.now().strftime('%Y%m%d')
            else:
                # Zero-pad month and day so format_date('%Y%m%d') can parse the
                # result; the original concatenation (e.g. '2018' + '8' + '6')
                # produced strings strptime rejects.
                month, day = re.findall(u'(\d+)月(\d+)日', public_time)[0]
                public_time = '2018' + month.zfill(2) + day.zfill(2)
            print(public_time)
            # Link to the job detail page
            position_url = base_url + each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/@href")[0]
            print(position_url)
            save = {"area": response.save["area"], "district": response.save["district"],
                    "city": response.save["city"], "belonging": response.save["belonging"],
                    "position_name": position_name, "salary": salary,
                    "experience": experience, "education": education,
                    "company": company, "rounds": rounds, "scale": scale,
                    "public_time": public_time}
            # Crawl the job detail page for its skill tags
            self.crawl(position_url, callback=self.parse_body, save=save, proxy=get_proxy())
        # Pagination: re-queue the same listing URL with page+1 (up to page 10)
        page_num = response.save.get('page_num')
        print(page_num)
        page_num += 1
        if page_num <= 10:
            ka = 'page-{}'.format(page_num)
            params = {"page": page_num, "ka": ka}
            response.save.update({"page_num": page_num})
            self.crawl(page_url, callback=self.parse_content, params=params, save=response.save, proxy=get_proxy())

    def parse_body(self, response):
        page = response.etree
        print(response.save["public_time"])
        # Skill / job description section
        skill = ''.join(page.xpath("//div[@class='detail-content']/div[@class='job-sec'][1]//text()")).strip()
        print(skill)
        result = {"skill": skill,
                  "area": response.save["area"], "district": response.save["district"],
                  "city": response.save["city"], "belonging": response.save["belonging"],
                  "position_name": response.save["position_name"],
                  "salary": response.save["salary"],
                  "experience": response.save["experience"],
                  "education": response.save["education"],
                  "company": response.save["company"],
                  "rounds": response.save["rounds"],
                  "scale": response.save["scale"],
                  "public_time": self.format_date(response.save["public_time"]),
                  "update_time": datetime.datetime.now()}
        yield result

    def on_result(self, result):
        if result is None:
            return
        # Upsert on a compound key so re-crawls update rather than duplicate.
        # (update() is deprecated in pymongo 3.x; update_one() is the modern call.)
        update_key = {'position_name': result['position_name'],
                      'public_time': result['public_time'],
                      'city': result['city'],
                      'district': result['district'],
                      'area': result['area'],
                      'company': result['company']}
        col.update(update_key, {'$set': result}, upsert=True)
