scrapy实战2分布式爬取lagou招聘(加入了免费的User-Agent随机动态获取库 fake-useragent 使用方法查看:https://github.com/hellysmile/fake-useragent)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    """One job posting scraped from a Lagou job-detail page.

    Every attribute is a plain ``scrapy.Field()``; the field names are the
    keys the spider fills in and the pipeline reads back.
    """

    # Record id (the spider stores an MD5 of the page URL here).
    obj_id = scrapy.Field()
    # Job title.  NOTE: the misspelling ("positon") is kept on purpose because
    # the spider and the pipeline address the field by this exact key.
    positon_name = scrapy.Field()
    # Work location.
    work_place = scrapy.Field()
    # Publication date of the posting.
    publish_time = scrapy.Field()
    # Salary.
    salary = scrapy.Field()
    # Required work experience.
    work_experience = scrapy.Field()
    # Required education level.
    education = scrapy.Field()
    # Employment type (full time / part time).
    full_time = scrapy.Field()
    # Tags / labels attached to the posting.
    tags = scrapy.Field()
    # Company name.
    company_name = scrapy.Field()
    # Industry (currently disabled):
    # industry = scrapy.Field()
    # Job perks advertised with the position.
    job_temptation = scrapy.Field()
    # Job description.
    job_desc = scrapy.Field()
    # URL of the company logo image.
    logo_image = scrapy.Field()
    # Business field / sector of the company.
    field = scrapy.Field()
    # Development (funding) stage of the company.
    stage = scrapy.Field()
    # Company size.
    company_size = scrapy.Field()
    # Company home page.
    home = scrapy.Field()
    # Person who published the job.
    job_publisher = scrapy.Field()
    # Investing organisations.
    financeOrg = scrapy.Field()
    # Time at which the page was crawled.
    crawl_time = scrapy.Field()
spiders>lagou.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from LaGou.items import LagouItem
from LaGou.utils.MD5 import get_md5
from datetime import datetime


class LagouSpider(CrawlSpider):
    """Crawl Lagou listing pages and scrape each linked job-detail page.

    Listing pages (``/zhaopin/<n>``) are only followed; job pages
    (``/jobs/<id>.html``) are handed to :meth:`parse_item`.
    """

    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com/zhaopin/']

    # Dots are escaped so that "." matches only a literal dot in the URL.
    content_links = LinkExtractor(allow=r"https://www\.lagou\.com/jobs/\d+\.html")
    page_links = LinkExtractor(allow=r"https://www\.lagou\.com/zhaopin/\d+")

    rules = (
        Rule(content_links, callback="parse_item", follow=False),
        Rule(page_links, follow=True),
    )

    @staticmethod
    def _first(response, query):
        """Return the first string matched by *query*, or "" if nothing matches."""
        return response.xpath(query).extract_first(default="")

    @staticmethod
    def _joined(response, query, separator=""):
        """Return every string matched by *query* joined with *separator*."""
        return separator.join(response.xpath(query).extract())

    def parse_item(self, response):
        """Build a ``LagouItem`` from a job-detail page.

        Args:
            response: the downloaded job-detail page.

        Yields:
            One populated ``LagouItem``.  Nothing is yielded when the page has
            no job title; such a page is logged and skipped instead of
            raising ``IndexError``.  Other missing single-value fields are
            stored as empty strings.
        """
        position_name = self._first(
            response, '//div[@class="job-name"]//span[@class="name"]/text()')
        if not position_name:
            self.logger.warning(
                "No job title found on %s; skipping page", response.url)
            return

        def request_span(index):
            # The n-th <span> of the "job_request" block (salary, place, ...).
            return self._first(
                response, '//dd[@class="job_request"]//span[%d]/text()' % index)

        item = LagouItem()
        # MD5 of the page URL serves as the record id.
        item["obj_id"] = get_md5(response.url)
        # Company name (taken from the logo's alt text).
        item["company_name"] = self._first(
            response, '//dl[@class="job_company"]//a/img/@alt')
        # Job title.
        item["positon_name"] = position_name
        # Salary.
        item["salary"] = request_span(1)
        # Work location, experience and education carry "/" separators.
        item["work_place"] = request_span(2).replace("/", "")
        item["work_experience"] = request_span(3).replace("/", "")
        item["education"] = request_span(4).replace("/", "")
        # Employment type.
        item["full_time"] = request_span(5)
        # Tags.
        item["tags"] = self._joined(
            response, '//dd[@class="job_request"]//li[@class="labels"]/text()', ",")
        # Publication time.
        item["publish_time"] = self._first(
            response, '//dd[@class="job_request"]//p[@class="publish_time"]/text()')
        # Job perks.
        item["job_temptation"] = self._joined(
            response, '//dd[@class="job-advantage"]/p/text()', ",")
        # Job description, without non-breaking spaces.
        job_desc = self._joined(response, '//dd[@class="job_bt"]/div//p/text()', ",")
        item["job_desc"] = job_desc.replace("\xa0", "").strip()
        # Publisher of the job.
        item["job_publisher"] = self._first(
            response, '//div[@class="publisher_name"]//span[@class="name"]/text()')
        # Company logo address with every "//" removed (as before).
        logo_image = self._first(response, '//dl[@class="job_company"]//a/img/@src')
        item["logo_image"] = logo_image.replace("//", "")
        # Business field.
        item["field"] = self._joined(
            response, '//ul[@class="c_feature"]//li[1]/text()').strip()
        # Development stage.
        item["stage"] = self._joined(
            response, '//ul[@class="c_feature"]//li[2]/text()').strip()
        # Investing organisations ("" when the page lists none).
        finance_org = self._joined(
            response, '//ul[@class="c_feature"]//li[3]/p/text()')
        item["financeOrg"] = finance_org
        # Company size: when an investor entry occupies the third <li>, the
        # size is in the fourth one; otherwise it is in the third.
        size_index = 4 if finance_org else 3
        item["company_size"] = self._joined(
            response, '//ul[@class="c_feature"]//li[%d]/text()' % size_index).strip()
        # Company home page.
        item["home"] = self._first(response, '//ul[@class="c_feature"]//li/a/@href')
        # Crawl time.
        item["crawl_time"] = datetime.now()

        yield item
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class LagouPipeline(object):
    """Insert every scraped item as one row of the ``lagouwang`` MySQL table.

    A single connection is reused for the whole spider run instead of being
    opened and closed once per item.
    """

    # Columns of the INSERT statement, in order.  Each column is filled from
    # the item field with the same name.
    COLUMNS = (
        "obj_id", "company_name", "positon_name", "salary", "work_place",
        "work_experience", "education", "full_time", "tags", "publish_time",
        "job_temptation", "job_desc", "job_publisher", "logo_image", "field",
        "stage", "financeOrg", "company_size", "home", "crawl_time",
    )

    # Parameterized statement: values are always passed separately to execute().
    INSERT_SQL = "insert into lagouwang({}) VALUES ({})".format(
        ",".join(COLUMNS), ",".join(["%s"] * len(COLUMNS)))

    def __init__(self):
        self.con = None

    @staticmethod
    def _connect():
        """Open a connection to the local ``lagou`` database."""
        return pymysql.connect(host="127.0.0.1", user="root", passwd="",
                               db="lagou", charset="utf8")

    def open_spider(self, spider):
        """Open the database connection when the spider starts."""
        self.con = self._connect()

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        if self.con is not None:
            self.con.close()
            self.con = None

    def process_item(self, item, spider):
        """Store *item* in MySQL and pass it on unchanged.

        Args:
            item: the scraped item; it must contain every key in ``COLUMNS``.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.

        Raises:
            KeyError: if the item lacks one of the expected fields.
            Exception: any database error is re-raised after the current
                transaction has been rolled back.
        """
        if self.con is None:
            # Also works when process_item is called without open_spider.
            self.con = self._connect()
        else:
            # Re-establish the connection if the server dropped it meanwhile.
            self.con.ping(reconnect=True)

        values = tuple(item[column] for column in self.COLUMNS)
        cur = self.con.cursor()
        try:
            cur.execute(self.INSERT_SQL, values)
            self.con.commit()
        except Exception:
            self.con.rollback()
            raise
        finally:
            cur.close()
        return item
middlewares.py
from scrapy import signals
import random
#from LaGou.settings import USER_AGENTS
from fake_useragent import UserAgent


class RandomUserAgent(object):
    """Downloader middleware that gives each request a random User-Agent."""

    def __init__(self):
        # Build the UserAgent helper once per middleware instance rather than
        # once per request.
        self.ua = UserAgent()

    def process_request(self, request, spider):
        """Set a random ``User-Agent`` header unless the request already has one.

        Args:
            request: the outgoing request; its headers are modified in place.
            spider: the spider issuing the request (unused).

        Returns:
            None, so the request continues through the remaining middlewares.
        """
        request.headers.setdefault("User-Agent", self.ua.random)
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for LaGou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'LaGou'

SPIDER_MODULES = ['LaGou.spiders']
NEWSPIDER_MODULE = 'LaGou.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'LaGou (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Static User-Agent pool (disabled; middlewares.RandomUserAgent is used instead)
# USER_AGENTS = [
# "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
# "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
# "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
# "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
# "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
# "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
# "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
# "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
# ]

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Distributed crawling: use the scrapy_redis duplicate filter and scheduler,
# and keep the scheduler state between runs.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'LaGou.middlewares.LagouSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'LaGou.middlewares.RandomUserAgent': 1,
    # 'LaGou.middlewares.MyCustomDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    #'LaGou.pipelines.LagouPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
redis数据:

mysql数据:

声明:以上只限于参考学习交流!!!更多:https://github.com/huwei86/spiderlagou
scrapy实战2分布式爬取lagou招聘(加入了免费的User-Agent随机动态获取库 fake-useragent 使用方法查看:https://github.com/hellysmile/fake-useragent)的更多相关文章
- scrapy实战1分布式爬取有缘网(6.22接口已挂):
直接上代码: items.py # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See docu ...
- scrapy基础知识之 CrawlSpiders爬取lagou招聘保存在mysql(分布式):
items.py import scrapy class LagouItem(scrapy.Item): # define the fields for your item here like: # ...
- scrapy实战--登陆人人网爬取个人信息
今天把scrapy的文档研究了一下,感觉有点手痒,就写点东西留点念想吧,也做为备忘录.随意写写,看到的朋友觉得不好,不要喷我哈. 创建scrapy工程 cd C:\Spider_dev\app\scr ...
- scrapy-redis实现爬虫分布式爬取分析与实现
本文链接:http://blog.csdn.net/u012150179/article/details/38091411 一 scrapy-redis实现分布式爬取分析 所谓的scrapy-redi ...
- Scrapy 分布式爬取
由于受到计算机能力和网络带宽的限制,单台计算机运行的爬虫在爬取数据量较大时,需要耗费很长时间.分布式爬取的思想是“人多力量大”,在网络中的多台计算机同时运行程序,共同完成一个大型爬取任务, Scrap ...
- scrapy-redis + Bloom Filter分布式爬取tencent社招信息
scrapy-redis + Bloom Filter分布式爬取tencent社招信息 什么是scrapy-redis 什么是 Bloom Filter 为什么需要使用scrapy-redis + B ...
- scrapy-redis分布式爬取tencent社招信息
scrapy-redis分布式爬取tencent社招信息 什么是scrapy-redis 目标任务 安装爬虫 创建爬虫 编写 items.py 编写 spiders/tencent.py 编写 pip ...
- python-scrapy爬取某招聘网站(二)
首先要准备python3+scrapy+pycharm 一.首先让我们了解一下网站 拉勾网https://www.lagou.com/ 和Boss直聘类似的网址设计方式,与智联招聘不同,它采用普通的页 ...
- 一个scrapy框架的爬虫(爬取京东图书)
我们的这个爬虫设计来爬取京东图书(jd.com). scrapy框架相信大家比较了解了.里面有很多复杂的机制,超出本文的范围. 1.爬虫spider tips: 1.xpath的语法比较坑,但是你可以 ...
随机推荐
- windows常用cmd指令
打开命令行 1.在菜单栏中搜索命令行 2.在文件管理器的Path栏输入cmd,则在当前目录打开命令行 3.Windows+R,输入cmd,回车 ping(网络诊断工具) ping是Windows.Un ...
- 【转载】json 数据 添加 删除 排序
张映 发表于 2014-02-10 分类目录: js/jquery 标签:json, 删除, 排序, 添加 js数据格式和json数据格式,各有各的用处,就个人而言,json更好用一点,js自身的数组 ...
- MVC EF Model First
1 在Models下新建实体数据模型Model.edmx 2 在Model.edmx中点右键建立各个实体,增加Scalar Property 3 空白处点右键,添加关系,勾选增加外键 4 保存Mode ...
- 基于树莓派的微型气象站设计与开发(Windows 10 IoT Core)
前言 树莓派(Raspberry Pi,RPi)是专门为学生计算机编程教育而设计,只有信用卡大小的卡片式电脑,可以运行Linux或者Windows 10 IoT Core操作系统.本文将利用树莓派和U ...
- WPF获取控件内部的ScrollViewer,并控制ScrollViewer操作
//获取内部 ScrollViewer方法 public static T FindVisualChild<T>(DependencyObject obj) where T : Depe ...
- php如何去掉二维数组中重复的元素
$arr=array( "1"=>array("a","b "), "2"=>array("a&q ...
- 关于SetLength报Out of memory的研究及解决办法
关于SetLength报Out of memory的研究及解决办法 最近在做一个GIS系统, 在读GIS数据时采用了动态数组,突然读一个数据时SetLength报错!Out of memory 仔细研 ...
- Win10《芒果TV》更新v3.5.0夏至版:会员尊享蓝光画质,关联本地视频播放
在Win10秋季创意者更新前夕,Win10版<芒果TV>全平台同步更新夏至版v3.5.0,新增会员蓝光画质,关联本地视频播放,进一步提升使用体验. Win10版<芒果TV>V3 ...
- 零元学Expression Blend 4 - Chapter 4元件重复运用的观念
原文:零元学Expression Blend 4 - Chapter 4元件重复运用的观念 本章将教大家Blend元件重复运用的观念,这在Silverlight设计中是非常重要的,另外加码赠送渐层工具 ...
- 事件循环和线程没有必然关系(就像Windows子线程默认没有消息循环一样),模态对话框和事件循环也没有必然关系(QWidget直接就可以)
周末天冷,索性把电脑抱到床上上网,这几天看了 dbzhang800 博客关于 Qt 事件循环的几篇 Blog,发现自己对 Qt 的事件循环有不少误解.从来只看到现象,这次借 dbzhang800 的博 ...