scrapy基础知识之 CrawlSpiders爬取lagou招聘保存在mysql（分布式）：

items.py

 import scrapy

 class LagouItem(scrapy.Item):
     """Fields collected for a single job posting on lagou.com.

     Every attribute below is a plain ``scrapy.Field()`` with no metadata;
     the spider's ``parse_item`` fills them and the MySQL pipeline reads them
     by key, so the key names (including the ``positon_name`` spelling) must
     stay exactly as they are.
     """

     # NOTE: an ``obj_id`` field (an id for the posting) and an ``industry``
     # field are intentionally not declared; they were disabled by the author.

     # --- the position itself -------------------------------------------
     # Job title.
     positon_name = scrapy.Field()
     # Work location.
     work_place = scrapy.Field()
     # Publish date of the posting.
     publish_time = scrapy.Field()
     # Salary.
     salary = scrapy.Field()
     # Required work experience.
     work_experience = scrapy.Field()
     # Required education level.
     education = scrapy.Field()
     # Full-time / part-time marker.
     full_time = scrapy.Field()
     # Tags attached to the posting.
     tags = scrapy.Field()

     # --- company and description -----------------------------------------
     # Company name.
     company_name = scrapy.Field()
     # Perks advertised for the job ("job temptation").
     job_temptation = scrapy.Field()
     # Job description text.
     job_desc = scrapy.Field()
     # Address (URL) of the company logo image.
     logo_image = scrapy.Field()
     # Business field of the company.
     field = scrapy.Field()
     # Development stage of the company.
     stage = scrapy.Field()
     # Company size.
     company_size = scrapy.Field()
     # Company home page.
     home = scrapy.Field()
     # Person who published the posting.
     job_publisher = scrapy.Field()
     # Investing institution(s).
     financeOrg = scrapy.Field()

     # --- bookkeeping -------------------------------------------------------
     # Time at which the page was crawled.
     crawl_time = scrapy.Field()

lagou.py

# -*- coding: utf-8 -*-

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

from LaGou.items import LagouItem

from LaGou.utils.MD5 import get_md5

from datetime import datetime

class LagouSpider(CrawlSpider):
    """Crawl lagou.com job listings and emit one ``LagouItem`` per job page.

    Links matched by ``page_links`` (listing pages) are followed to discover
    more links; links matched by ``content_links`` (job detail pages) are
    passed to :meth:`parse_item` and not followed further.
    """

    name = 'lagou'

    allowed_domains = ['lagou.com']

    start_urls = ['https://www.lagou.com/zhaopin/']

    # Dots are escaped so that "." only matches a literal dot in the URL.
    content_links = LinkExtractor(allow=(r"https://www\.lagou\.com/jobs/\d+\.html"))

    page_links = LinkExtractor(allow=(r"https://www\.lagou\.com/zhaopin/\d+"))

    rules = (
        Rule(content_links, callback="parse_item", follow=False),
        Rule(page_links, follow=True),
    )

    @staticmethod
    def _first_match(response, query, default=""):
        """Return the first string selected by XPath ``query``, or ``default``."""
        matches = response.xpath(query).extract()
        return matches[0] if matches else default

    @staticmethod
    def _join_matches(response, query, sep=""):
        """Return all strings selected by XPath ``query`` joined with ``sep``."""
        return sep.join(response.xpath(query).extract())

    def parse_item(self, response):
        """Build a ``LagouItem`` from a job detail page.

        Args:
            response: the downloaded job detail page.

        Yields:
            One populated ``LagouItem``. Nothing is yielded (and a warning is
            logged) when the page has neither a company name nor a position
            name, i.e. it does not look like a job detail page. Any other
            missing element is stored as an empty string instead of aborting
            the whole item with an ``IndexError``.
        """
        first = self._first_match
        joined = self._join_matches

        # These two identify the posting; without them the row is meaningless.
        company_name = first(response, '//dl[@class="job_company"]//a/img/@alt', None)
        positon_name = first(response, '//div[@class="job-name"]//span[@class="name"]/text()', None)
        if company_name is None or positon_name is None:
            self.logger.warning(
                "Skipping %s: company name or position name not found", response.url
            )
            return

        item = LagouItem()
        # item["obj_id"] (md5 of the page URL) is currently disabled.
        item["company_name"] = company_name
        item["positon_name"] = positon_name

        # The job_request block lists salary / place / experience / education /
        # full-time as consecutive <span> elements; some carry a "/" separator.
        item["salary"] = first(response, '//dd[@class="job_request"]//span[1]/text()')
        item["work_place"] = first(
            response, '//dd[@class="job_request"]//span[2]/text()').replace("/", "")
        item["work_experience"] = first(
            response, '//dd[@class="job_request"]//span[3]/text()').replace("/", "")
        item["education"] = first(
            response, '//dd[@class="job_request"]//span[4]/text()').replace("/", "")
        item['full_time'] = first(response, '//dd[@class="job_request"]//span[5]/text()')

        item["tags"] = joined(
            response, '//dd[@class="job_request"]//li[@class="labels"]/text()', ",")
        item["publish_time"] = first(
            response, '//dd[@class="job_request"]//p[@class="publish_time"]/text()')

        # Job perks.
        item["job_temptation"] = joined(
            response, '//dd[@class="job-advantage"]/p/text()', ",")

        # Job description: paragraphs joined with commas, non-breaking spaces removed.
        item["job_desc"] = joined(
            response, '//dd[@class="job_bt"]/div//p/text()', ",").replace("\xa0", "").strip()

        item["job_publisher"] = first(
            response, '//div[@class="publisher_name"]//span[@class="name"]/text()')

        # Company logo address: drop only a leading protocol-relative "//" so
        # that an absolute "https://..." URL is not mangled.
        logo_image = first(response, '//dl[@class="job_company"]//a/img/@src')
        item["logo_image"] = logo_image[2:] if logo_image.startswith("//") else logo_image

        # Company feature list: field, stage, [investing institution], size.
        item["field"] = joined(response, '//ul[@class="c_feature"]//li[1]/text()').strip()
        item["stage"] = joined(response, '//ul[@class="c_feature"]//li[2]/text()').strip()

        finance_org = response.xpath('//ul[@class="c_feature"]//li[3]/p/text()').extract()
        item["financeOrg"] = "".join(finance_org)

        # When the investing-institution entry is present it occupies li[3]
        # and pushes the company size down to li[4].
        size_position = 4 if finance_org else 3
        item["company_size"] = joined(
            response, '//ul[@class="c_feature"]//li[%d]/text()' % size_position).strip()

        # Company home page.
        item["home"] = first(response, '//ul[@class="c_feature"]//li/a/@href')

        # Crawl time (local, naive datetime).
        item["crawl_time"] = datetime.now()

        yield item

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql

class LagouPipeline(object):
    """Item pipeline that inserts each scraped item into the ``lagouwang`` MySQL table.

    A single connection to the local ``lagou`` database is opened lazily and
    reused for every item, instead of reconnecting per item. It is closed in
    :meth:`close_spider`.
    """

    # Column order of the INSERT; each name is also the item key it is read from.
    _COLUMNS = (
        "company_name", "positon_name", "salary", "work_place",
        "work_experience", "education", "full_time", "tags", "publish_time",
        "job_temptation", "job_desc", "job_publisher", "logo_image", "field",
        "stage", "financeOrg", "company_size", "home", "crawl_time",
    )

    # Parameterized statement: one %s placeholder per column.
    _INSERT_SQL = "insert into lagouwang(%s) VALUES (%s)" % (
        ",".join(_COLUMNS),
        ",".join(["%s"] * len(_COLUMNS)),
    )

    # Shared connection; None until first use and after close_spider().
    _con = None

    def _connection(self):
        """Return a live connection, opening or re-opening it when needed."""
        if self._con is None:
            self._con = pymysql.connect(
                host="127.0.0.1", user="root", passwd="", db="lagou", charset="utf8")
        else:
            # Long crawls can outlive the server's idle timeout.
            self._con.ping(reconnect=True)
        return self._con

    def open_spider(self, spider):
        """Open the database connection when the spider starts."""
        self._connection()

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        if self._con is not None:
            self._con.close()
            self._con = None

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it unchanged.

        Args:
            item: mapping that provides every key listed in ``_COLUMNS``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            KeyError: if ``item`` lacks one of the expected keys.
            Exception: any database error is re-raised after a rollback.
        """
        row = tuple(item[column] for column in self._COLUMNS)
        con = self._connection()
        cur = con.cursor()
        try:
            cur.execute(self._INSERT_SQL, row)
            con.commit()
        except Exception:
            # Do not leave a half-finished transaction on the shared connection.
            con.rollback()
            raise
        finally:
            cur.close()
        return item

middlewares.py（主要是 User-Agent 的随机切换，没有加 IP 代理）

import random

from LaGou.settings import USER_AGENTS

class RandomUserAgent(object):
    """Downloader middleware that gives each request a randomly chosen User-Agent."""

    def process_request(self, request, spider):
        """Pick one entry of ``USER_AGENTS`` and use it as the request's User-Agent.

        ``setdefault`` is used, so a User-Agent header that is already present
        on the request is left untouched. Returns ``None`` implicitly.
        """
        request.headers.setdefault("User-Agent", random.choice(USER_AGENTS))

settings.py

# ---------------------------------------------------------------------------
# Project identity and spider discovery
# ---------------------------------------------------------------------------
BOT_NAME = 'LaGou'
SPIDER_MODULES = ['LaGou.spiders']
NEWSPIDER_MODULE = 'LaGou.spiders'

# ---------------------------------------------------------------------------
# Crawl behaviour
# ---------------------------------------------------------------------------
# robots.txt is not consulted.
ROBOTSTXT_OBEY = False
# Delay between requests (Scrapy interprets this value in seconds).
DOWNLOAD_DELAY = 5
# Cookies are switched off.
COOKIES_ENABLED = False

# ---------------------------------------------------------------------------
# Pool of User-Agent strings; LaGou.middlewares.RandomUserAgent picks one
# at random for each request.
# ---------------------------------------------------------------------------
USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
]

# ---------------------------------------------------------------------------
# Component registration
# ---------------------------------------------------------------------------
DOWNLOADER_MIDDLEWARES = {
    'LaGou.middlewares.RandomUserAgent': 1,
    # 'LaGou.middlewares.MyCustomDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    # 'scrapy_redis.pipelines.RedisPipeline': 300,
    'LaGou.pipelines.LagouPipeline': 300,
}

main.py(用于启动调试)

 #coding=utf-8

 from scrapy.cmdline import execute

 # Run the "lagou" spider through Scrapy's command-line entry point, which
 # makes it possible to start (and debug) the crawl from this script.
 launch_args = ["scrapy", "crawl", "lagou"]
 execute(launch_args)

在 settings.py 中加入如下配置，即可借助 scrapy_redis 实现分布式爬取，并把抓取到的数据保存到 Redis 中（注意：这里的 ITEM_PIPELINES 会覆盖上文的同名配置，数据将不再写入 MySQL）；如何从 Redis 中取出数据，请参考前几章。

# --- scrapy_redis: request de-duplication and scheduling --------------------
# Duplicate filter and scheduler classes are both taken from scrapy_redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Scheduler persistence flag is switched on.
SCHEDULER_PERSIST = True

# --- item storage -------------------------------------------------------------
# Items go to the scrapy_redis pipeline; the MySQL pipeline is disabled here.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    # 'LaGou.pipelines.LagouPipeline': 300,
}

主要用到的知识点：CrawlSpider（LinkExtractor、Rule）、内容的提取（xpath、extract）、字符串的处理（join、replace、strip、split）、User-Agent 随机切换等。

scrapy基础知识之 CrawlSpiders爬取lagou招聘保存在mysql（分布式）：的更多相关文章

scrapy基础知识之 CrawlSpiders(爬取腾讯校内招聘):
import scrapyfrom scrapy.spider import CrawlSpider,Rulefrom scrapy.linkextractors import LinkExtract ...
scrapy基础知识之 CrawlSpiders：
通过下面的命令可以快速创建 CrawlSpider模板的代码: scrapy genspider -t crawl spidername xx.com LinkExtractors class sc ...
python之scrapy爬取jingdong招聘信息到mysql数据库
1.创建工程 scrapy startproject jd 2.创建项目 scrapy genspider jingdong 3.安装pymysql pip install pymysql 4.set ...
将爬取的数据保存到mysql中
为了把数据保存到mysql费了很多周折,早上再来折腾,终于折腾好了安装数据库 1.pip install pymysql(根据版本来装) 2.创建数据打开终端键入mysql -u root -p ...
scrapy实战2分布式爬取lagou招聘（加入了免费的User-Agent随机动态获取库 fake-useragent 使用方法查看：https://github.com/hellysmile/fake-useragent）
items.py # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentati ...
scrapy基础知识之将item 通过pipeline保存数据到mysql mongoDB：
pipelines.py class xxPipeline(object): def process_item(self, item, spider): con=pymysql.connect(hos ...
0.Python 爬虫之Scrapy入门实践指南（Scrapy基础知识）
目录 0.0.Scrapy基础 0.1.Scrapy 框架图 0.2.Scrapy主要包括了以下组件: 0.3.Scrapy简单示例如下: 0.4.Scrapy运行流程如下: 0.5.还有什么? 0. ...
pymysql 使用twisted异步插入数据库：基于crawlspider爬取内容保存到本地mysql数据库
本文的前提是实现了整站内容的抓取,然后把抓取的内容保存到数据库. 可以参考另一篇已经实现整站抓取的文章:Scrapy 使用CrawlSpider整站抓取文章内容实现本文也是基于这篇文章代码基础上实现 ...
【图文详解】scrapy爬虫与动态页面——爬取拉勾网职位信息（2）
上次挖了一个坑,今天终于填上了,还记得之前我们做的拉勾爬虫吗?那时我们实现了一页的爬取,今天让我们再接再厉,实现多页爬取,顺便实现职位和公司的关键词搜索功能. 之前的内容就不再介绍了,不熟悉的请一定要 ...

随机推荐

2017 JavaScript 开发者的学习图谱
码云项目推荐前端框架类 1.项目名称: 基于 Vue.js 的 UI 组件库 iView 项目简介:iView 是一套基于 Vue.js 的 UI 组件库,主要服务于 PC 界面的中后台产品. 特性 ...
最简单的IdentityServer实现——Client
客户端控制台演示请求访问令牌,然后使用此令牌访问API 1.新建项目并添加引用新建一个.net core的控制台程序IdentityServer.EasyDemo.Client 1 引用Iden ...
【Python】Camera拍照休眠唤醒测试
#!/usr/bin/python # -*- coding: UTF-8 -*- import os import sys import time rebootCount = int(input(& ...
使用tratto进行CISCO网络设备的管理
测试环境: CSR1000V CentOS7.4 X64 Step 1:在CentOS7上安装python 3.0环境 [root@docker ~]# python3 -VPython 3.7.0[ ...
SQL Server 2008收缩日志文件--dbcc shrinkfile参数说明
原文:SQL Server 2008收缩日志文件--dbcc shrinkfile参数说明 DBCC SHRINKFILE 收缩相关数据库的指定数据文件或日志文件大小. 语法 DBCC SHRINKF ...
ManualResetEvent 让你的代码等你几分钟
using System;using System.Collections.Generic;using System.Linq;using System.Threading; namespace Co ...
UWP中弹出框屏幕适配问题
上次在UWP中的消息提示框(二)中谈到弹出框在虚拟导航栏的手机上被遮挡问题,今天接就着扯. 上次给用户控件指定的宽高都是Window.Current.Bounds的宽高,而这个获取到的是包含应用程序窗 ...
WIN10以后如果Manifest中不写支持WIN10的话，获取版本号的API获取的是6
if TOSVersion.Major = 10 then // 高版本的Delphi(比如Berlin)可以这样写 ShowMessage('Windows 10'); 或者: if Win32M ...
Qt：正确判断文件、文件夹是否存在的方法
一直对Qt的isFile.isDir.exists这几个方法感到混乱,不知道到底用哪个,网上搜了下资料,也是用这几个方法但是都没有对其深究,经过测试发现会存在问题,先看看下面的测试代码 { QFile ...

scrapy基础知识之 CrawlSpiders爬取lagou招聘保存在mysql（分布式）：

scrapy基础知识之 CrawlSpiders爬取lagou招聘保存在mysql（分布式）：的更多相关文章

随机推荐

热门专题