scrapy 抓取拉勾网数据

其实很简单，却因为一些小问题，折腾不少时间，简要记录一下，以备后需。

>> scrapy startproject lagou

>> cd lagou

>> scrapy genspider lagou_jd www.lagou.com

定义item

在items.py中继续完善定义：

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

#

# See documentation in:

# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class LagouItem(scrapy.Item):
    """One job posting scraped from a lagou.com job page."""

    # Job title, taken by the spider from the page's <title> text.

    job_title = scrapy.Field()

    # Requirement lines extracted from the job description, joined with '\n'.

    job_description = scrapy.Field()

    # URL of the job page the item was scraped from.

    job_url = scrapy.Field()

完善爬虫

# -*- coding: utf-8 -*-

from scrapy.selector import Selector

from scrapy.contrib.spiders import CrawlSpider,Rule

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from lagou.items import LagouItem

import codecs,re

from bs4 import BeautifulSoup

import sys

# Python 2-only idiom: `sys.setdefaultencoding` is removed from `sys` at
# interpreter startup, so the module is reloaded to get it back, and the
# process-wide implicit str<->unicode codec is then switched to UTF-8.
# Neither the `reload` builtin nor `setdefaultencoding` exists on Python 3.
reload(sys)

sys.setdefaultencoding('utf-8')

class LagoujdSpider(CrawlSpider):
    """Crawl lagou.com job pages and extract each posting's requirement lines.

    Starting from a single job page, the rule follows every link matching
    ``jobs/<digits>.html`` and passes each response to :meth:`parse_lagou`.
    """

    name = "lagoujd"
    allowed_domains = ["lagou.com"]
    start_urls = (
        'http://www.lagou.com/jobs/787409.html',
    )
    rules = [
        Rule(SgmlLinkExtractor(allow=r'jobs/\d+\.html'),
             callback='parse_lagou', follow=True),
    ]

    # Patterns are compiled once at class level so that get_demand() works
    # without parse_lagou() having run first, and so they are not recompiled
    # for every response.
    #
    # Heading that introduces the requirements section
    # ("要求" / "资格" / "条件", optionally followed by a separator).
    SPLIT_DEMAND = re.compile(u'(要求|资格|条件)[:：;\r\n]?')
    # Sentence / line separators used to cut the section into candidate lines.
    SPLIT_LINE = re.compile(u'[;；。\r\n]')
    # Phrases that mark a line as a requirement.
    DEMAND = re.compile(
        u'具备|熟悉|具有|熟练|掌握|良好的|能够|丰富的|以上学历|优秀的|有深入研究|有很强的|工作经历'
        u'|工作经验|善于|懂得|优先|不少于|不超过|喜欢|较强的.{2,8}能力|相关专业|相关学历'
        u'|开发经验|实习经验|\\d年以上')
    # A line that starts with a digit is treated as a numbered list entry.
    NUMBERED_LINE = re.compile(u'\\d')

    def parse_lagou(self, response):
        """Build a ``LagouItem`` from one job page.

        The callback must not be named ``parse``: ``CrawlSpider`` uses
        ``parse`` itself to apply its rules.

        Returns the item; if extraction fails part-way, the error is printed
        and the item is returned with whatever fields were set before the
        failure (best-effort, the crawl continues).
        """
        item = LagouItem()
        sel = Selector(response)
        try:
            page_title = sel.xpath("//title/text()").extract()[0]
            # Keep the chunk before the first '-' and drop its last two
            # characters (presumably a fixed suffix in the title - TODO confirm).
            item["job_title"] = page_title.split('-')[0][:-2].strip()
            # Second <dd> of the first <dl> under #container/div[1]/div[1].
            job_des = sel.xpath('//*[@id="container"]/div[1]/div[1]/dl[1]/dd[2]').extract()[0]
            # Strip the HTML markup, keeping only the text.
            job_des = BeautifulSoup(job_des).get_text()
            item["job_description"] = self.get_demand(job_des)
            item["job_url"] = response.url
            print(item['job_title'])
        except Exception as e:
            # Best-effort: a page with an unexpected layout must not abort the crawl.
            print(e)
        return item

    def get_demand(self, jdstr):
        """Return the requirement lines of *jdstr* joined with ``'\\n'``.

        Everything after the first requirements heading is split into lines.
        Lines shorter than 5 characters are skipped; a line is kept when it
        starts with a digit or contains a requirement phrase; the first line
        that does neither ends the section.  Returns ``''`` when no heading
        is found.
        """
        res = []
        heading = self.SPLIT_DEMAND.search(jdstr)
        if heading:
            for line in self.SPLIT_LINE.split(jdstr[heading.end():]):
                if len(line) < 5:
                    continue
                if self.NUMBERED_LINE.match(line.strip()) or self.DEMAND.search(line):
                    res.append(line)
                else:
                    break
        return '\n'.join(res)

存储抓取的数据为json格式

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

import codecs

class LagouPipeline(object):
    """Write every scraped item to ``lagou_jd.json``, one JSON object per line."""

    def __init__(self):
        # Opened in 'w' mode: each run starts with an empty output file.
        self.file = codecs.open('lagou_jd.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and return it unchanged.

        ``ensure_ascii=False`` keeps Chinese text readable in the output
        instead of ``\\uXXXX`` escapes.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook Scrapy calls on item pipelines when the
        spider finishes; a method named ``spider_closed`` is only invoked if
        it is explicitly connected to the signal of that name.
        """
        self.file.close()

    # Kept so existing code that calls the old method name keeps working.
    spider_closed = close_spider

在settings.py 中注册pipeline

# -*- coding: utf-8 -*-

# Scrapy settings for lagou project

#

# For simplicity, this file contains only the most important settings by

# default. All the other settings are documented here:

#

#     http://doc.scrapy.org/en/latest/topics/settings.html

#

# Name of this Scrapy project/bot.
BOT_NAME = 'lagou'

# Modules in which Scrapy looks for spider classes.
SPIDER_MODULES = ['lagou.spiders']

# Module in which `scrapy genspider` creates new spiders.
NEWSPIDER_MODULE = 'lagou.spiders'

# Enabled item pipelines, keyed by import path. The integer is the pipeline's
# order value: with several pipelines, lower values run first.
ITEM_PIPELINES = {

    'lagou.pipelines.LagouPipeline':300,

}

# Crawl responsibly by identifying yourself (and your website) on the user-agent

#USER_AGENT = 'lagou (+http://www.yourdomain.com)'

运行，各种抓！！！

>> scrapy crawl lagoujd

或者

>> scrapy crawl lagoujd -o item.json -t json

demo:

jkmiao@jkmiao-ipin:~/workplace/spiders/lagou$ more lagou_jd.json

{"job_url": "http://www.lagou.com/jobs/1102051.html", "job_description": "1、具有2年以上互联网产品经验，优秀的交互设计能力，对产品设计有极高的要求，追求极致的用户体验 2、善于观察与学习，具有宽广的视野、对于整体产品规划有自己的见解和理念 3、有优秀缜密的逻辑与思维能力，良好的协调能力、分析、计划及项目管理能力，具备良好的团队精神，沟通能力强 4、熟练使用 Axure 、 visio 、 office 等软件\n 5、有成熟的O2O平台类产品设计经验者优先", "job_title": "产品经理"}

{"job_url": "http://www.lagou.com/jobs/917776.html", "job_description": "1、有2年以上互联网产品规划和体验设计相关经验,熟悉互联网或软件产品整体实现过程，包括从需求分析到产品发布\n2、有完整策划至少2个以上成功、目前尚在运营中的互联网产品设计案例\n3、能通过数据分析等系统性方法深刻理解用户需求并予以满足\n4、执行力强，善于组织协调并推动项目进展\n5、对工作充满热情，富有创新精神，能承受较大的工作压力\n6、有良好的学习能力、良好的沟通能力和团队合作精神，出色的组织能力", "job_title": "产品经理"}

新建脚本文件preprocess.py，进一步预处理

#!/usr/bin/env python

# coding=utf-8

import simplejson as json

import re

import sys,codecs

from collections import defaultdict

# Python 2-only idiom: `sys.setdefaultencoding` is removed from `sys` at
# interpreter startup, so the module is reloaded to get it back, and the
# process-wide implicit str<->unicode codec is then switched to UTF-8.
# Neither the `reload` builtin nor `setdefaultencoding` exists on Python 3.
reload(sys)

sys.setdefaultencoding('utf-8')

from simhash import Simhash

def get_top_jobname(jobname,namelist):
    """Return ``(distance, name)`` for the entry of *namelist* closest to *jobname*.

    Closeness is the Simhash distance between the two names.  Candidates are
    examined in sorted order, so among equally distant names the one that
    sorts first is returned.

    Raises:
        IndexError: if *namelist* is empty.
    """
    candidates = sorted(namelist)
    if not candidates:
        raise IndexError('namelist is empty')
    # Hash the query once instead of once per candidate.
    target = Simhash(jobname)
    scored = ((target.distance(Simhash(other)), other) for other in candidates)
    # min() returns the first minimal pair, matching a stable sort's first element.
    return min(scored, key=lambda pair: pair[0])

def _to_text(value):
    """Return *value* as text: UTF-8 byte strings are decoded, text passes through."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def clean_text(fname='./lagou_jd.json'):
    """Filter the scraped JSON-lines file and merge requirements per job title.

    Each line of *fname* is one JSON object.  The requirement text is read
    from ``job_demand`` or, when that key is absent, from ``job_description``
    (the key written by the pipeline in this project); ``sum_request`` is
    optional.  A record is skipped when its requirement text does not start
    with a digit, is shorter than 8 characters, matches the benefit/duty
    filter, yields fewer than 3 whitespace-separated chunks longer than
    5 characters, or when its title is shorter than 2 characters.

    Titles whose Simhash distance to an already collected title is below 7
    are merged under that title.  The result, a mapping of title to merged
    text, is written to ``./lagou_jd_clean.json``.
    """
    SPLIT_LINE = re.compile(u'[;；。\r\n]')
    FILTER_DEMAND = re.compile(u'薪酬|待遇|待遇|福利|加入我们|职责|你|成为')
    LEADING_DIGIT = re.compile(u'\\d')
    res = defaultdict(str)
    kept = 0
    # Read bytes and decode explicitly so the result does not depend on the
    # platform or interpreter default encoding.
    with open(fname, 'rb') as source:
        for raw_line in source:
            line = raw_line.decode('utf-8').strip()
            if not line:
                continue
            jd = json.loads(line)
            # Items may be incomplete (the spider emits partial items when
            # extraction fails), so every field is looked up defensively.
            title = _to_text(jd.get('job_title') or u'')
            demand_text = _to_text(
                jd.get('job_demand') or jd.get('job_description') or u'')
            summary_text = _to_text(jd.get('sum_request') or u'')

            if (not LEADING_DIGIT.match(demand_text.strip())
                    or len(demand_text) < 8 or len(title) < 2):
                continue
            if FILTER_DEMAND.search(demand_text):
                continue

            if res:
                top_job = get_top_jobname(title, res.keys())
            else:
                top_job = (0, title)
            if top_job[0] < 7:  # close enough: treat as the same job title
                if top_job[0] > 4:
                    # Borderline merges are printed so they can be reviewed.
                    print(u'%s %s %s' % (top_job[0], top_job[1], title))
                title = top_job[1]

            # Drop non-breaking spaces / collapse whitespace before splitting.
            demand_text = demand_text.replace(u'\xa0', u'')
            summary_text = re.sub(u'\xa0|\\s+', u' ', summary_text)

            demand = [x.strip() for x in demand_text.split() if len(x) > 5]
            if len(demand) < 3:
                continue
            sum_request = [x.strip() for x in summary_text.split() if len(x) > 3]
            demand_text = u'\n'.join(demand)

            if title not in res:
                res[title] = u' '.join(sum_request) + u'\n' + demand_text.strip()
            else:
                res[title] += u'\n' + u'\n'.join(SPLIT_LINE.split(demand_text))

            kept += 1
            if kept % 100 == 0:
                print(kept)

    print('%d done' % kept)
    print(len(res))
    with codecs.open('./lagou_jd_clean.json', 'w', 'utf-8') as target:
        json.dump(res, target)

def get_jds(fname='./lagou_jd_clean.json'):
    """Print the short entries of the cleaned title -> text mapping.

    An entry is printed when its text has fewer than 16 whitespace-separated
    tokens.  Output stops after 20 entries have been printed.
    """
    # Read and close the file explicitly instead of leaking the handle.
    with open(fname, 'rb') as source:
        res = json.loads(source.read().decode('utf-8'))
    shown = 0
    for title, text in res.items():
        if len(text.split()) >= 16:
            continue
        shown += 1
        print(u'%s %s' % (shown, title))
        print(text)
        print("\n============\n")
        if shown >= 20:
            break

# Script entry point: run the cleaning step on its default input file.
if __name__ == "__main__":

    clean_text()

scrapy 抓取拉勾网数据的更多相关文章

scrapy抓取拉勾网职位信息（一）——scrapy初识及lagou爬虫项目建立
本次以scrapy抓取拉勾网职位信息作为scrapy学习的一个实战演练 python版本:3.7.1 框架:scrapy(pip直接安装可能会报错,如果是vc++环境不满足,建议直接安装一个visua ...
pythonのscrapy抓取网站数据
(1)安装Scrapy环境步骤请参考:https://blog.csdn.net/c406495762/article/details/60156205 需要注意的是,安装的时候需要根据自己的pyt ...
scrapy抓取拉勾网职位信息（三）——爬虫rules内容编写
在上篇中,分析了拉勾网需要跟进的页面url,本篇开始进行代码编写. 在编写代码前,需要对scrapy的数据流走向有一个大致的认识,如果不是很清楚的话建议先看下:scrapy数据流本篇目标:让拉勾网爬 ...
scrapy抓取拉勾网职位信息（二）——拉勾网页面分析
网站结构分析: 四个大标签:首页.公司.校园.言职我们最终是要得到详情页的信息,但是从首页的很多链接都能进入到一个详情页,我们需要对这些标签一个个分析,分析出哪些链接我们需要跟进. 首先是四个大标签 ...
scrapy抓取拉勾网职位信息（七）——实现分布式
上篇我们实现了数据的存储,包括把数据存储到MongoDB,Mysql以及本地文件,本篇说下分布式. 我们目前实现的是一个单机爬虫,也就是只在一个机器上运行,想象一下,如果同时有多台机器同时运行这个爬虫 ...
scrapy抓取拉勾网职位信息（四）——对字段进行提取
上一篇中已经分析了详情页的url规则,并且对items.py文件进行了编写,定义了我们需要提取的字段,本篇将具体的items字段提取出来这里主要是涉及到选择器的一些用法,如果不是很熟,可以参考:sc ...
scrapy抓取拉勾网职位信息（七）——数据存储（MongoDB，Mysql，本地CSV）
上一篇完成了随机UA和随机代理的设置,让爬虫能更稳定的运行,本篇将爬取好的数据进行存储,包括本地文件,关系型数据库(以Mysql为例),非关系型数据库(以MongoDB为例). 实际上我们在编写爬虫r ...
scrapy抓取拉勾网职位信息（六）——反爬应对（随机UA，随机代理）
上篇已经对数据进行了清洗,本篇对反爬虫做一些应对措施,主要包括随机UserAgent.随机代理. 一.随机UA 分析:构建随机UA可以采用以下两种方法我们可以选择很多UserAgent,形成一个列表 ...
scrapy抓取拉勾网职位信息（五）——代码优化
上一篇我们已经让代码跑起来,各个字段也能在控制台输出,但是以item类字典的形式写的代码过于冗长,且有些字段出现的结果不统一,比如发布日期. 而且后续要把数据存到数据库,目前的字段基本都是string ...

随机推荐

MyEclipse WebSphere开发教程：安装和更新WebSphere 6.1, JAX-WS, EJB 3.0（三）
MyEclipse超值折扣限量 100 套! 立即开抢>> [MyEclipse最新版下载] MyEclipse支持Java EE技术(如JAX-WS和EJB 3.0),它们以功能包的形 ...
获取bean的两种方式
BeanFactory方式: 1: public void testFactory(){ ResourcePatternResolver rpt=new PathMatchingResourcePat ...
web测试小结
今年5月份开始接触web测试,经过大半年的测试及学习,简单总结下测试过程: 1.需求理解 2.测试策略.方案.用例编写及评审 3.测试环境搭建 4.测试执行 5.bug提单.问题跟踪 6.回归测试 ...
强化学习平台 openAI 的 gym 安装（Ubuntu环境下如何安装Python的gym模块）
openAI 公司给出了一个集成较多环境的强化学习平台 gym , 本篇博客主要是讲它怎么安装. openAI公司的主页: https://www.openai.com/systems/ 从主页上我 ...
linux shell终端打开方式
前言 Linux操作系统没有Window操作系统界面友好,使用者需要使用命令与系统进行交互,交互媒介为shell终端. 有三种方式可以打开终端: 方法一: 打开新的窗口并打开shell终端,快捷键:c ...
IplImage 与mat之间的转换及释放内存
IplImage *x = cvLoadImage(savefilename, ); Mat img(x,true); cvReleaseImage(&x); 完
使用Inno Setup Compiler制作安装软件包
前言项目开发完成之后,需要程序打包发行,本文使用Inno Setup工具制作安装软件包. 系统环境系统:win7_x64 工具:Inno Setup Complier 实现步骤 1.下载安装Inn ...
jquery中not的用法[.not(selector)]
描述: 从匹配的元素集合中移除指定的元素. 如果提供的jQuery对象代表了一组DOM元素,.not()方法构建一个新的匹配元素的jQuery对象,用于存放筛选后的元素.所提供的选择器是对每个元素进行 ...
leetcode:Reverse Integer【Python版】
1.在进入while之前,保证x是非负的: 2.符号还是专门用flag保存 =================== 3.另一思路:将integer转换成string,然后首位swap,直至中间: cl ...
使用过的bug跟踪系统
MantisBT jira

scrapy 抓取拉勾网数据

scrapy 抓取拉勾网数据的更多相关文章

随机推荐

热门专题