# -*- coding: utf-8 -*-

import scrapy

from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengwuyouItem

class QianchengSpiderSpider(scrapy.Spider):
    """Crawl 51job search results for a set of data/AI job keywords.

    The crawl has two stages:

    1. ``parse`` walks each search-result page, schedules a request for
       every job-detail link it finds, and follows the "next page" link.
    2. ``detail_parse`` extracts the job and company fields from one
       detail page into a ``JobscrawlerQianchengwuyouItem``.
    """

    name = 'qiancheng_spider'

    # allowed_domains = ['www.qq.com']

    # One search-result entry URL per keyword. Each entry MUST end with a
    # comma: adjacent string literals are concatenated by Python, which
    # would silently merge all of them into a single invalid URL.
    # NOTE(review): the first keyword is percent-encoded once while the
    # others are encoded twice (%25 == "%") — confirm the site accepts both.
    start_urls = [
        # Keyword: 数据分析师 (data analyst)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: 数据挖掘 (data mining)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: 算法 (algorithm)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25AE%2597%25E6%25B3%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: 机器学习 (machine learning)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: 深度学习 (deep learning)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%25B7%25B1%25E5%25BA%25A6%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Keyword: 人工智能 (artificial intelligence)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        """Yield detail-page requests for one result page, then paginate.

        Args:
            response: the Scrapy response for a search-result page.

        Yields:
            ``scrapy.Request`` objects: one per job-detail link (handled by
            ``detail_parse``) and, when present, one for the next result
            page (handled by ``parse`` again).
        """
        # Every result row is a <div class="el">, but so are other rows
        # that carry no job link; those are filtered out below.
        rows = response.xpath("//div[@class='el']")
        self.logger.debug("Found %d candidate rows on %s", len(rows), response.url)

        for row in rows:
            # Real job rows contain <p class="t1 "> (note the trailing
            # space in the class attribute) wrapping the detail-page link.
            detail_url = row.xpath("./p[@class='t1 ']//a/@href").extract_first()
            if not detail_url:
                # No job link in this row, so there is nothing to request.
                continue
            yield scrapy.Request(response.urljoin(detail_url),
                                 callback=self.detail_parse)

        # Follow the "next page" link until the last page, which has none.
        next_page_url = response.xpath("//a[@id='rtNext']/@href").extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url),
                                 callback=self.parse)

    @staticmethod
    def _first_text(response, xpath):
        """Return the first XPath match stripped, or "" when nothing matches."""
        value = response.xpath(xpath).extract_first()
        return value.strip() if value else ""

    @staticmethod
    def _joined_text(response, xpath):
        """Return all XPath matches concatenated, with outer whitespace stripped."""
        return "".join(response.xpath(xpath).extract()).strip()

    def detail_parse(self, response):
        """Extract job and company fields from one job-detail page.

        Fields whose nodes are missing are set to an empty string instead
        of raising, so one odd page does not abort the item.

        Args:
            response: the Scrapy response for a job-detail page.

        Yields:
            A populated ``JobscrawlerQianchengwuyouItem``.
        """
        item = JobscrawlerQianchengwuyouItem()

        # Job title.
        item["job_name"] = self._first_text(response, "//div[@class='cn']/h1/text()")

        # Job description: all text nodes under the description container.
        item["job_info"] = self._joined_text(
            response, "//div[@class='bmsg job_msg inbox']//text()")

        # Salary.
        # NOTE(review): this targets <div class="sp4"> while the welfare
        # tags below are <span class="sp4"> — confirm against the live page.
        item["job_salary"] = self._joined_text(response, '//div[@class="sp4"]/text()')

        # Welfare tags, joined into one comma-separated string.
        item["job_welfare"] = ",".join(
            response.xpath("//span[@class='sp4']/text()").extract())

        # The summary line is split into several text nodes; index 1 is
        # used as the experience requirement and index 2 as the education
        # requirement. Guard the indices because some pages have fewer.
        summary_parts = response.xpath('//p[@class="msg ltype"]/text()').extract()
        item["job_exp_require"] = (
            summary_parts[1].strip() if len(summary_parts) > 1 else "")
        item["job_edu_require"] = (
            summary_parts[2].strip() if len(summary_parts) > 2 else "")

        # Company name.
        item["company_name"] = self._first_text(
            response, '//div[@class="com_msg"]//p/text()')

        # Company industry / nature / headcount: the text lives on the
        # parent of an icon <span>, so locate the span and step up with "..".
        item["company_industry"] = self._joined_text(
            response, '//span[@class="i_trade"]/../text()')
        item["company_nature"] = self._joined_text(
            response, '//span[@class="i_flag"]/../text()')
        item["company_people"] = self._joined_text(
            response, '//span[@class="i_people"]/../text()')

        # Company address: not extracted yet.
        item["company_location"] = ""

        # Company overview.
        item["company_overview"] = self._joined_text(
            response, '//div[@class="tmsg inbox"]//text()')

        # Company financing stage: not extracted yet.
        item["company_financing_stage"] = ""

        yield item

from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengwuyouItem的更多相关文章

# -*- coding: utf-8 -*-
-- coding: utf-8 -- import scrapy from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengw ...
Scrapy框架的学习(6.item介绍以及items的使用（提前定义好字段名）)转载https://blog.csdn.net/wei18791957243/article/details/86259688
在Scrapy框架中的items.py的作用 1.可以预先定义好要爬取的字段 items.py import scrapy class TencentItem(scrapy.I ...
python from import 自定义模块
from douban250.items import Douban250Item python import 自定义模块 (1)主程序与模块程序在同一目录下: 如下面程序结构: `-- src ...
Scrapy进阶知识点总结（三）——Items与Item Loaders
一.Items 抓取的主要目标是从非结构化源(通常是网页)中提取结构化数据.Scrapy蜘蛛可以像Python一样返回提取的数据.虽然方便和熟悉,但Python缺乏结构:很容易在字段名称中输入拼写错误 ...
scrapy框架之items项目
Items 主要目标是从非结构化来源(通常是网页)提取结构化数据.Scrapy爬虫可以将提取的数据作为Python语句返回.虽然方便和熟悉,Python dicts缺乏结构:很容易在字段名称中输入错误 ...
Scrapy持久化(items+pipelines)
一.items保存爬取的文件 items.py import scrapy class QuoteItem(scrapy.Item): # define the fields for your ite ...
Scrapy框架爬虫初探——中关村在线手机参数数据爬取
关于Scrapy如何安装部署的文章已经相当多了,但是网上实战的例子还不是很多,近来正好在学习该爬虫框架,就简单写了个Spider Demo来实践.作为硬件数码控,我选择了经常光顾的中关村在线的手机页面 ...
利用scrapy和MongoDB来开发一个爬虫
今天我们利用scrapy框架来抓取Stack Overflow里面最新的问题(),并且将这些问题保存到MongoDb当中,直接提供给客户进行查询. 安装在进行今天的任务之前我们需要安装二个框架,分别 ...
Python之路【第二十三篇】爬虫
difference between urllib and urllib2 自己翻译的装逼必备 What is the difference between urllib and urllib2 mo ...

随机推荐

android开发_view和view属性
一.view视图的宽度和高度属性,属性值:固定和浮动两种状态 1属性为固定值 <View android:layout_width="30dp" android:layout ...
利用策略模式实现了同一接口的多个Servicel实现类，如何同时注入Controller
解决方法如上图,通过给实现类起别名,并在controller中,通过@Qualifier注解获取不同的实现类,之前没有这样写,会出现这样的情况: 通过@autowired注解注入dao层时为空,会报空 ...
cookie应用——UI中查询条件的保存
var cookieOperate = { cookieNames: { companyCNName:"_companyCNName", companyENName:"_ ...
Unity外包团队：关于手机unity游戏开发的技术选型
技术选型 Unity引擎内置了多人联机的解决方案,涵盖了从最底层的网络数据传输,到不同玩家之间的消息发送,再到游戏大厅这样的高级功能.考虑到Unity官方提供的云服务(Internet Service ...
跨域获取后台日期-ASP
最近所有的计划都被打乱,生活节奏也有些控制不住,所以在自己还算清醒的时候,把之前一个小功能写下来,对其它人也有些帮助. 需求前景:需要用AJAX跨域获取后台服务器日期. 1.分析需求: 在这个需求中, ...
sqlite3如何判断一个表是否已经存在于数据库中 C++
SELECT count(*) AS cnt FROM sqlite_master WHERE type='table' AND name='table_name';cnt will return 0 ...
合并K个排序链表(java实现)
题目: 合并 k 个排序链表,返回合并后的排序链表.请分析和描述算法的复杂度. 示例: 输入: [ 1->4->5, 1->3->4, 2->6 ] 输出: ...
hdoj3138
题意:略各点向原信念连INF+1的边,不同信念连INF的边,这样割原信念花费大一点.然后好友连1的边.最小割的结果-n*INF就是答案,因为割到哪边最少都要INF. #include <ios ...
支付-stripe
国外三大支付paypal,braintree,stripe,有兴趣可以去了解一下他们的区别. 支付宝和paypal基本只需要发送charge信息请求交给后端做就ok了,那么stripe前端也只需要收集 ...
Educational Codeforces Round 41 (Rated for Div. 2)F. k-substrings
题意比较麻烦略题解:枚举前缀的中点,二分最远能扩展的地方,lcp来check,然后线段树维护每个点最远被覆盖的地方,然后查询线段树即可 //#pragma GCC optimize(2) //#pr ...

from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengwuyouItem

-- coding: utf-8 --

from jobscrawler_qianchengwuyou.items import JobscrawlerQianchengwuyouItem的更多相关文章

随机推荐

热门专题