Setting up a process pool to crawl Lagou:

# coding=utf-8
import json
import pymongo
import pandas as pd
import requests
from lxml import etree
import time
from multiprocessing import Pool

# Set up MongoDB
client = pymongo.MongoClient('localhost')
db = client['lagou']
# Job title to search for
POSITION_NAME = '数据挖掘'
# Total number of pages to crawl; Lagou displays at most 334 pages
PAGE_SUM = 334  # value assumed: the original left this blank
# Number of positions returned per page
PAGE_SIZE = 15  # value assumed: the mobile API typically returns 15 results per page
# Name of the MongoDB collection to write to
DATA_NAME = "DataMiningPosition"

base_url = 'https://m.lagou.com/search.json?city=%E5%85%A8%E5%9B%BD&positionName={positionName}' \
           '&pageNo={pageNo}&pageSize={pageSize}'

# Fetch one page of search results as JSON
def page_index(pageno):
headers = {
"Accept": "application/json",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
        # Skip cookies whenever you can; this endpoint happens to return data without one
# "Cookie": "user_trace_token=20181119151914-03711263-38a2-4d81-bd81-5f480d930039; _ga=GA1.2.605262108.1542611954; _gid=GA1.2.249787972.1542611954; LGSID=20181119151916-6c3da9fa-ebcb-11e8-8958-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DOnHWjpEfiW4_pVm7hX8NYOFm0iJ7bz1ZJJlaKPPnmMzLE-6ypKNo0f19ABO5bjW4%26wd%3D%26eqid%3D8f61629100016e18000000065bf263e7; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fgongsi%2F147.html; LGUID=20181119151916-6c3dabf3-ebcb-11e8-8958-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAAGCABCC2D851CA25D1CFCD2B28DCDD6E00A2C7E; _ga=GA1.3.605262108.1542611954; X_HTTP_TOKEN=a0cc1a4beb8a41f57f144bc0bfd77bd7; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672adb3834203-08b3706084b44a-3961430f-1327104-1672adb3835428%22%2C%22%24device_id%22%3A%221672adb3834203-08b3706084b44a-3961430f-1327104-1672adb3835428%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542611954,1542612053,1542612277,1542612493; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542613115; LGRID=20181119153837-20bafb1a-ebce-11e8-8958-5254005c3644",
"Host": "m.lagou.com",
"Proxy-Connection": "keep-alive",
"Referer": "http://m.lagou.com/search.html",
"X-Requested-With": "XMLHttpRequest",
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
}
url = base_url.format(positionName=POSITION_NAME, pageNo=pageno, pageSize=PAGE_SIZE)
response = requests.get(url, headers=headers)
html = response.text
content = json.loads(html)
print(content)
if content.get("content"):
return content
else:
        time.sleep(5)  # back off before retrying; the interval is arbitrary
        return page_index(pageno)

# Walk one page of results and yield the fields we care about
def parse_page_index(content):
    for i in range(PAGE_SIZE):  # bound assumed: the original left range() empty
try:
item = content['content']['data']['page']['result'][i]
#print(item)
yield {
'positionId': item.get('positionId'),
'positionName': item.get('positionName'),
'city': item.get('city'),
'createTime': item.get('createTime'),
'salary': item.get('salary'),
'companyId': item.get('companyId'),
'companyFullName': item.get('companyFullName')
}
        except IndexError as e:
            print('The page may not contain that many results', e)

# Upsert one record into MongoDB, keyed on positionId
def save_to_mongo(data):
    # update(filter, {'$set': ...}, True) is the legacy upsert API;
    # on pymongo 4+ use update_one(filter, {'$set': data}, upsert=True)
    if db[DATA_NAME].update({'positionId': data['positionId']}, {'$set': data}, True):
        print('Saved to Mongo', data['positionId'])
    else:
        print('Saved to Mongo Failed', data['positionId'])

# Fetch a job-detail page and extract the required experience and job description
def parse_detail(url):
# url = "http://m.lagou.com/jobs/4593934.html"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
"Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
"Accept - Encoding": "gzip, deflate",
"Accept - Language": "zh - CN, zh;q = 0.9",
"Cache - Control": "max - age = 0",
"Connection": "eep - alive",
# "Cookie": "_ga=GA1.2.474762156.1528795210; _gid=GA1.2.574638607.1528795210; user_trace_token=20180612172010-cdf76dc1-6e21-11e8-9af0-525400f775ce; LGUID=20180612172010-cdf772c0-6e21-11e8-9af0-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1528795210,1528795215,1528795223; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=f3ed266ddeee802fb7d402e4f6d4f4a3; JSESSIONID=ABAAABAAAFDABFG9F9C52FA9D8CAE24F139A0131C45E918; _ga=GA1.3.474762156.1528795210; _gat=1; LGSID=20180612184248-597a7795-6e2d-11e8-9479-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=http%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=http%3A%2F%2Fm.lagou.com%2Fjobs%2F4079910.html; LGRID=20180612184505-ab051d02-6e2d-11e8-9479-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1528800306" }
try:
response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print("Request succeeded")
text = response.content.decode()
# print(text)
html = etree.HTML(text)
workyear = html.xpath('//span[@class="item workyear"]/span/text()')
if workyear:
            workyear = workyear[0]
else:
            time.sleep(5)  # back off before retrying; the interval is arbitrary
            return parse_detail(url)  # propagate the retried result
positiondesc = html.xpath('//div[@class="positiondesc"]//p/text()')
#print(workyear, positiondesc)
return workyear, positiondesc
except Exception as e:
        print(e)

# Crawl page by page and save the results to MongoDB (single process)
def to_mongo(page_sum):
    # Lagou displays at most 334 pages
for page in range(page_sum):
html = page_index(page)
items = parse_page_index(html)
# print(items)
for item in items:
print(item)
            save_to_mongo(item)

# Pool worker: crawl a single page and save its records to MongoDB
def to_mongo_pool(page):
    # Lagou displays at most 334 pages
content = page_index(page)
items = parse_page_index(content)
# print(items)
for item in items:
print(item)
        save_to_mongo(item)

# Parse the crawled records so the data can be converted to a DataFrame
def parse_items(page_sum):
for page in range(page_sum):
html = page_index(page)
items = parse_page_index(html)
for item in items:
positionId = item["positionId"]
detail_url = "http://m.lagou.com/jobs/{}.html".format(positionId)
workyear, positiondesc = parse_detail(detail_url)
            print(positionId, positiondesc)
yield [
item["positionId"],
item["positionName"],
item["city"],
item["createTime"],
item["salary"],
item["companyId"],
item["companyFullName"],
workyear,
positiondesc
            ]

# Save the crawled data as CSV
def to_csv(page_sum):
item_lists = []
# print(parse_items())
for item in parse_items(page_sum):
item_lists.append(item)
#print(item_lists)
data = pd.DataFrame(item_lists,
columns=["positionId", "positionName", "city", "createTime", "salary", "companyId",
"companyFullName", "workyear", "positiondesc"])
data.to_csv("python_positon.csv") if __name__ == '__main__': #to_csv
#to_mongo()
# 建议保存到mongodb数据库中 start_time = time.time()
pool = Pool() # pool()参数:进程个数:默认的是电脑cpu的核的个数,如果要指定进程个数,这个进程个数要小于等于cpu的核数
# 第一个参数是一个函数体,不需要加括号,也不需指定参数。。
# 第二个参数是一个列表,列表中的每个参数都会传给那个函数体
pool.map(to_mongo_pool,[i for i in range(PAGE_SUM)])
# close它只是把进程池关闭
pool.close()
# join起到一个阻塞的作用,主进程要等待子进程运行完,才能接着往下运行
pool.join()
end_time = time.time()
print("总耗费时间%.2f秒" % (end_time - start_time))
