爬取招聘网的招聘信息:

"""Lagou (mobile site) job-listing scraper.

Fetches search-result pages as JSON and job-detail pages as HTML,
then stores records in MongoDB and a CSV file.
"""

import json
import random
import time
import pymongo
import re
import pandas as pd
import requests
from lxml import etree
import datetime

# Fresh timestamp values substituted into the Cookie header on each run
# (the Hm_lpvt / LGRID cookie fields are time-based).
now = datetime.datetime.now()
# NOTE(review): the original expression was truncated ("int(now.timestamp()*)");
# a millisecond epoch is assumed here — confirm against the cookie format.
timeStamp = int(now.timestamp() * 1000)
geshi = "%Y%m%d%H%M%S"
time1 = datetime.datetime.strftime(now, geshi)

# MongoDB connection and collection names.
client = pymongo.MongoClient('localhost')
db = client['lagou']        # database name
data_name = 'lagouData'     # collection for index-page records
detail = 'detailData'       # collection for detail-page records

# Search constants.
CITY = '广州'                # city to search in
POSITON_NAME = '数据挖掘'     # job keyword to query
# NOTE(review): both numeric values were lost in the source; 10 pages and the
# API's default page size of 15 are assumed — confirm before relying on them.
PAGE_SUN = 10               # total number of index pages to crawl
PAGE_SIZE = 15              # positions returned per page

# XPath tip kept from the original notes:
# //dd[@class='job_request']/p/span[position()>3] matches spans from the 4th onward.

# Search-results (index) page URL template: city, positionName, pageNo, pageSize.
index_url = 'https://m.lagou.com/search.json?city={}&positionName={}&pageNo={}&pageSize={}'
# Job-detail page URL template: positionId.
detail_url = 'https://m.lagou.com/jobs/{}.html'

# Pool of mobile User-Agent strings; one is picked at random per request.
user_agents = [
    "Mozilla/5.0 (iPhone 84; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.8.0 Mobile/14G60 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
    "Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 MicroMessenger/6.5.18 NetType/WIFI Language/en",
    "Mozilla/5.0 (Linux; Android 5.1.1; vivo Xplay5A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 T7/9.3 baiduboxapp/9.3.0.10 (Baidu; P1 5.1.1)",
    "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 6.0; LEX626 Build/HEXCNFN5902606111S) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/7.4 baiduboxapp/8.3.1 (Baidu; P1 6.0)",
    "Mozilla/5.0 (iPhone 92; CPU iPhone OS 10_3_2 like Mac OS X) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.7.2 Mobile/14F89 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
    "Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; ZUK Z2121 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.8.952 Mobile Safari/537.36",
]
def index_fn():
    """Crawl the search-result (index) pages and yield one dict per job posting.

    Iterates page numbers 0..PAGE_SUN-1 against the JSON search endpoint,
    extracts the fields this project stores, and yields each record.
    Errors on a page are printed and that page is skipped.
    """
    # Randomize the User-Agent per crawl (anti-anti-crawler measure).
    # Bug fix: the original picked a random UA but then hard-coded a desktop
    # Chrome UA in the headers, leaving `user_agent` unused.
    user_agent = random.choice(user_agents)
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "m.lagou.com",
        # Cookie carries the time-varying Hm_lpvt / LGRID fields.
        "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
            timeStamp=timeStamp, time=time1),
        "Referer": "https://m.lagou.com/search.html",
        "User-Agent": user_agent,
        "X-Requested-With": "XMLHttpRequest",
    }
    # Bug fix: the original iterated range(PAGE_SIZE); PAGE_SUN is the declared
    # "total pages to crawl" constant and is what the page-number loop needs.
    for page_no in range(PAGE_SUN):
        proxies = {'HTTP': '171.37.164.78:8123'}
        response = requests.get(
            index_url.format(CITY, POSITON_NAME, page_no, PAGE_SIZE),
            headers=headers, proxies=proxies).content.decode()
        content = json.loads(response)
        if content:
            try:
                result = content['content']['data']['page']['result']
                for item in result:
                    data = {
                        'positionId': item['positionId'],
                        'positionName': item['positionName'],
                        'city': item['city'],
                        'createTime': item['createTime'],
                        'companyId': item['companyId'],
                        'companyLogo': item['companyLogo'],
                        'companyName': item['companyName'],
                        'companyFullName': item['companyFullName'],
                    }
                    time.sleep(0.5)  # throttle between records to look less bot-like
                    yield data
            except Exception as e:
                print('爬取index页出错', e)
        else:
            # Empty response — back off and move on.
            # NOTE(review): the original sleep duration was lost; 3s assumed.
            time.sleep(3)
            print('重新加载')
def detail_d(positionId):
    """Fetch the detail page for one job and extract its fields via XPath.

    :param positionId: Lagou position id used to build the detail-page URL.
    :return: dict with title, job_detail, work_detial, company_img, company_infor.
    """
    # Randomize the User-Agent (bug fix: the original selected one but then
    # sent a hard-coded desktop UA instead).
    user_agent = random.choice(user_agents)
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "m.lagou.com",
        # Cookie carries the time-varying Hm_lpvt / LGRID fields.
        "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
            timeStamp=timeStamp, time=time1),
        "Referer": "https://m.lagou.com/search.html",
        "User-Agent": user_agent,
        "X-Requested-With": "XMLHttpRequest",
    }
    response = requests.get(detail_url.format(positionId), headers=headers).content.decode()
    xml = etree.HTML(response)
    title = xml.xpath('''//div[@class='postitle']/h2/text()''')
    job_details = xml.xpath('''//div[@class='detail']/div[1]//span/span/text()''')
    # Crude cleanup: stringify the list, then strip literal "\n" and spaces.
    job_detail = str(job_details).replace(r'\n', '').replace(' ', '')
    work_detial = xml.xpath('''//div[@class='content']//p/text()''')
    # NOTE(review): this xpath is identical to work_detial's even though the
    # field is named company_img — likely a copy-paste slip; it should probably
    # target an <img> src. Kept as-is to preserve behavior; confirm intent.
    company_img = xml.xpath('''//div[@class='content']//p/text()''')
    company_infors = xml.xpath(
        '''//div[@class='company activeable']/div/div/h2/text()|//div[@class='dleft']/p/text()''')
    company_infor = str(company_infors).strip().replace(r'\n', '').replace(' ', '')
    detail_datas = {
        'title': title,
        'job_detail': job_detail,
        'work_detial': work_detial,
        'company_img': company_img,
        'company_infor': company_infor,
    }
    return detail_datas
def save_to_mongodb(data, detail_datas, positionId):
    """Insert one index-page record and its detail-page record into MongoDB.

    :param data: dict yielded by index_fn() for one posting.
    :param detail_datas: dict returned by detail_d() for the same posting.
    :param positionId: kept for interface compatibility — a commented-out
        earlier version used it as the upsert filter.
    """
    # Bug fix: the index-record insert had been fused into a comment line,
    # so only the detail record was ever stored.
    db[data_name].insert(data)
    # Bug fix: use the declared `detail` collection constant ('detailData');
    # the original wrote to a misspelled 'detailDta' collection.
    db[detail].insert(detail_datas)
    print('成功存入mongodb')
def save_to_csv():
    """Collect all records from index_fn() and write them to ./static/lagou.csv."""
    item_list = []
    for item in index_fn():
        item_list.append(item)
        print('', item)
    # item_list is a list of dicts such as
    # {'positionId': ..., 'positionName': '数据挖掘工程师', 'city': '广州', ...}.
    # 'salary' is listed in the columns but index_fn() does not extract it,
    # so that column comes out empty (NaN).
    datas = pd.DataFrame(item_list, columns=[
        "positionId", "positionName", "city", "createTime", "salary",
        "companyId", "companyLogo", "companyName", "companyFullName"])
    datas.to_csv('./static/lagou.csv')
    print('保存为csv文件成功')


def run():
    """Entry point: dump the index data to CSV, then store each posting
    plus its detail page in MongoDB."""
    data = index_fn()
    save_to_csv()
    for item in data:
        print('data', item)
        positionId = item['positionId']
        print(positionId)
        # Fetch the corresponding detail page.
        detail_datas = detail_d(positionId)
        # Bug fix: pass the single record `item`, not the generator `data`,
        # to the saver — inserting a generator would fail.
        save_to_mongodb(item, detail_datas, positionId)


if __name__ == '__main__':
    run()

预防反爬虫措施:

1.用户代理变换设置

2.不同ip代理的设置

3.设置用户cookie变化的信息

列举一下获取网络免费ip代理,并验证其是否可用的代码:

import requests
import re
import telnetlib
from lxml import etree
import time
def get_ip():
    """Scrape free proxies from xicidaili.com and yield the usable ones.

    Each candidate is verified by opening a Telnet connection to ip:port;
    proxies that accept the connection are yielded as
    {type: "ip:port"} dicts (type is 'HTTP' or 'HTTPS' from the site).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    }
    url = "http://www.xicidaili.com/nn/"
    res = requests.get(url, headers=headers)
    content = res.content.decode()
    xml = etree.HTML(content)
    # XPath layout of the proxy table:
    #   td[2] -> ip, td[3] -> port, td[6] -> type (HTTP/HTTPS)
    ip_list = xml.xpath("//tr[@class='odd']//td[2]//text()")
    port_list = xml.xpath("//tr[@class='odd']//td[3]//text()")
    type_list = xml.xpath("//tr[@class='odd']//td[6]//text()")
    if ip_list:  # the original compared len(ip_list) against a lost literal; non-empty is the intent
        # `proxy_type` instead of the original `type`, which shadowed the builtin.
        for ip, port, proxy_type in zip(ip_list, port_list, type_list):
            proxies = {
                proxy_type: "{}:{}".format(ip, port)
            }
            try:
                # NOTE(review): the original timeout literal was lost; 3s assumed.
                telnetlib.Telnet(ip, port=port, timeout=3)
            except Exception:
                print("不能使用该{}".format(proxies))
            else:
                print('可以使用该{}'.format(proxies))
                yield proxies
                # Removed a stray `get_ip()` call here: calling a generator
                # function without iterating it is a no-op.
    else:
        # Page returned no rows — back off and retry.
        # NOTE(review): sleep duration lost in source; 3s assumed. The original
        # bare `get_ip()` retry call was also a discarded generator; `yield from`
        # actually re-runs the scrape.
        time.sleep(3)
        yield from get_ip()

利用xpath爬取招聘网的招聘信息的更多相关文章

  1. python爬取当当网的书籍信息并保存到csv文件

    python爬取当当网的书籍信息并保存到csv文件 依赖的库: requests #用来获取页面内容 BeautifulSoup #opython3不能安装BeautifulSoup,但可以安装Bea ...

  2. 利用scrapy爬取腾讯的招聘信息

    利用scrapy框架抓取腾讯的招聘信息,爬取地址为:https://hr.tencent.com/position.php 抓取字段包括:招聘岗位,人数,工作地点,发布时间,及具体的工作要求和工作任务 ...

  3. 如何利用Xpath抓取京东网商品信息

    前几天小编分别利用Python正则表达式和BeautifulSoup爬取了京东网商品信息,今天小编利用Xpath来为大家演示一下如何实现京东商品信息的精准匹配~~ HTML文件其实就是由一组尖括号构成的 ...

  4. 利用python爬取贝壳网租房信息

    最近准备换房子,在网站上寻找各种房源信息,看得眼花缭乱,于是想着能否将基本信息汇总起来便于查找,便用python将基本信息爬下来放到excel,这样一来就容易搜索了. 1. 利用lxml中的xpath ...

  5. 利用jsoup爬取百度网盘资源分享连接(多线程)

    突然有一天就想说能不能用某种方法把百度网盘上分享的资源连接抓取下来,于是就动手了.知乎上有人说过最好的方法就是http://pan.baidu.com/wap抓取,一看果然链接后面的uk值是一串数字, ...

  6. 利用selenium爬取京东商品信息存放到mongodb

    利用selenium爬取京东商城的商品信息思路: 1.首先进入京东的搜索页面,分析搜索页面信息可以得到路由结构 2.根据页面信息可以看到京东在搜索页面使用了懒加载,所以为了解决这个问题,使用递归.等待 ...

  7. python 爬虫之爬取大街网(思路)

    由于需要,本人需要对大街网招聘信息进行分析,故写了个爬虫进行爬取.这里我将记录一下,本人爬取大街网的思路. 附:爬取得数据仅供自己分析所用,并未用作其它用途. 附:本篇适合有一定 爬虫基础 crawl ...

  8. 利用Selenium爬取淘宝商品信息

    一.  Selenium和PhantomJS介绍 Selenium是一个用于Web应用程序测试的工具,Selenium直接运行在浏览器中,就像真正的用户在操作一样.由于这个性质,Selenium也是一 ...

  9. Python爬虫项目--爬取自如网房源信息

    本次爬取自如网房源信息所用到的知识点: 1. requests get请求 2. lxml解析html 3. Xpath 4. MongoDB存储 正文 1.分析目标站点 1. url: http:/ ...

随机推荐

  1. 在 vue.js 中动态绑定 v-model

    在最近的项目中(基于vue),有一个需求就是通过 v-for 动态生成 input.在正常情况下,页面中的input数量是固定的,而且每个input绑定的v-model也是固定的,我们可以在 data ...

  2. Gym 101911E "Painting the Fence"(线段树区间更新+双端队列)

    传送门 题意: 庭院中有 n 个围栏,每个围栏上都被涂上了不同的颜色(数字表示): 有 m 条指令,每条指令给出一个整数 x ,你要做的就是将区间[ x第一次出现的位置 , x最后出现的位置 ]中的围 ...

  3. 计算机基础:计算机网络-socket编程

    来源:mooc大学华南理工大学计算机网络课程 chapter6 代码:https://github.com/NeilKeats/SocketDemo/commit/5f3a795250a9533910 ...

  4. Gym102082 G-What Goes Up Must Come Down(树状数组)

    Several cards with numbers printed on them are lined up on the table. We’d like to change their orde ...

  5. POJ 3687 Labeling Balls (top 排序)

    Labeling Balls Time Limit: 1000MS   Memory Limit: 65536K Total Submissions: 15792   Accepted: 4630 D ...

  6. goto语句

    让程序直接跳到自定义标签位置 public static void Main(string[] args) { ; goto myLabel;AppDomainInitializer//直接跳到标签m ...

  7. RabbitMQ 消费者的消息确认机制

    消息确认的几点说明: 1. Basic.Ack 发回给 RabbitMQ 以告知,可以将相应 message 从 RabbitMQ 的消息缓存中移除.2. Basic.Ack 未被 consumer ...

  8. (叉乘求面积) nyoj1011-So Easy[II]

    1011-So Easy[II] 内存限制:64MB 时间限制:1000ms 特判: No通过数:2 提交数:4 难度:2 题目描述: 这是一道基础的计算几何问题(其实这不提示大家也都看的出).问题描 ...

  9. PHP操作cookie

    1.当只有一个参数的时候,默认是删除,响应报文里面设置了一个过去的时间 setcookie('key2'); 2.当有两个参数的时候,是设置cookie setcookie('key','value1 ...

  10. 表连接join on

    表A记录如下:  aID aNum  1 a20050111  2 a20050112  3 a20050113  4 a20050114  5 a20050115  表B记录如下:  bID bNa ...