python爬取大众点评

拖了好久的代码

1.首先进入页面确定自己要抓取的数据（我们要抓取的是左侧分类栏-----包括美食、火锅）
先爬取第一级分类（美食、婚纱摄影、电影），之后根据第一级链接爬取第二层（火锅）。要注意第二级的pid是第一级的classid，这样才能区分出第二级分类哪些是属于第一级的。
2.上一步我们分别把链接存入了Redis，把名称存入了Mongodb；这一步我们要从Redis中取出第二级分类的链接。因为我们要获取的是店铺信息，所以只取第二级链接就够了。我们首先解析第一页的内容，然后找到它的“下一页”链接（沿着“下一页”链接一直翻下去，就能获取整个分类的店铺）。
3.获取店铺信息（根据取得的店铺链接获取对应的店铺信息）
因为我们发现http协议头和cookie我们几乎每次都用到了，所以我们将它们封装成了一个方法，以便用的时候调用。
至此，大众点评就算结束了，只不过还没取评论信息，会慢慢上。

1.py

# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree

from pymongo import MongoClient

# MongoDB: database 'dianping' on the local server.
client = MongoClient(host='localhost', port=27017)
db = client['dianping']
collection = db['classification']  # category collection

import redis

# Redis: database 0 on the local server.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Counter incremented by secClassFind() on every call.
ii = 0

def secClassFind(selector,classid):
    """Store the second-level categories of one first-level item and queue their URLs.

    Every ``<a>`` under ``div.sec-items`` is inserted into the module-level
    MongoDB ``collection`` with ``pid`` set to ``classid``. Its link is then
    pushed onto the Redis list ``classurl`` as ``"<id>,<url>,<ii>,<title>"``.

    Args:
        selector: lxml tree to run the XPath query against.
        classid: ``_id`` of the parent (first-level) category document.
    """
    global ii
    ii += 1  # incremented once per call, i.e. once per first-level category
    for secItem in selector.xpath('//div[@class="sec-items"]/a'):
        url = secItem.get('href')
        title = secItem.text
        # Keep the parent id in `classid` untouched. The original rebound it to
        # the freshly inserted child id, so every sibling after the first was
        # stored with the previous sibling as its parent.
        # insert_one() replaces Collection.insert(), which PyMongo 4 removed.
        secid = collection.insert_one({'classname':title,'pid':classid}).inserted_id
        r.lpush('classurl','%s,%s,%i,%s'%(secid,url,ii,title))

def findRootNode(url):
    """Fetch ``url`` and store every first-level category found on the page.

    Each ``<li class="first-item">`` is scanned for ``a.index-title`` texts.
    Every title is inserted into the module-level MongoDB ``collection`` with
    ``pid`` set to ``None``, then ``secClassFind`` is called with the new id to
    record the second-level categories of the same ``<li>``.

    Args:
        url: address of the page to download; the body is decoded as UTF-8.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req_timeout = 5
    req = Request(url=url,headers=headers)
    # The context manager closes the HTTP response (and its socket) even when
    # read() or decode() raises; the original never closed it.
    with urlopen(req,None,req_timeout) as f:
        s = f.read().decode("utf-8")

    # BeautifulSoup locates the first-level <li> items.
    soup = BeautifulSoup(s,'html.parser')
    for link in soup.find_all(name='li',class_="first-item"):
        # Re-parse the single <li> with lxml so XPath can be used on it.
        selector = etree.HTML(str(link))
        for title in selector.xpath('//a[@class="index-title"]/text()'):
            print(title)
            # insert_one() replaces Collection.insert(), which PyMongo 4 removed.
            classid = collection.insert_one({'classname':title,'pid':None}).inserted_id
            # Record the second-level categories under this first-level id.
            secClassFind(selector,classid)
            print('-------------')

        print('----------------------------------------------')

# Script entry point: start the category crawl from the Dianping home page.
findRootNode('http://www.dianping.com/')

2.py

# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from slaver3_list import getCurPageList
from bs4 import BeautifulSoup
from lxml import etree

from pymongo import MongoClient
client = MongoClient('localhost',27017)
db=client.dianping

import redis
r = redis.Redis(host='127.0.0.1',port=6379,db=0)

# Crawl plan:
# 1. Take a link from the Redis list 'classurl'.
# 2. Fetch the list page behind that link.
# 3. Extract the shop links from that page.
# 4. Find the next-page link.
# 5. Crawl the next page and keep extracting links (repeat 2-5)
#    until there is no next page.

# 'shopflag' selects which of the two shop collections this run fills.
# A missing key is treated as 0 so a first run does not fail on int(None).
shopflag = int(r.get('shopflag') or 0)

# Start from an empty target collection.
# delete_many() replaces Collection.remove(), which PyMongo 4 removed.
collection = db.shops0 if shopflag==0 else db.shops1
collection.delete_many({})

# NOTE(review): both r.set(...) calls were truncated to "r.set(')" in this copy
# of the script (a syntax error). Flipping 'shopflag' between 0 and 1 is an
# assumption based on the shops0/shops1 pair - confirm against the original.
r.set('shopflag', 1 if shopflag==0 else 0)

r.delete('shopurl')

for item in r.lrange('classurl',0,-1):
    # Redis returns bytes; only the first two comma-separated fields are used
    # (category id and category URL).
    arr = item.decode().split(',')
    getCurPageList(arr[0],arr[1],shopflag)
    # Kept from the original: only the first queued category is crawled.
    # NOTE(review): looks like a debugging limiter - confirm before removing.
    break

3.py（即 2.py 中以 slaver3_list 模块名导入的文件）

# -*- coding: utf-8 -*-
import re
#from urllib.request import urlopen
#from urllib.request import Request
from common import httpSpider
from bs4 import BeautifulSoup
from lxml import etree
from bson.objectid import ObjectId
from slaver4_shopinfo import getShopInfo

from pymongo import MongoClient

# MongoDB: database 'dianping' on the local server.
client = MongoClient(host='localhost', port=27017)
db = client['dianping']
# Target shop collection; getCurPageList() binds it before anything is inserted.
collection = None

import redis

# Redis: database 0 on the local server.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Counter incremented by getCurPageList() on every call.
ii = 0

# Stored fields: id, shop name, category id
def insertShop(classid,shopList):
    """Insert every shop link of one list page, then fetch each shop's detail page.

    Each document uses the shop URL as ``_id`` and is written to the
    module-level ``collection`` (bound by ``getCurPageList``).

    Args:
        classid: category id in a form accepted by ``ObjectId()``.
        shopList: iterable of lxml ``<a>`` elements; ``href`` and ``title``
            are read from each one.
    """
    for div in shopList:
        url = div.get("href")
        if not url:
            # Without a link there is nothing to key the document on and no
            # detail page to fetch; the original would have inserted _id=None.
            continue
        # insert_one() replaces Collection.insert(), which PyMongo 4 removed.
        shopid = collection.insert_one(
            {'_id':url,'shopname':div.get('title'),'classid':ObjectId(classid)}
        ).inserted_id
        getShopInfo(shopid,url)

def getCurPageList(classid,url,shopflag):
    """Download one category list page and hand its shop links to ``insertShop``.

    Only the given page is processed; following a "next page" link is not
    done here.

    Args:
        classid: category id passed through to ``insertShop``.
        url: address of the list page to fetch.
        shopflag: ``0`` selects the ``shops0`` collection, anything else ``shops1``.
    """
    global ii, collection
    ii += 1

    page = etree.HTML(httpSpider(url))

    # Bind the module-level collection that insertShop() writes to.
    collection = db.shops0 if shopflag==0 else db.shops1

    # Shop title anchors: <div class="tit"><a title=...>.
    insertShop(classid, page.xpath('//div[@class="tit"]/a[@title]'))
    print('----------%i---------------'%(ii))

4.py（即 3.py 中以 slaver4_shopinfo 模块名导入的文件）

# -*- coding: utf-8 -*-
import re
#from urllib.request import urlopen
#from urllib.request import Request
from common import httpSpider
from bs4 import BeautifulSoup
from lxml import etree
from bson.objectid import ObjectId

from pymongo import MongoClient

# MongoDB: database 'dianping' on the local server.
client = MongoClient(host='localhost', port=27017)
db = client['dianping']
collection = db['shops']  # shop collection

import redis

# Redis: database 0 on the local server.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def getShopInfo(shopid,shopurl):
    """Fetch a shop page and print the text of each brief-info item.

    Args:
        shopid: id of the shop document; accepted but not used here.
        shopurl: address of the shop page to download.
    """
    tree = etree.HTML(httpSpider(shopurl))
    for span in tree.xpath('//div[@class="brief-info"]//span[@class="item"]'):
        print(span.text)

common.py

# -*- coding: utf-8 -*-
import urllib.request
from urllib.request import urlopen
from urllib.request import Request
import http.cookiejar

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Default request headers; httpSpider() passes them to makeMyOpener().
head = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}

def makeMyOpener(head):
    """Return a urllib opener with a fresh cookie jar and ``head`` as its default headers.

    Args:
        head: mapping of header name to header value.
    """
    jar = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(jar)
    opener = urllib.request.build_opener(cookie_handler)
    # addheaders expects a list of (name, value) pairs.
    opener.addheaders = [(name, value) for name, value in head.items()]
    return opener
def httpSpider(url):
    """Download ``url`` with the module-level ``head`` headers and return the page text.

    A new opener (with its own cookie jar) is built for every call. The
    request times out after 5 seconds and the body is decoded with the
    default codec of ``bytes.decode()`` (UTF-8).

    Args:
        url: address to fetch.

    Returns:
        The response body as ``str``.
    """
    oper = makeMyOpener(head)
    req_timeout = 5
    # The context manager closes the response (and its socket) even when
    # read() raises; the original never closed it.
    with oper.open(url, timeout = req_timeout) as uop:
        data = uop.read()
    return data.decode()

def dynamicSpider(url, executable_path='D:/phantoms/phantomjs-2.1.1-windows/bin/phantomjs.exe'):
    """Load ``url`` in a headless PhantomJS browser and return the rendered page source.

    Args:
        url: address to open.
        executable_path: location of the PhantomJS binary; defaults to the
            path that was previously hard-coded.

    Returns:
        ``driver.page_source`` after the page has been loaded.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4',
        'Connection': 'keep-alive'
    }
    # copy() keeps the library's shared PHANTOMJS capability dict unmodified.
    cap = DesiredCapabilities.PHANTOMJS.copy()
    for key, value in headers.items():
        cap['phantomjs.page.customHeaders.{}'.format(key)] = value
    cap["phantomjs.page.settings.loadImages"] = False
    driver = webdriver.PhantomJS(desired_capabilities=cap,executable_path=executable_path)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser process down, even when get() raises.
        driver.quit()

python爬取大众点评的更多相关文章

python爬取大众点评并写入mongodb数据库和redis数据库
抓取大众点评首页左侧信息,如图: 我们要实现把中文名字都存到mongodb,而每个链接存入redis数据库. 因为将数据存到mongodb时每一个信息都会有一个对应的id,那样就方便我们存入redis ...
Python 爬取大众点评 50 页数据，最好吃的成都火锅竟是它！
前言文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: 胡萝卜酱 PS:如有需要Python学习资料的小伙伴可以加点击下方链 ...
用Python爬取大众点评数据，推荐火锅店里最受欢迎的食品
前言文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者:有趣的Python PS:如有需要Python学习资料的小伙伴可以加点 ...
python爬虫实战---爬取大众点评评论
python爬虫实战—爬取大众点评评论(加密字体) 1.首先打开一个店铺找到评论很多人学习python,不知道从何学起.很多人学习python,掌握了基本语法过后,不知道在哪里寻找案例上手.很多已经 ...
python爬虫爬取大众点评并导入redis
直接上代码,导入redis的中文编码没有解决,日后解决了会第一时间上代码!新手上路,多多包涵! # -*- coding: utf-8 -*- import re import requests fr ...
Python爬虫丨大众点评数据爬虫教程（1）
大众点评数据获取 --- 基础版本大众点评是一款非常受普罗大众喜爱的一个第三方的美食相关的点评网站. 因此,该网站的数据也就非常有价值.优惠,评价数量,好评度等数据也就非常受数据公司的欢迎. 今天就 ...
Python 爬取所有51VOA网站的Learn a words文本及mp3音频
Python 爬取所有51VOA网站的Learn a words文本及mp3音频 #!/usr/bin/env python # -*- coding: utf-8 -*- #Python 爬取所有5 ...
python爬取网站数据
开学前接了一个任务,内容是从网上爬取特定属性的数据.正好之前学了python,练练手. 编码问题因为涉及到中文,所以必然地涉及到了编码的问题,这一次借这个机会算是彻底搞清楚了. 问题要从文字的编码讲 ...
python爬取某个网页的图片-如百度贴吧
python爬取某个网页的图片-如百度贴吧作者:vpoet mail:vpoet_sir@163.com 注:随意copy,不用告诉我 #coding:utf-8 import urllib imp ...

随机推荐

【转】three.js详解之入门篇
原文链接:https://www.cnblogs.com/shawn-xie/archive/2012/08/16/2642553.html 开场白 webGL可以让我们在canvas上实现3D效 ...
Carbondata源码系列（二）文件格式详解
在上一章当中,写了文件的生成过程.这一章主要讲解文件格式(V3版本)的具体细节. 1.字典文件格式详解字典文件的作用是在存储的时候将字符串等类型转换为int类型,好处主要有两点: 1.减少存储占用空 ...
最全最详细：ubuntu16.04下内核编译以及设备驱动程序的编写（针对新手而写）
写在前面:本博客为本人原创,转载请注明出处!同时,本博客严禁任何下载站随意抓取!!! 本博客唯一合法URL: 总体考虑要去写设备驱动程序,说白了就三大步骤:下载内核源码构建内核源码树(也就是下载你的 ...
最近整理AI相关感想
前言目前笔者致力于在AI 开发研究,四大平台里,百度AI 提供的开发者资料是最全,开发的友好度也是最高的,很多都已经集成在SDK中,支持许多语言体系. 其实作为公司层面的考虑,针对技术的研究出 ...
网页设计——6.html其他标签
今天学习html的其他标签: 一.列表 1.无序列表ul 基本结构: <ul type="属性值"> <li>列表内容</li> </u ...
python面向对象其他相关-异常处理-反射
1.isinstance(obj, cls) 检查是否obj是否是类 cls 的对象 2.issubclass(sub, super) 检查sub类是否是 super 类的派生类 n1 = 10 ...
Linux 进程间通信(包含一个经典的生产者消费者实例代码）
前言:编写多进程程序时,有时不可避免的需要在多个进程之间传递数据,我们知道,进程的用户的地址空间是独立,父进程中对数据的修改并不会反映到子进程中,但内核是共享的,大多数进程间通信方式都是在内核中建立一 ...
《重构--改善既有代码的设计》总结or读后感：重构是程序员的本能
此文写得有点晚,记得去年7月读完的这本书,只是那时没有写文章的意识,也无所谓总结了,现在稍微聊一下吧. 想起写这篇感想,还是前几天看了这么一篇文章研究发现重构软件并不会改善代码质量先从一个大家都有 ...
Android学习笔记(10).布局管理器
布局管理器的几个类都是ViewGroup派生的,用于管理组件的分布和大小,使用布局管理器能够非常好地解决屏幕适配问题. 布局管理器本身也是一个UI组件,布局管理器能够相互嵌套使用,以下是布局管理器的类 ...
HDOJ 4251 The Famous ICPC Team Again
划分树水题..... The Famous ICPC Team Again Time Limit: 30000/15000 MS (Java/Others) Memory Limit: 3276 ...

python爬取大众点评

python爬取大众点评的更多相关文章

随机推荐

热门专题