1.商品爬取

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# Created on 2019-02-02 08:59:40

# Project: oneDrug

from pyspider.libs.base_handler import *

from pymongo import MongoClient

import re

class Handler(BaseHandler):
    """pyspider project that scrapes product pages of 111.com.cn into MongoDB.

    Crawl chain: ``on_start`` -> ``categories_page`` -> ``cagetory_list_page``
    (which also follows its own "next page" link) -> ``detail_page``, which
    stores one document per product in the ``goods`` collection of the
    ``drug`` database. Every request is issued with ``fetch_type='js'`` and
    ``validate_cert=False``.
    """

    crawl_config = {
    }

    def __init__(self):
        super(Handler, self).__init__()
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug

    def insert_goods(self, data):
        """Insert or replace the product document identified by ``data['goods_id']``."""
        collection = self.drug['goods']
        # replace_one(..., upsert=True) is the supported equivalent of the
        # legacy Collection.update(filter, document, True) call.
        collection.replace_one({'goods_id': data['goods_id']}, data, upsert=True)

    def insert_comments(self, data):
        """Insert one comment document into the ``comments`` collection."""
        collection = self.drug['comments']
        collection.insert_one(data)

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: queue the category index page."""
        self.crawl('https://www.111.com.cn/categories/', callback=self.categories_page,
                   validate_cert=False, fetch_type='js')

    @config(age=10 * 24 * 60 * 60)
    def categories_page(self, response):
        """Queue every category link found on the category index page."""
        for each in response.doc('.allsort em > a').items():
            self.crawl(each.attr.href, callback=self.cagetory_list_page,
                       validate_cert=False, fetch_type='js')

    @config(priority=1)
    def cagetory_list_page(self, response):
        """Queue every product on a listing page, then the next listing page."""
        product_links = response.doc(
            '#itemSearchList a[target="_blank"][class="product_pic pro_img"]')
        for each in product_links.items():
            self.crawl(each.attr.href, callback=self.detail_page,
                       validate_cert=False, fetch_type='js')

        next_href = response.doc('#search_table > div.turnPageBottom > a.page_next').attr.href
        # The last listing page has no "next" link; do not queue an empty URL.
        if next_href:
            self.crawl(next_href, callback=self.cagetory_list_page,
                       validate_cert=False, fetch_type='js')

    @config(priority=2)
    def detail_page(self, response):
        """Scrape one product page and upsert the result via ``insert_goods``.

        All values are stored as the raw text of the matched elements; an
        element that is missing from the page yields an empty string.
        """

        def text_of(selector):
            return response.doc(selector).text()

        # Selector templates for the repeated page structures.
        breadcrumb = 'body > div.wrap.clearfix > div > span:nth-child({}) > a'
        intro_cell = ('#tabCon > div:nth-child(1) > div.goods_intro > table > tbody'
                      ' > tr:nth-child({}) > td:nth-child({})')
        manual_cell = '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > {}'
        rating_tab = '#productExperience > div > ul > li:nth-child({}) > a > span'

        goods_id = text_of('#gallery_view > ul > li.item_number')
        cagetory_one = text_of(breadcrumb.format(3))
        cagetory_two = text_of(breadcrumb.format(5))
        cagetory_three = text_of(breadcrumb.format(7))
        merchants = text_of('div.middle_property > span:nth-child(1)')
        goods_name = text_of('div.middle_property > h1')
        goods_desc = text_of('div.middle_property > span.red.giftRed')
        goods_price = text_of(
            'div.middle_property > div.shangpin_info > dl:nth-child(2) > dd > span.good_price')

        brand = text_of(intro_cell.format(2, 2))
        spec = text_of(intro_cell.format(2, 4))
        weight = text_of(intro_cell.format(3, 2))
        manufacturers = text_of(intro_cell.format(3, 4))
        approval_number = text_of(intro_cell.format(4, 2))
        drug_type = text_of(intro_cell.format(4, 4))

        # The instruction sheet is a table of <th>/<td> rows; rows 3..21 are
        # read only when the table has a header in its first row.
        instructions = {}
        if text_of(manual_cell.format(1, 'th')):
            for row in range(3, 22):
                instructions_key = text_of(manual_cell.format(row, 'th')).split(" ")[0]
                # Rows that do not exist yield an empty header; skip them so
                # they do not pile up under a single '' key.
                if not instructions_key:
                    continue
                instructions[instructions_key] = text_of(manual_cell.format(row, 'td'))

        total_comments = text_of('#itemComments > span')
        good_comments = text_of(rating_tab.format(2))
        mid_comments = text_of(rating_tab.format(3))
        bad_comments = text_of(rating_tab.format(4))

        # The second run of digits in the URL is used as the product id
        # (presumably the first run is the "111" of the host name).
        url_id = re.findall(r'\d+', response.url)[1]

        goods_data = {
            'url_id': url_id,
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_desc': goods_desc,
            'goods_price': goods_price,
            'merchants': merchants,
            # Each level needs its own key: three identical '' keys would keep
            # only the last value.
            'cagetory': {
                'cagetory_one': cagetory_one,
                'cagetory_two': cagetory_two,
                'cagetory_three': cagetory_three,
            },
            'drug_detail': {
                'brand': brand,
                'spec': spec,
                'weight': weight,
                'manufacturers': manufacturers,
                'approval_number': approval_number,
                'drug_type': drug_type,
            },
            'instructions': instructions,
            'comments': {
                'total_comments': total_comments,
                'good_comments': good_comments,
                'mid_comments': mid_comments,
                'bad_comments': bad_comments,
            },
        }
        self.insert_goods(goods_data)

2.评论爬取

from pymongo import MongoClient

import requests

from bs4 import BeautifulSoup

import re

import socket

class Drug:
    """Crawl product reviews from 111.com.cn into MongoDB.

    Reads product documents from the ``goods`` collection of the ``drug``
    database and writes one document per review into ``comments``.
    """

    _REVIEW_URL = 'https://www.111.com.cn/interfaces/review/list/html.action'

    def __init__(self):
        # The attribute name ``clint`` (sic) is kept unchanged.
        self.clint = MongoClient('mongodb://localhost:27017')
        self.drug = self.clint.drug
        self.collection = self.drug['goods']
        self.comm_collection = self.drug['comments']

    def dbmodify(self):
        """Strip the label prefixes from ``goods_id`` and ``goods_price``.

        Keeps only the text after the full-width colon in ``goods_id`` and
        after the yen sign in ``goods_price``. Documents whose values do not
        contain the separator are left untouched.
        """
        for data in self.collection.find({}, {"goods_id": 1, "goods_price": 1}):
            try:
                goods_id = data['goods_id'].split("：")[1]
                price = data['goods_price'].split("￥")[1]
            except IndexError:
                # No separator in the value: nothing to strip for this document.
                continue
            self.collection.update_one(
                {'_id': data['_id']},
                {'$set': {'goods_id': goods_id, 'goods_price': price}})
            print(data['_id'], goods_id, price)

    def _fetch_review_html(self, goods_id, page_index, retries):
        """Return the HTML of one review-list page.

        Raises:
            requests.exceptions.RequestException: if the request fails.
        """
        params = {
            'goodsId': goods_id,
            'pageIndex': page_index,
            'score': '1&_19020301'
        }
        # Certificate verification is disabled below, so silence the warning.
        requests.packages.urllib3.disable_warnings()
        with requests.Session() as session:
            # Retries must be configured on the transport adapter; assigning
            # requests.adapters.DEFAULT_RETRIES does not enable them.
            adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            # Ask the server not to keep the connection alive.
            session.headers['Connection'] = 'close'
            response = session.get(self._REVIEW_URL, params=params, timeout=5, verify=False)
            return response.text

    def getBaseArgument(self, goods_id):
        """Mark the product as visited and return its number of review pages.

        Returns:
            The page count as a string of digits, ``"No Comments!"`` when the
            page reports no reviews, or ``None`` when the request failed or no
            page count could be found.
        """
        # NOTE(review): the flag is set before the request, so a product whose
        # request fails is not selected again by getComments.
        self.collection.update_one({'url_id': goods_id}, {'$set': {'commspider': True}})
        try:
            html = self._fetch_review_html(goods_id, 1, retries=5)
        except requests.exceptions.RequestException as e:
            print(e)
            return None

        soup = BeautifulSoup(html, 'html.parser')
        if soup.find_all("div", class_="view_no_result"):
            return "No Comments!"

        page_labels = soup.find_all(text=re.compile(r'共\d+页'))
        if not page_labels:
            # Neither a "no result" marker nor a page count: report failure
            # instead of raising IndexError and aborting the whole crawl.
            return None
        return re.findall(r'\d+', page_labels[0])[0]

    def getCommlist(self, goods_id, total_page):
        """Store every review on pages ``1..total_page`` of one product.

        Args:
            goods_id: product id sent as the ``goodsId`` query parameter.
            total_page: page count as returned by ``getBaseArgument``.

        Returns:
            ``"No Comments! Try next!"`` when ``total_page`` is not a number
            (including ``None``), otherwise ``None``.
        """
        try:
            last_page = int(total_page)
        except (TypeError, ValueError):
            # TypeError covers the None that getBaseArgument returns on failure.
            return "No Comments! Try next!"

        # Pages are 1-based and the last page is included.
        for page in range(1, last_page + 1):
            try:
                html = self._fetch_review_html(goods_id, page, retries=15)
            except requests.exceptions.RequestException as e:
                print(e)
                continue

            soup = BeautifulSoup(html, 'html.parser')
            for tr in soup.find_all("tr"):
                comments = {}
                try:
                    comments['goodsId'] = goods_id
                    comments['content'] = tr.find('p').text.strip()
                    comments['date'] = tr.find('p', attrs={'class': 'eval_date'}).text.strip()
                    self.comm_collection.insert_one(comments)
                except Exception:
                    # Best effort per row: a row without the expected <p> tags
                    # (or a failed insert) is reported and skipped.
                    print("{}Have some problem!\n".format(goods_id))
                print(comments)

    def getComments(self):
        """Crawl the reviews of every product whose ``commspider`` flag is False."""
        # Materialize the ids first: the documents are updated while crawling.
        goods_list = [
            data['url_id']
            for data in self.collection.find({'commspider': False}, {"url_id": 1})
        ]
        length = len(goods_list)
        print("总共 {} 条商品".format(length))

        for i, good in enumerate(goods_list, 1):
            total_page = self.getBaseArgument(good)
            comments = self.getCommlist(good, total_page)
            print("总共 {} 条商品\n目前第 {} 条\n商品编号 {} \n".format(length, i, good))
            print(comments)

# Script entry point: build the crawler and run the review crawl immediately.
# getComments() has no return statement, so `test` ends up as None.
drug_crawler = Drug()
test = drug_crawler.getComments()

基于pyspider爬取1药网(转载)的更多相关文章

Python爬取中国天气网
Python爬取中国天气网基于requests库制作的爬虫. 使用方法:打开终端输入 “python3 weather.py 北京(或你所在的城市)" 程序正常运行需要在同文件夹下加入一个 ...
爬取西刺网的免费IP
在写爬虫时,经常需要切换IP,所以很有必要自已在数据维护库中维护一个IP池,这样,就可以在需用的时候随机切换IP,我的方法是爬取西刺网的免费IP,存入数据库中,然后在scrapy 工程中加入tools ...
python爬虫基础应用----爬取校花网视频
一.爬虫简单介绍爬虫是什么? 爬虫是首先使用模拟浏览器访问网站获取数据,然后通过解析过滤获得有价值的信息,最后保存到到自己库中的程序. 爬虫程序包括哪些模块? python中的爬虫程序主要包括,re ...
selenium爬取煎蛋网
selenium爬取煎蛋网直接上代码 from selenium import webdriver from selenium.webdriver.support.ui import WebDriv ...
Scrapy实战篇（一）之爬取链家网成交房源数据（上）
今天,我们就以链家网南京地区为例,来学习爬取链家网的成交房源数据. 这里推荐使用火狐浏览器,并且安装firebug和firepath两款插件,你会发现,这两款插件会给我们后续的数据提取带来很大的方便. ...
（python爬取小故事网并写入mysql）
前言: 这是一篇来自整理EVERNOTE的笔记所产生的小博客,实现功能主要为用广度优先算法爬取小故事网,爬满100个链接并写入mysql,虽然CS作为双学位已经修习了三年多了,但不仅理论知识一般,动手 ...
Python Scrapy 爬取煎蛋网妹子图实例（一）
前面介绍了爬虫框架的一个实例,那个比较简单,这里在介绍一个实例爬取煎蛋网妹子图,遗憾的是上周煎蛋网还有妹子图了,但是这周妹子图变成了随手拍, 不过没关系,我们爬图的目的是为了加强实战应用,管 ...
利用Python网络爬虫爬取学校官网十条标题
利用Python网络爬虫爬取学校官网十条标题案例代码: # __author : "J" # date : 2018-03-06 # 导入需要用到的库文件 import urll ...
Python的scrapy之爬取顶点小说网的所有小说
闲来无事用Python的scrapy框架练练手,爬取顶点小说网的所有小说的详细信息. 看一下网页的构造: tr标签里面的 td 使我们所要爬取的信息下面是我们要爬取的二级页面小说的简介信息: 下面 ...

随机推荐

468C Hack it!
传送门题目大意分析 here 对于最后求p的过程我想再说一下那个45就是最前一位分别是0~9,所以总贡献就是45乘上每一种数开头对应多少种情况而后面的10则是他前面可以填多少不同的数对他做的贡 ...
Browsersync 简介 and 使用
简介省时的浏览器同步测试工具,Browsersync能让浏览器实时.快速响应您的文件更改(html.js.css.sass.less等)并自动刷新页面. 曾经我们每改一次的代码,都需要手动去刷新一次 ...
excel中的绝对引用和相对应用
六.相对引用和绝对引用 1.相对引用单元格或单元格区域的相对引用是指相对于包含公式的单元格的相对位置.例如,单元格 B2 包含公式＝A1 ;Excel 将在距单元格 B2 上面一个单元 ...
Java IO输入输出流 FileWriter 字符流
字节缓冲流 //为什么要使用包装流,使用包装流是为了提高读写操作的性能. public class Packing_flowDemo { public static void main(String[ ...
面试题：测试给定的list，使用for，foreach，iterator删除元素的不同表现
上代码: 1. 使用增强for循环(foreach) package com.xxx; import java.util.ArrayList; import java.util.List; /** * ...
Lucene的基本概念----转载yufenfei的文章
Lucene的基本概念 Lucene是什么? Lucene是一款高性能.可扩展的信息检索工具库.信息检索是指文档搜索.文档内信息搜索或者文档相关的元数据搜索等操作. 信息检索流程如下: 1. 将即将检 ...
ERC230 VS ERC223
ERC223对ERC220的改进 ERC223是以太坊上最新的代币(token)接口标准,主要是为了解决ERC220代币转账丢失问题,那么怎么解决的呢,一起来看看. 1. ERC220 存在问题 ER ...
VIN-Fusion config with Realsense D435i
### First shot Copy the .launch file in package VINS-Fusion to the directory of realsense2_cameara/l ...
OC - runtime 之关联对象
header{font-size:1em;padding-top:1.5em;padding-bottom:1.5em} .markdown-body{overflow:hidden} .markdo ...
八大排序算法的python实现（八）简单选择排序
代码: #coding:utf-8 #author:徐卜灵 # L = [6, 3, 2, 32, 5, 4] def Select_sort(L): for i in range(0,len(L)) ...

基于pyspider爬取1药网(转载)

1.商品爬取

2.评论爬取

基于pyspider爬取1药网(转载)的更多相关文章

随机推荐

热门专题