taobao.py

# -*- coding: utf-8 -*-
from scrapy import Spider, Request
from urllib.parse import quote
from scrapysplashtest.items import ProductItem
from scrapy_splash import SplashRequest

script = """
function main(splash, args)
  splash.images_enabled = false
  assert(splash:go(args.url))
  assert(splash:wait(args.wait))
  js = string.format("document.querySelector('#mainsrp-pager div.form > input').value=%d;document.querySelector('#mainsrp-pager div.form > span.btn.J_Submit').click()", args.page)
  splash:evaljs(js)
  assert(splash:wait(args.wait))
  return splash:html()
end
"""


class TaobaoSpider(Spider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                yield SplashRequest(url, callback=self.parse, endpoint='execute',
                                    args={'lua_source': script, 'page': page, 'wait': 7})

    def parse(self, response):
        products = response.xpath(
            '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]')
        for product in products:
            item = ProductItem()
            item['price'] = ''.join(
                product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
            item['title'] = ''.join(
                product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
            item['shop'] = ''.join(
                product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            item['image'] = ''.join(product.xpath(
                './/div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            item['deal'] = product.xpath(
                './/div[contains(@class, "deal-cnt")]//text()').extract_first()
            item['location'] = product.xpath(
                './/div[contains(@class, "location")]//text()').extract_first()
            yield item

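Before wiring the Lua source into Scrapy it can be convenient to try it against Splash's /execute HTTP endpoint directly. Below is a minimal sketch, not part of the project, which assumes Splash is already listening on http://localhost:8050 and imports the script variable defined above; the keyword and page number are illustrative only.

import requests
from urllib.parse import quote

from scrapysplashtest.spiders.taobao import script  # the Lua source defined above

# Extra keys in the JSON payload are exposed to the script as args.page / args.wait.
payload = {
    'lua_source': script,
    'url': 'https://s.taobao.com/search?q=' + quote('iPad'),
    'page': 2,
    'wait': 7,
}
response = requests.post('http://localhost:8050/execute', json=payload)
print(response.text[:1000])  # rendered HTML returned by splash:html()
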
items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ProductItem(Item):
    collection = 'products'

    image = Field()
    price = Field()
    deal = Field()
    title = Field()
    shop = Field()
    location = Field()
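
Note that collection is a plain class attribute rather than a Field, so it never appears in the scraped data; the MongoPipeline below reads it via item.collection to pick the MongoDB collection. A quick illustration with hypothetical values:

from scrapysplashtest.items import ProductItem

item = ProductItem(title='iPad', price='3588.00')  # hypothetical field values
print(item.collection)  # 'products' - read by the pipeline, not a scraped field
print(dict(item))       # only the Field values: {'title': ..., 'price': ...}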

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrapysplashtestSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
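
This file is the unmodified template that scrapy startproject generates; ScrapysplashtestSpiderMiddleware is never enabled, since the SPIDER_MIDDLEWARES setting below only registers scrapy-splash's SplashDeduplicateArgsMiddleware.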

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'),
                   mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db[item.collection].insert(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
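
Collection.insert() is deprecated in pymongo 3.x and removed in 4.x, so with a recent pymongo the same write is done with insert_one(). A minimal standalone sketch, assuming a local MongoDB and the connection values from settings.py; the document itself is illustrative:

import pymongo

client = pymongo.MongoClient('localhost')  # MONGO_URI
db = client['taobao']                      # MONGO_DB
# insert_one() is the current single-document equivalent of insert()
db['products'].insert_one({'title': 'iPad', 'price': '3588.00'})
client.close()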

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for scrapysplashtest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapysplashtest'

SPIDER_MODULES = ['scrapysplashtest.spiders']
NEWSPIDER_MODULE = 'scrapysplashtest.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'scrapysplashtest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapysplashtest.pipelines.MongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

KEYWORDS = ['iPad']

MAX_PAGE = 100

SPLASH_URL = 'http://localhost:8050'

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

MONGO_URI = 'localhost'
MONGO_DB = 'taobao'
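
With these settings in place the project expects a Splash instance at http://localhost:8050 (for example, docker run -p 8050:8050 scrapinghub/splash) and a MongoDB server on localhost; the crawl is then started from the project root with scrapy crawl taobao.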
