First, take a look at the scrapy-redis source code:

class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: True)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
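
To see how this mixin is wired up in practice: the spider pops messages from the Redis key redis_key (falling back to REDIS_START_URLS_KEY, i.e. "<spider.name>:start_urls") in batches of redis_batch_size, using LPOP or SPOP depending on REDIS_START_URLS_AS_SET. A minimal sketch of a project using it follows; the spider name, key, and settings values are illustrative assumptions, not anything from this post:

# settings.py -- illustrative values only
REDIS_URL = 'redis://localhost:6379/0'   # connection.from_settings builds the client from this
REDIS_START_URLS_AS_SET = False          # False: LPOP from a list; True: SPOP from a set
REDIS_START_URLS_BATCH_SIZE = 16         # defaults to CONCURRENT_REQUESTS when unset

# spiders/company.py
from scrapy_redis.spiders import RedisSpider

class CompanySpider(RedisSpider):
    name = 'company'
    redis_key = 'company:start_urls'     # omit to fall back to '%(name)s:start_urls'

    def parse(self, response):
        yield {'url': response.url}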

Reading through this carefully, you will notice make_request_from_data(self, data).

This method turns a message popped from Redis into a Request instance, and by default that message is just an encoded URL. The idea is to override it so that a JSON string is passed straight through to self.make_requests_from_url, then override that method as well to parse the string and build (or generate) the URL to request.

The code is as follows:
    # These methods live on the spider class; at module level they rely on:
    #   import json
    #   from scrapy import Request
    #   from scrapy_redis.utils import bytes_to_str

    def make_request_from_data(self, data):
        '''
        :param data: bytes, message from redis
        '''
        company = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(company)

    def make_requests_from_url(self, company):
        # The message is a JSON string, so json.loads is used here rather than eval.
        data = json.loads(company)
        url = data["url"]
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)
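
With this override in place, what gets pushed into Redis is no longer a bare URL but a JSON string. A minimal sketch of seeding the queue with redis-py; the key name and message fields are just examples matching the methods above:

import json

import redis

r = redis.Redis(host='localhost', port=6379, db=0)

# make_requests_from_url() will json.loads this message, request data["url"],
# and carry the remaining fields along in meta["data"].
message = {"url": "https://example.com/company/123", "company_id": 123}
r.lpush('company:start_urls', json.dumps(message))

If REDIS_START_URLS_AS_SET is enabled, push with r.sadd instead of r.lpush.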

It is worth noting:

You cannot construct the Request directly inside make_request_from_data (other third-party request classes are not supported there either); doing so causes the method to silently never run, without raising any exception.
Overriding both make_request_from_data and make_requests_from_url together, as shown above, does work.
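
Because the Request above carries the decoded message in meta={"data": data}, the callback can read those extra fields back out. A small sketch of what parse might look like; the field names are assumptions taken from the example message, not from the original post:

    def parse(self, response):
        # Extra fields pushed through Redis travel with the request via meta.
        data = response.meta["data"]
        yield {
            "company_id": data.get("company_id"),
            "url": response.url,
            "title": response.css("title::text").get(),
        }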
