先看scrapy-redis源码

 class RedisMixin(object):
     """Mixin class to implement reading urls from a redis queue."""

     # Redis key to pop start URLs from; formatted with ``{'name': self.name}``
     # in ``setup_redis``.
     redis_key = None
     # Upper bound on the number of requests produced per ``next_requests`` call.
     redis_batch_size = None
     # Encoding used to decode the raw messages read from redis.
     redis_encoding = None

     # Redis client placeholder.
     server = None

     def start_requests(self):
         """Returns a batch of start requests from redis."""
         return self.next_requests()

     def setup_redis(self, crawler=None):
         """Setup redis connection and idle signal.

         This should be called after the spider has set its crawler object.
         If ``self.server`` is already set the call returns immediately.

         Parameters
         ----------
         crawler : optional
             Object exposing ``settings`` and ``signals``. When omitted,
             ``self.crawler`` is used if present.

         Raises
         ------
         ValueError
             If no crawler is available, if the resolved ``redis_key`` is
             blank, or if ``redis_batch_size`` cannot be converted to ``int``.
         """
         if self.server is not None:
             return

         if crawler is None:
             # We allow optional crawler argument to keep backwards
             # compatibility.
             # XXX: Raise a deprecation warning.
             crawler = getattr(self, 'crawler', None)

         if crawler is None:
             raise ValueError("crawler is required")

         settings = crawler.settings

         if self.redis_key is None:
             self.redis_key = settings.get(
                 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
             )

         # Allow a ``%(name)s`` placeholder in the key.
         self.redis_key = self.redis_key % {'name': self.name}

         if not self.redis_key.strip():
             raise ValueError("redis_key must not be empty")

         if self.redis_batch_size is None:
             # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
             self.redis_batch_size = settings.getint(
                 'REDIS_START_URLS_BATCH_SIZE',
                 settings.getint('CONCURRENT_REQUESTS'),
             )

         try:
             self.redis_batch_size = int(self.redis_batch_size)
         except (TypeError, ValueError):
             raise ValueError("redis_batch_size must be an integer")

         if self.redis_encoding is None:
             self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

         # The message is formatted from the instance ``__dict__``; the closing
         # parenthesis after the encoding was missing in the original message.
         self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                          "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                          self.__dict__)

         self.server = connection.from_settings(crawler.settings)
         # The idle signal is called when the spider has no requests left,
         # that's when we will schedule new requests from redis queue
         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

     def next_requests(self):
         """Yield up to ``redis_batch_size`` requests built from redis messages.

         Messages are popped one at a time with ``SPOP`` when the
         ``REDIS_START_URLS_AS_SET`` setting is true, otherwise with ``LPOP``.
         Iteration stops early as soon as a pop returns nothing. Messages for
         which ``make_request_from_data`` returns a falsy value are logged and
         do not count towards the batch size.
         """
         use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
         fetch_one = self.server.spop if use_set else self.server.lpop
         # XXX: Do we need to use a timeout here?
         found = 0
         # TODO: Use redis pipeline execution.
         while found < self.redis_batch_size:
             data = fetch_one(self.redis_key)
             if not data:
                 # Queue empty.
                 break
             req = self.make_request_from_data(data)
             if req:
                 yield req
                 found += 1
             else:
                 self.logger.debug("Request not made from data: %r", data)

         if found:
             self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

     def make_request_from_data(self, data):
         """Returns a Request instance from data coming from Redis.

         By default, ``data`` is an encoded URL. You can override this method to
         provide your own message decoding.

         Parameters
         ----------
         data : bytes
             Message from redis.

         """
         url = bytes_to_str(data, self.redis_encoding)
         # ``make_requests_from_url`` is not defined in this mixin; it is
         # expected to be provided by the spider class it is mixed into.
         return self.make_requests_from_url(url)

     def schedule_next_requests(self):
         """Schedules a request if available"""
         # TODO: While there is capacity, schedule a batch of redis requests.
         for req in self.next_requests():
             self.crawler.engine.crawl(req, spider=self)

     def spider_idle(self):
         """Schedules a request if available, otherwise waits.

         Always raises ``DontCloseSpider`` after scheduling, so the spider is
         kept open even when the redis queue is currently empty.
         """
         # XXX: Handle a sentinel to close the spider.
         self.schedule_next_requests()
         raise DontCloseSpider


 class RedisSpider(RedisMixin, Spider):
     """Spider that reads urls from redis queue when idle.

     Attributes
     ----------
     redis_key : str (default: REDIS_START_URLS_KEY)
         Redis key where to fetch start URLs from.
     redis_batch_size : int (default: CONCURRENT_REQUESTS)
         Number of messages to fetch from redis on each attempt.
     redis_encoding : str (default: REDIS_ENCODING)
         Encoding to use when decoding messages from redis queue.

     Settings
     --------
     REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
         Default Redis key where to fetch start URLs from.
     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
         Default number of messages to fetch from redis on each attempt.
     REDIS_START_URLS_AS_SET : bool (default: defaults.START_URLS_AS_SET)
         Use SET operations to retrieve messages from the redis queue. If False,
         the messages are retrieved using the LPOP command.
     REDIS_ENCODING : str (default: "utf-8")
         Default encoding to use when decoding messages from redis queue.

     """

     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
         """Create the spider via the parent ``from_crawler`` and set up redis."""
         obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
         obj.setup_redis(crawler)
         return obj


 class RedisCrawlSpider(RedisMixin, CrawlSpider):
     """Spider that reads urls from redis queue when idle.

     Attributes
     ----------
     redis_key : str (default: REDIS_START_URLS_KEY)
         Redis key where to fetch start URLs from.
     redis_batch_size : int (default: CONCURRENT_REQUESTS)
         Number of messages to fetch from redis on each attempt.
     redis_encoding : str (default: REDIS_ENCODING)
         Encoding to use when decoding messages from redis queue.

     Settings
     --------
     REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
         Default Redis key where to fetch start URLs from.
     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
         Default number of messages to fetch from redis on each attempt.
     REDIS_START_URLS_AS_SET : bool (default: defaults.START_URLS_AS_SET)
         Use SET operations to retrieve messages from the redis queue. If False,
         the messages are retrieved using the LPOP command.
     REDIS_ENCODING : str (default: "utf-8")
         Default encoding to use when decoding messages from redis queue.

     """

     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
         """Create the spider via the parent ``from_crawler`` and set up redis."""
         obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
         obj.setup_redis(crawler)
         return obj

仔细看完的话会发现

make_request_from_data(self, data)
这个方法把从redis中取出的数据转换成一个请求实例,默认情况下这份数据只是一个url。
接下来重写这个方法,把一个json串直接传给
self.make_requests_from_url
就好了。
在make_requests_from_url这个方法里面可以解析这个json串,然后请求其中的url或者生成url。
代码如下
     def make_request_from_data(self, data):
         """Build a request from a raw message popped from Redis.

         Parameters
         ----------
         data : bytes
             Message from redis. It is decoded with ``self.redis_encoding`` and
             handed to ``make_requests_from_url`` as a string.

         Returns
         -------
         The value returned by ``make_requests_from_url``.
         """
         company = bytes_to_str(data, self.redis_encoding)
         return self.make_requests_from_url(company)

     def make_requests_from_url(self, company):
         """Build a ``Request`` from a serialized mapping that contains a URL.

         Parameters
         ----------
         company : str
             Serialized mapping with at least a ``"url"`` key, either as a JSON
             object or as a Python dict literal.

         Returns
         -------
         A ``Request`` for ``data["url"]`` with ``self.parse`` as callback, the
         whole decoded mapping stored under ``meta["data"]``, and duplicate
         filtering disabled.

         Raises
         ------
         ValueError, SyntaxError
             If ``company`` is neither valid JSON nor a Python literal.
         KeyError
             If the decoded mapping has no ``"url"`` key.
         """
         import ast
         import json

         # SECURITY: the original used ``eval(company)`` here. The message comes
         # from Redis, i.e. external input, so ``eval`` allowed arbitrary code
         # execution; it also failed on JSON literals such as true/false/null.
         try:
             data = json.loads(company)
         except ValueError:
             # Not strict JSON (e.g. a dict repr with single quotes): accept
             # plain Python literals only, never arbitrary expressions.
             data = ast.literal_eval(company)

         url = data["url"]
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
         }
         # dont_filter=True: the request is not dropped by the duplicate filter.
         return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)

值得注意的是

不能在make_request_from_data方法中直接使用Request(其他第三方的请求类也不支持),否则会导致该方法无法执行,而且不会抛出任何异常。
但是,如果同时重写make_request_from_data和make_requests_from_url这两个方法,就可以正常执行。

scrapy-redis使redis不止保存url的更多相关文章

  1. Redis学习——Redis持久化之AOF备份方式保存数据

    新技术的出现一定是在老技术的基础之上,并且完善了老技术的某一些不足的地方,新技术和老技术就如同JAVA中的继承关系.子类(新技术)比父类(老技术)更加的强大! 在前面介绍了Redis学习--Redis ...

  2. Redis学习——Redis持久化之RDB备份方式保存数据

    从这一个介绍里面知道,redis比memcache作为缓存数据库强大的地方,一个是支持的数据类型比较多,另一个就是redis持久化功能. 下面就介绍Redis的持久化之RDB! 一:什么是redis的 ...

  3. 解决---MISCONF Redis被配置为保存RDB快照,但目前无法在磁盘上存留。可能修改数据集的命令被禁用。请检查Redis日志,了解有关错误的详细信息。

    解决---MISCONF Redis被配置为保存RDB快照,但目前无法在磁盘上存留.可能修改数据集的命令被禁用.请检查Redis日志,了解有关错误的详细信息. 出现bug: 在学习celery,将数据 ...

  4. 8.1 k8s使用PV/PVC做数据持久化运行redis服务,数据保存至NFS

    1.制作redis docker镜像 1.1 准备alpine基础镜像 # 下载 docker pull alpine:3.13 # 更改tag docker tag alpine:3.13 192. ...

  5. 搭建LNAMP环境(五)- PHP7源码安装Redis和Redis拓展

    上一篇:搭建LNAMP环境(四)- 源码安装PHP7 一.安装Redis 1.创建redis用户组和用户 groupadd redis useradd -r -g redis -s /sbin/nol ...

  6. redis配置文件redis.conf中文版

    转账自:http://www.jb51.net/article/50605.htm # Redis示例配置文件 # 注意单位问题:当需要设置内存大小的时候,可以使用类似1k.5GB.4M这样的常见格式 ...

  7. redis配置文件redis.conf中文版(基于2.4)

    转载于:http://www.itxuexiwang.com/a/shujukujishu/redis/2016/0216/99.html?1455869981 代码如下: # Redis示例配置文件 ...

  8. vagrant系列教程(四):vagrant搭建redis与redis的监控程序redis-stat(转)

    上一篇php7环境的搭建 真是火爆,仅仅两天时间,就破了我之前swagger系列的一片文章,看来,大家对搭建环境真是情有独钟. 为了访问量,我今天再来一篇Redis的搭建.当然不能仅仅是redis的搭 ...

  9. Redis配置文件redis.conf参数配置详解

    ########################################## 常规 ########################################## daemonize n ...

随机推荐

  1. java获取类名不包括路径

    class.getSimpleName(),就能获得仅仅的类名 class.getName()获得的是全路径的类名

  2. 多个机器获取微信access-token导致的有效性问题

    多个机器获取微信access-token导致的有效性问题 单个机器获取的access-token,只有最后一个是有效的: 多个机器各自获取自己的access-token,都是各自有效的: 在服务器和本 ...

  3. 恳请CSDN的活动可以落实

    前言:在CSDN举办的"扒一扒你遇到过最NB开发项目"有奖征文活动中有幸获得"最佳评论奖",可是时至今日.也没有收到书籍,咨询CSDN管理员的时候.居然得到&q ...

  4. scp and tar

    scp 命令随记 scp file username@remoteIp:directory 创建tar包 tar zcvf file.tar.gz directory tar zcvf hadoop. ...

  5. test框架搭建

     http://blog.csdn.net/huilan_same  https://github.com/huilansame/Test_framework  

  6. hdfs du命令是算的一份数据

    As you can see, hadoop fsck and hadoop fs -dus report the effective HDFS storage space used, i.e. th ...

  7. [Apple开发者帐户帮助]二、管理你的团队(4)离开一个团队

    您可以随时离开组织的开发团队.但是,帐户持有人有法律责任,只能在指定另一个团队成员作为帐户持有人后离开团队. 如果您是Apple Developer Program中的团队成员,则可以将团队留在App ...

  8. 解决VS2008 开发Wince应用程序项目生成速度慢的问题

    最近用VS2008开发Windows Mobile程序,使用C#..NET Compact Framework,发现项目生成速度比较慢.用VS2008打开项目后,开始一段时间生成速度还能忍受,时间一长 ...

  9. 关于Vue.js去掉#号路由

    正常启动后访问路由: 中间会自动加入一个#号 去掉#号: 在route文件夹下的index.js中加入mode: 'history', ①: ②: 关于mode说明: 默认值: ‘hash‘(浏览器) ...

  10. Python 2:str.title()(使字符串每个单词首字母大写)

    name = "hello,world! hello,python!" print(name.title()) #单词首字母大写 运行结果将会是:Hello,World!Hello ...