scrapy-redis使redis不止保存url
先看scrapy-redis源码
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        Parameters
        ----------
        crawler : optional
            Crawler whose settings and signals are used. When omitted, the
            spider's own ``crawler`` attribute is used instead.

        Raises
        ------
        ValueError
            If no crawler is available, if ``redis_key`` is blank after
            formatting, or if ``redis_batch_size`` is not an integer.
        """
        # Already connected: calling setup twice is a no-op.
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        # The key may contain a ``%(name)s`` placeholder for the spider name.
        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none.

        This is a generator: it yields at most ``redis_batch_size`` requests
        and stops early as soon as the redis key yields no data.
        """
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        # Always raised, so the idle spider is kept open to poll redis again.
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        """Build the spider through the parent factory, then connect it to redis."""
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        """Build the spider through the parent factory, then connect it to redis."""
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
仔细看完的话会发现
make_request_from_data(self, data)
这个方法根据从redis中取出的数据返回一个Request实例,默认情况下这个数据是一个编码后的url
接下来重写这个方法,把从redis中取出的json串直接传给
self.make_requests_from_url
然后在make_requests_from_url里解析这个串,从中取出url或者生成url,再构造请求
代码如下
def make_request_from_data(self, data):
    """Build a request from one raw message popped from redis.

    The message is decoded with ``self.redis_encoding`` and handed, as text,
    to ``make_requests_from_url`` which parses it.

    :param data: bytes, message from redis
    :return: whatever ``make_requests_from_url`` returns for the decoded text
    """
    company = bytes_to_str(data, self.redis_encoding)
    return self.make_requests_from_url(company)


def make_requests_from_url(self, company):
    """Parse a serialized payload and build the Request for its ``url``.

    :param company: str, serialized mapping that must contain a ``"url"``
        key; either JSON or a Python dict literal is accepted
    :return: a ``Request`` for ``data["url"]`` with the whole parsed payload
        available to the callback as ``response.meta["data"]``
    :raises ValueError, SyntaxError: if the payload is neither valid JSON nor
        a Python literal
    :raises KeyError: if the parsed payload has no ``"url"`` key
    """
    import ast
    import json

    # SECURITY: this used to be ``eval(company)``. The payload comes from
    # redis, so eval() would execute arbitrary code pushed to the queue, and
    # it also fails on real JSON containing true/false/null. Parse it as
    # data only.
    try:
        data = json.loads(company)
    except ValueError:
        # Not JSON: fall back to a literal-only parse so payloads written
        # with str(dict) (single-quoted keys) keep working.
        data = ast.literal_eval(company)

    url = data["url"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
    }
    # dont_filter=True: the request bypasses the duplicate filter.
    return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)
值得注意的是
在make_request_from_data方法中不能直接使用Request(其他第三方请求库同样不行),否则该方法不会被执行,也不会抛出任何异常
但是同时重写make_request_from_data和make_requests_from_url这两个方法则可以正常执行
scrapy-redis使redis不止保存url的更多相关文章
- Redis学习——Redis持久化之AOF备份方式保存数据
新技术的出现一定是在老技术的基础之上,并且完善了老技术的某一些不足的地方,新技术和老技术就如同JAVA中的继承关系.子类(新技术)比父类(老技术)更加的强大! 在前面介绍了Redis学习--Redis ...
- Redis学习——Redis持久化之RDB备份方式保存数据
从这一个介绍里面知道,redis比memcache作为缓存数据库强大的地方,一个是支持的数据类型比较多,另一个就是redis持久化功能. 下面就介绍Redis的持久化之RDB! 一:什么是redis的 ...
- 解决---MISCONF Redis被配置为保存RDB快照,但目前无法在磁盘上存留。可能修改数据集的命令被禁用。请检查Redis日志,了解有关错误的详细信息。
解决---MISCONF Redis被配置为保存RDB快照,但目前无法在磁盘上存留.可能修改数据集的命令被禁用.请检查Redis日志,了解有关错误的详细信息. 出现bug: 在学习celery,将数据 ...
- 8.1 k8s使用PV/PVC做数据持久化运行redis服务,数据保存至NFS
1.制作redis docker镜像 1.1 准备alpine基础镜像 # 下载 docker pull alpine:3.13 # 更改tag docker tag alpine:3.13 192. ...
- 搭建LNAMP环境(五)- PHP7源码安装Redis和Redis拓展
上一篇:搭建LNAMP环境(四)- 源码安装PHP7 一.安装Redis 1.创建redis用户组和用户 groupadd redis useradd -r -g redis -s /sbin/nol ...
- redis配置文件redis.conf中文版
转账自:http://www.jb51.net/article/50605.htm # Redis示例配置文件 # 注意单位问题:当需要设置内存大小的时候,可以使用类似1k.5GB.4M这样的常见格式 ...
- redis配置文件redis.conf中文版(基于2.4)
转载于:http://www.itxuexiwang.com/a/shujukujishu/redis/2016/0216/99.html?1455869981 代码如下: # Redis示例配置文件 ...
- vagrant系列教程(四):vagrant搭建redis与redis的监控程序redis-stat(转)
上一篇php7环境的搭建 真是火爆,仅仅两天时间,就破了我之前swagger系列的一片文章,看来,大家对搭建环境真是情有独钟. 为了访问量,我今天再来一篇Redis的搭建.当然不能仅仅是redis的搭 ...
- Redis配置文件redis.conf参数配置详解
########################################## 常规 ########################################## daemonize n ...
随机推荐
- [bzoj1007][HNOI2008]水平可见直线_单调栈
水平可见直线 bzoj-1007 HNOI-2008 题目大意:给你n条直线,为你从上往下看能看见多少跳直线. 注释:能看见一条直线,当且仅当这条直线上存在一条长度>0的线段使得这条线段上方没有 ...
- 洛谷 P1120 小木棍 [数据加强版]
P1120 小木棍 [数据加强版] 题目描述 乔治有一些同样长的小木棍,他把这些木棍随意砍成几段,直到每段的长都不超过50. 现在,他想把小木棍拼接成原来的样子,但是却忘记了自己开始时有多少根木棍和它 ...
- Oracle-统计数据库表数据总数量
create or replace procedure prc_table_count(p_flag out varchar2) AS TCOUNT number; SCOUNT number; CO ...
- Linux下统计某个目录的文件个数(转)
1.统计某文件夹下文件个数,不包括子文件夹 比如:统计/home下.jpeg文件的个数 ls -l "/home" | grep ".jpeg" | wc -l ...
- 完全卸载SQL Server 2008 R2(转)
系统:Windows 10 以下方法转自:http://www.cnblogs.com/qanholas/p/3804123.html 1.在控制面板卸载Miscrosoft SQL Server 2 ...
- Java:解决Servlet的UTF8编码问题
要让Servlet支持UTF8,需要在doGet或者doPost中添加如下一条语句: request.setCharacterEncoding("UTF-8");
- leetcode第一刷_Minimum Window Substring
好题.字符串.线性时间. 我认为第一次拿到这个题的人应该不会知道该怎么做吧,要么就是我太弱了..先搞清楚这个题要求的是什么.从一个长字符串中找一个字串,这个字串中的字符全然包括了另一个给定目标串中的字 ...
- Codeforces Round #388 (Div. 2) C. Voting
题意:有n个人,每个人要么是属于D派要么就是R派的.从编号1开始按顺序,每个人都有一次机会可以剔除其他任何一个人(被剔除的人就不在序列中也就失去了剔除其他人的机会了):当轮完一遍后就再次从头从仅存的人 ...
- Android将图像转换成流存储与将流转换成图像
1.将图片转换成二进制流 public byte[] getBitmapByte(Bitmap bitmap){ ByteArrayOutputStream out = new ByteArrayOu ...
- Codeforces--630E--A rectangle(规律)
E - A rectangle Crawling in process... Crawling failed Time Limit:500MS Memory Limit:65536KB ...