Scrapy Learning 23 - Distributed Crawlers
pip install redis
pip install scrapy
pip install scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300
}
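In addition to the three settings above, scrapy-redis has to be told how to reach Redis. A minimal sketch of the connection settings (host, port and URL values are placeholders for your own environment):

REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# or, instead of host/port, a single connection URL:
# REDIS_URL = 'redis://user:password@localhost:6379/0'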
# Based on the basic Spider
from scrapy_redis.spiders import RedisSpider


class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""
    name = 'myspider_redis'
    redis_key = 'myspider:start_urls'

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(MySpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }


# Based on CrawlSpider
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider


class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""
    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(MyCrawler, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
scrapy runspider myspider.py
redis-cli lpush myspider:start_urls http://google.com  # scrapy-redis takes the first URL to crawl from this myspider:start_urls key
While the spider runs, scrapy-redis also maintains the keys myspider:requests (the serialized request queue) and myspider:dupefilter (the set of request fingerprints) in Redis.
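For example, after seeding a start URL you can watch those keys from redis-cli (key names follow the defaults above; the URL is just a placeholder):

redis-cli lpush myspider:start_urls http://example.com
redis-cli zcard myspider:requests     # pending requests (a sorted set with the default priority queue)
redis-cli scard myspider:dupefilter   # fingerprints seen so far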
dupefilter.py

# The dupefilter in scrapy-redis is built on the same base class as Scrapy's own
# dupefilter and implements the same interface.
import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from . import defaults
from .connection import get_redis_from_settings


logger = logging.getLogger(__name__)


# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.
    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key Where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.
        """
        self.server = server  # a redis client object, instantiated in connection.py
        self.key = key
        self.debug = debug
        self.logdupes = True

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.
        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool
        """
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        added = self.server.sadd(self.key, fp)
        return added == 0

    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str
        """
        return request_fingerprint(request)

    @classmethod
    def from_spider(cls, spider):
        settings = spider.settings
        server = get_redis_from_settings(settings)
        dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY)
        key = dupefilter_key % {'spider': spider.name}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    def close(self, reason=''):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional
        """
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Logs given request.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider
        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
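The set-based request_seen above relies on the return value of Redis's SADD: 1 means the fingerprint was new, 0 means it was already stored. A standalone sketch of that behaviour with redis-py (connection details and the fingerprint string are placeholders):

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
key = 'myspider:dupefilter'

print(r.sadd(key, 'example-fingerprint'))  # 1: first time seen
print(r.sadd(key, 'example-fingerprint'))  # 0: duplicate, the request would be filtered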
picklecompat.py

# This is just Python's pickle module: a serialization helper compatible with
# both py2 and py3. The serializer is mainly used later by the scheduler to
# store request objects.
"""A pickle wrapper module with protocol=-1 by default."""

try:
    import cPickle as pickle  # PY2
except ImportError:
    import pickle


def loads(s):
    return pickle.loads(s)


def dumps(obj):
    return pickle.dumps(obj, protocol=-1)
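Any object exposing loads and dumps callables can be handed to the queues below as a serializer; a quick sketch of the round trip with this module (the payload dict is just an example):

from scrapy_redis import picklecompat

payload = {'url': 'http://example.com', 'priority': 0}
encoded = picklecompat.dumps(payload)
assert picklecompat.loads(encoded) == payload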
pipelines.py

# Serializes items and pushes them to redis asynchronously (via deferToThread).
# This pipeline is optional: you can write your own pipeline that stores the
# data locally instead.
from scrapy.utils.misc import load_object
from scrapy.utils.serialize import ScrapyJSONEncoder
from twisted.internet.threads import deferToThread

from . import connection, defaults


default_serialize = ScrapyJSONEncoder().encode


class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue

    Settings
    --------
    REDIS_ITEMS_KEY : str
        Redis key where to store items.
    REDIS_ITEMS_SERIALIZER : str
        Object path to serializer function.
    """

    def __init__(self, server,
                 key=defaults.PIPELINE_KEY,
                 serialize_func=default_serialize):
        """Initialize pipeline.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        key : str
            Redis key where to store items.
        serialize_func : callable
            Items serializer function.
        """
        self.server = server
        self.key = key
        self.serialize = serialize_func

    @classmethod
    def from_settings(cls, settings):
        params = {
            'server': connection.from_settings(settings),
        }
        if settings.get('REDIS_ITEMS_KEY'):
            params['key'] = settings['REDIS_ITEMS_KEY']
        if settings.get('REDIS_ITEMS_SERIALIZER'):
            params['serialize_func'] = load_object(
                settings['REDIS_ITEMS_SERIALIZER']
            )
        return cls(**params)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.serialize(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider.

        Override this function to use a different key depending on the item
        and/or spider.
        """
        return self.key % {'spider': spider.name}
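As noted in the comment above, RedisPipeline is optional; a minimal sketch of a pipeline that appends items to a local JSON-lines file instead (class and file names are made up for illustration):

import codecs
import json


class JsonLinesPipeline(object):
    """Example pipeline: write each item as one JSON line to a local file."""

    def open_spider(self, spider):
        self.file = codecs.open('items_%s.jl' % spider.name, 'a', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item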
queue.py

# Three queue types are defined here: a FIFO queue, a priority queue and a LIFO queue.
from scrapy.utils.reqser import request_to_dict, request_from_dict

from . import picklecompat


class Base(object):
    """Per-spider base queue class"""

    def __init__(self, server, spider, key, serializer=None):
        """Initialize per-spider redis queue.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        spider : Spider
            Scrapy spider instance.
        key: str
            Redis key where to put and get messages.
        serializer : object
            Serializer object with ``loads`` and ``dumps`` methods.
        """
        if serializer is None:
            # Backward compatibility.
            # TODO: deprecate pickle.
            serializer = picklecompat
        if not hasattr(serializer, 'loads'):
            raise TypeError("serializer does not implement 'loads' function: %r"
                            % serializer)
        if not hasattr(serializer, 'dumps'):
            raise TypeError("serializer '%s' does not implement 'dumps' function: %r"
                            % serializer)

        self.server = server
        self.spider = spider
        self.key = key % {'spider': spider.name}
        self.serializer = serializer

    def _encode_request(self, request):
        """Encode a request object"""
        obj = request_to_dict(request, self.spider)
        return self.serializer.dumps(obj)

    def _decode_request(self, encoded_request):
        """Decode an request previously encoded"""
        obj = self.serializer.loads(encoded_request)
        return request_from_dict(obj, self.spider)

    def __len__(self):
        """Return the length of the queue"""
        raise NotImplementedError

    def push(self, request):
        """Push a request"""
        raise NotImplementedError

    def pop(self, timeout=0):
        """Pop a request"""
        raise NotImplementedError

    def clear(self):
        """Clear queue/stack"""
        self.server.delete(self.key)


class FifoQueue(Base):
    """Per-spider FIFO queue"""

    def __len__(self):
        """Return the length of the queue"""
        return self.server.llen(self.key)

    def push(self, request):
        """Push a request"""
        self.server.lpush(self.key, self._encode_request(request))

    def pop(self, timeout=0):
        """Pop a request"""
        if timeout > 0:
            data = self.server.brpop(self.key, timeout)
            if isinstance(data, tuple):
                data = data[1]
        else:
            data = self.server.rpop(self.key)
        if data:
            return self._decode_request(data)


class PriorityQueue(Base):
    """Per-spider priority queue abstraction using redis' sorted set"""

    def __len__(self):
        """Return the length of the queue"""
        return self.server.zcard(self.key)

    def push(self, request):
        """Push a request"""
        data = self._encode_request(request)
        score = -request.priority
        # We don't use zadd method as the order of arguments change depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', self.key, score, data)

    def pop(self, timeout=0):
        """
        Pop a request
        timeout not support in this queue class
        """
        # use atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
        results, count = pipe.execute()
        if results:
            return self._decode_request(results[0])


class LifoQueue(Base):
    """Per-spider LIFO queue."""

    def __len__(self):
        """Return the length of the stack"""
        return self.server.llen(self.key)

    def push(self, request):
        """Push a request"""
        self.server.lpush(self.key, self._encode_request(request))

    def pop(self, timeout=0):
        """Pop a request"""
        if timeout > 0:
            data = self.server.blpop(self.key, timeout)
            if isinstance(data, tuple):
                data = data[1]
        else:
            data = self.server.lpop(self.key)

        if data:
            return self._decode_request(data)


# TODO: Deprecate the use of these names.
SpiderQueue = FifoQueue
SpiderStack = LifoQueue
SpiderPriorityQueue = PriorityQueue
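Which of these three queues the scheduler uses is chosen with SCHEDULER_QUEUE_CLASS (the priority queue is scrapy-redis's default); a minimal sketch:

# settings.py sketch: pick one of the three queue implementations above
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # default; sorted set ordered by -request.priority
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'    # plain list, breadth-first flavour
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'    # plain list, depth-first flavour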
scheduler.py

# The two key methods are enqueue_request and next_request.
import importlib
import six

from scrapy.utils.misc import load_object

from . import connection, defaults


# TODO: add SCRAPY_JOB support.
class Scheduler(object):
    """Redis-based scheduler

    Settings
    --------
    SCHEDULER_PERSIST : bool (default: False)
        Whether to persist or clear redis queue.
    SCHEDULER_FLUSH_ON_START : bool (default: False)
        Whether to flush redis queue on start.
    SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0)
        How many seconds to wait before closing if no message is received.
    SCHEDULER_QUEUE_KEY : str
        Scheduler redis key.
    SCHEDULER_QUEUE_CLASS : str
        Scheduler queue class.
    SCHEDULER_DUPEFILTER_KEY : str
        Scheduler dupefilter redis key.
    SCHEDULER_DUPEFILTER_CLASS : str
        Scheduler dupefilter class.
    SCHEDULER_SERIALIZER : str
        Scheduler serializer.
    """

    def __init__(self, server,
                 persist=False,
                 flush_on_start=False,
                 queue_key=defaults.SCHEDULER_QUEUE_KEY,
                 queue_cls=defaults.SCHEDULER_QUEUE_CLASS,
                 dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY,
                 dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS,
                 idle_before_close=0,
                 serializer=None):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis
            The redis server instance.
        persist : bool
            Whether to flush requests when closing. Default is False.
        flush_on_start : bool
            Whether to flush requests on start. Default is False.
        queue_key : str
            Requests queue key.
        queue_cls : str
            Importable path to the queue class.
        dupefilter_key : str
            Duplicates filter key.
        dupefilter_cls : str
            Importable path to the dupefilter class.
        idle_before_close : int
            Timeout before giving up.
        """
        if idle_before_close < 0:
            raise TypeError("idle_before_close cannot be negative")

        self.server = server
        self.persist = persist
        self.flush_on_start = flush_on_start
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_cls = dupefilter_cls
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.serializer = serializer
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        kwargs = {
            'persist': settings.getbool('SCHEDULER_PERSIST'),
            'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
            'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
        }

        # If these values are missing, it means we want to use the defaults.
        optional = {
            # TODO: Use custom prefixes for this settings to note that are
            # specific to scrapy-redis.
            'queue_key': 'SCHEDULER_QUEUE_KEY',
            'queue_cls': 'SCHEDULER_QUEUE_CLASS',
            'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
            # We use the default setting name to keep compatibility.
            'dupefilter_cls': 'DUPEFILTER_CLASS',
            'serializer': 'SCHEDULER_SERIALIZER',
        }
        for name, setting_name in optional.items():
            val = settings.get(setting_name)
            if val:
                kwargs[name] = val

        # Support serializer as a path to a module.
        if isinstance(kwargs.get('serializer'), six.string_types):
            kwargs['serializer'] = importlib.import_module(kwargs['serializer'])

        server = connection.from_settings(settings)
        # Ensure the connection is working.
        server.ping()

        return cls(server=server, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {'spider': spider.name},
                serializer=self.serializer,
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate queue class '%s': %s",
                             self.queue_cls, e)

        self.df = load_object(self.dupefilter_cls).from_spider(spider)

        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.flush()

    def flush(self):
        self.df.clear()
        self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)
        return True

    def next_request(self):
        block_pop_timeout = self.idle_before_close
        request = self.queue.pop(block_pop_timeout)
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
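The settings listed in the docstring map one-to-one onto the constructor arguments; a sketch of the ones most commonly changed:

# settings.py sketch
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True           # keep the queue and dupefilter in redis when the spider closes
SCHEDULER_FLUSH_ON_START = False   # set True to clear them when the spider starts
SCHEDULER_IDLE_BEFORE_CLOSE = 10   # seconds to wait for new messages before reporting no pending requests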
utils.py
# Compatibility helper for Python 2 and Python 3.
import six


def bytes_to_str(s, encoding='utf-8'):
    """Returns a str if a bytes object is given."""
    if six.PY3 and isinstance(s, bytes):
        return s.decode(encoding)
    return s
spiders.py
# RedisSpider/RedisCrawlSpider inherit from RedisMixin and Spider/CrawlSpider.
# RedisMixin overrides start_requests() and uses its own next_requests() method
# to fetch start URLs from redis.
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from . import connection, defaults
from .utils import bytes_to_str


class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.
        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.
    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: True)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.
    """

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
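As the make_request_from_data docstring says, the messages pushed into the start-urls key do not have to be bare URLs. A sketch of a spider that expects JSON messages instead, so each message can carry extra metadata (the spider name, key and field names are invented for illustration):

import json

from scrapy import Request
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str


class JsonMessageSpider(RedisSpider):
    name = 'json_message_spider'
    redis_key = 'json_message_spider:start_urls'

    def make_request_from_data(self, data):
        # Expect messages like {"url": "...", "category": "..."} pushed into redis.
        message = json.loads(bytes_to_str(data, self.redis_encoding))
        return Request(message['url'], meta={'category': message.get('category')})

    def parse(self, response):
        yield {'url': response.url, 'category': response.meta.get('category')}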
For very large crawls the fingerprint set itself can grow huge, so a Redis-backed Bloom filter (mmh3 hashes plus SETBIT) is a more memory-friendly replacement for the set-based dupefilter.

pip install mmh3
pip install redis
bloomfilter.py

# -*- coding: utf-8 -*-
import math

import mmh3
import redis


class PyBloomFilter(object):
    # 100 built-in random seeds
    SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
             344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
             465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
             481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
             63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518]

    # capacity: estimated number of values to deduplicate
    # error_rate: acceptable false-positive rate
    # conn: redis client connection
    # key: prefix for the key names used in redis
    def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'):
        self.m = math.ceil(capacity * math.log2(math.e) * math.log2(1 / error_rate))  # total number of bits required
        self.k = math.ceil(math.log(2) * self.m / capacity)  # minimum number of hash functions required (k = m/n * ln2)
        self.mem = math.ceil(self.m / 8 / 1024 / 1024)  # memory required, in MB
        # Number of 512MB blocks required; the first character of the value must
        # be ASCII, so there are at most 256 blocks.
        self.blocknum = math.ceil(self.mem / 512)
        self.seeds = self.SEEDS[0:self.k]
        self.key = key
        self.N = 2 ** 31 - 1
        self.redis = conn
        # print(self.mem)
        # print(self.k)

    def add(self, value):
        name = self.key + "_" + str(ord(value[0]) % self.blocknum)
        hashs = self.get_hashs(value)
        for hash in hashs:
            self.redis.setbit(name, hash, 1)

    def is_exist(self, value):
        name = self.key + "_" + str(ord(value[0]) % self.blocknum)
        hashs = self.get_hashs(value)
        exist = True
        for hash in hashs:
            exist = exist & self.redis.getbit(name, hash)
        return exist

    def get_hashs(self, value):
        hashs = list()
        for seed in self.seeds:
            hash = mmh3.hash(value, seed)
            if hash >= 0:
                hashs.append(hash)
            else:
                hashs.append(self.N - hash)
        return hashs


pool = redis.ConnectionPool(host='192.168.1.1', port=6379, db=0)
conn = redis.StrictRedis(connection_pool=pool)


if __name__ == "__main__":
    bf = PyBloomFilter(conn=conn)
    bf.add('www.jobbole.com')
    bf.add('www.zhihu.com')
    print(bf.is_exist('www.zhihu.com'))
    print(bf.is_exist('www.lagou.com'))
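For a sense of scale: with the default capacity=1,000,000,000 and error_rate=1e-8, the expressions in __init__ work out to m ≈ 3.8 × 10^10 bits, or roughly 4.5 GB of bitmap in total, k = 27 hash functions, and blocknum = 9, so the bits are spread over nine Redis string values, each below Redis's 512 MB per-string limit. These are rough figures derived from the formulas above, not measured numbers.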
dupefilter.py (modified to use the Bloom filter)

import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from . import defaults
from .connection import get_redis_from_settings

from ScrapyRedisTest.utils.bloomfilter import conn, PyBloomFilter


logger = logging.getLogger(__name__)


# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.
    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key Where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.
        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True
        self.bf = PyBloomFilter(conn=conn, key=key)

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.
        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool
        """
        fp = self.request_fingerprint(request)
        if self.bf.is_exist(fp):
            return True
        else:
            self.bf.add(fp)
            return False
        # The original set-based check:
        # This returns the number of values added, zero if already exists.
        # added = self.server.sadd(self.key, fp)
        # return added == 0

    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str
        """
        return request_fingerprint(request)

    def close(self, reason=''):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional
        """
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Logs given request.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider
        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
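Because of the relative imports (from . import defaults), this modified dupefilter.py is meant to replace the original file inside a local copy of the scrapy_redis package (an assumption based on the ScrapyRedisTest.utils.bloomfilter import above). Under that assumption, the settings stay exactly as before and simply pick up the new request_seen:

# settings.py sketch, assuming the modified dupefilter.py replaces the one in your scrapy_redis copy
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"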