# -*- coding: utf-8 -*-
import re
from time import sleep

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AngelSpider(CrawlSpider):
    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']
    base_url = "http://angelimg.spbeen.com"

    rules = (
        Rule(LinkExtractor(allow=r'^http://angelimg.spbeen.com/ang/\d+$'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print(response.url)
        # Reuse the item carried over from the previous page, if any;
        # otherwise this is the first page of a gallery, so build a new one.
        item = response.meta.get('item', False)
        if not item:
            item = {}
            item['files'] = []
            item['file_urls'] = []
            dir_name = response.xpath('.//div[@class="article"]/h2/text()').extract_first()
            item['dir_name'] = dir_name.split('【')[0]
            # Strip everything except CJK characters, digits and ASCII letters
            # so the gallery title is safe to use as a directory name.
            item['dir_name'] = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", item['dir_name'])

        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        item['file_urls'].append(img_url)

        # If there is a next page, request it and carry the item along;
        # otherwise the gallery is complete, so hand the item to the pipeline.
        next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
        # sleep(1)
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(next_url, callback=self.parse_item, meta={'item': item})
        else:
            yield item
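
To sanity-check these XPaths before running the full crawl, scrapy shell is convenient. The gallery id below is made up for illustration:

scrapy shell http://angelimg.spbeen.com/ang/4968

>>> response.xpath('.//div[@class="article"]/h2/text()').extract_first()
>>> response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
>>> response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()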

  

Pipeline (subclassing FilesPipeline)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import os

from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class AngelimgPipeline(object):
    def process_item(self, item, spider):
        return item


class DealFilePathPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Attach the item to each download request so file_path() can
        # read dir_name back out of request.meta.
        return [Request(x, meta={'item': item}) for x in item.get(self.files_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        item = request.meta.get('item', {})
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
        # Group each gallery's files under its own directory instead of the
        # default 'full/%s%s' % (media_guid, media_ext) layout.
        return 'full2/{}/{}{}'.format(item['dir_name'], media_guid, media_ext)

    def file_key(self, url):
        return self.file_path(url)
    file_key._base = True
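
On Scrapy 2.4 and later, file_path() receives the item directly as a keyword argument, so the meta round-trip in get_media_requests() and the deprecation shim copied from the old FilesPipeline source can both be dropped. A minimal sketch, assuming Scrapy 2.4+:

import hashlib
import os

from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class DealFilePathPipeline(FilesPipeline):
    # Scrapy >= 2.4 passes the item straight into file_path(),
    # so the default get_media_requests() can be kept as-is.
    def file_path(self, request, response=None, info=None, *, item=None):
        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        media_ext = os.path.splitext(request.url)[1]
        return 'full2/{}/{}{}'.format(item['dir_name'], media_guid, media_ext)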

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for angelImg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'angelImg'

SPIDER_MODULES = ['angelImg.spiders']
NEWSPIDER_MODULE = 'angelImg.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'angelImg (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Referer": "http://angelimg.spbeen.com/",
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'angelImg.pipelines.AngelimgPipeline': 300,
    'angelImg.pipelines.DealFilePathPipeline': 200,
    #'scrapy.pipelines.files.FilesPipeline': 2,
}
FILES_STORE = 'file_doload'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
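
With these settings, running the crawl from the project root should drop each gallery into its own folder under FILES_STORE; the layout below is illustrative:

scrapy crawl angel

# expected output layout (gallery name and hash are made up):
# file_doload/full2/<dir_name>/<sha1-of-url>.jpg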
