# -*- coding: utf-8 -*-
import re
from time import sleep

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AngelSpider(CrawlSpider):
    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']

    base_url = "http://angelimg.spbeen.com"
    rules = (
        Rule(LinkExtractor(allow=r'^http://angelimg.spbeen.com/ang/\d+$'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print(response.url)
        # Reuse the item carried over from the previous page, or build a new one on the first page
        item = response.meta.get('item')
        if not item:
            item = {}
            item['files'] = []
            item['file_urls'] = []
            dir_name = response.xpath('.//div[@class="article"]/h2/text()').extract_first()
            item['dir_name'] = dir_name.split('【')[0]
            # Keep only Chinese characters, digits and ASCII letters in the directory name
            item['dir_name'] = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", item['dir_name'])

        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        item['file_urls'].append(img_url)

        # If there is a next page, request it and pass the item along;
        # otherwise hand the finished item over to the pipeline.
        next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()

        # sleep(1)
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(next_url, callback=self.parse_item, meta={'item': item})
        else:
            yield item
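
When the last page of a gallery is reached, the item handed to the pipeline carries every image URL collected along the way plus the cleaned gallery title. As a rough illustration of its shape (the title and URLs below are made up; only the field names match the spider above):

# Hypothetical item yielded by parse_item on a gallery's last page
{
    'dir_name': '示例图集20P',          # cleaned <h2> title, becomes the sub-directory name
    'files': [],                        # filled in by FilesPipeline with the download results
    'file_urls': [                      # one entry appended per page visited
        'http://angelimg.spbeen.com/example/img_001.jpg',   # made-up URL
        'http://angelimg.spbeen.com/example/img_002.jpg',   # made-up URL
    ],
}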

  

Pipeline: inherit from the built-in FilesPipeline and override get_media_requests / file_path so that each gallery is saved into its own directory.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import os

from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class AngelimgPipeline(object):
    def process_item(self, item, spider):
        return item


class DealFilePathPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Attach the item to every download request so file_path can read dir_name
        return [Request(x, meta={'item': item}) for x in item.get(self.files_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        item = request.meta.get('item', {})
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
        print(item)
        # Save each gallery into its own sub-directory instead of the stock
        # FilesPipeline default of 'full/%s%s' % (media_guid, media_ext).
        return 'full2/{}/{}{}'.format(item['dir_name'], media_guid, media_ext)

    # deprecated
    def file_key(self, url):
        return self.file_path(url)

    file_key._base = True
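
For reference, on newer Scrapy releases (2.4 and later) file_path also receives the item as a keyword argument, so the meta={'item': item} detour and the copied deprecation block are no longer needed. A minimal sketch of the same idea, assuming such a Scrapy version rather than the one used above:

# Equivalent pipeline on Scrapy >= 2.4 (sketch, not the version this post runs)
import hashlib
import os

from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class DealFilePathPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # The item is passed in directly; no need to smuggle it through request.meta
        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        media_ext = os.path.splitext(request.url)[1]
        return 'full2/{}/{}{}'.format(item['dir_name'], media_guid, media_ext)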

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for angelImg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'angelImg'

SPIDER_MODULES = ['angelImg.spiders']
NEWSPIDER_MODULE = 'angelImg.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'angelImg (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Referer": "http://angelimg.spbeen.com/",
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'angelImg.pipelines.AngelimgPipeline': 300,
    'angelImg.pipelines.DealFilePathPipeline': 200,
    #'scrapy.pipelines.files.FilesPipeline': 2,
}

FILES_STORE = 'file_doload'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
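
With robots.txt checking disabled, the Referer/User-Agent headers set and DealFilePathPipeline registered, the project is started like any other Scrapy project, normally with `scrapy crawl angel` from the project root; the downloaded images then land under FILES_STORE, i.e. file_doload/full2/<dir_name>/. If you prefer launching it from a script, one common pattern (a sketch using Scrapy's standard CrawlerProcess API) is:

# run.py - start the 'angel' spider programmatically
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads the project's settings.py
process.crawl('angel')                            # spider name defined in AngelSpider.name
process.start()                                   # blocks until the crawl finishes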
