Python files pipeline: downloading image galleries with Scrapy
The spider:

```python
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AngelSpider(CrawlSpider):
    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']
    base_url = "http://angelimg.spbeen.com"

    rules = (
        Rule(LinkExtractor(allow=r'^http://angelimg.spbeen.com/ang/\d+$'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print(response.url)
        # Reuse the item carried over from the previous page of the same gallery,
        # or create a fresh one on the gallery's first page.
        item = response.meta.get('item', None)
        if not item:
            item = {'files': [], 'file_urls': []}
            dir_name = response.xpath('.//div[@class="article"]/h2/text()').extract_first()
            item['dir_name'] = dir_name.split('【')[0]
            # Keep only CJK characters, digits and ASCII letters in the directory name
            item['dir_name'] = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", item['dir_name'])

        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        item['file_urls'].append(img_url)

        # If there is a next page, request it and carry the item along;
        # otherwise hand the finished item over to the pipeline.
        next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
        if next_url:
            yield scrapy.Request(self.base_url + next_url, callback=self.parse_item, meta={'item': item})
        else:
            yield item
```
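Instead of `scrapy crawl angel`, the crawl can also be started from a small runner script. A minimal sketch, assuming the standard layout created by `scrapy startproject angelImg` and that the spider above lives in `angelImg/spiders/angel.py` (that module path is an assumption):

```python
# run.py — place next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from angelImg.spiders.angel import AngelSpider  # assumed module path

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # picks up settings.py
    process.crawl(AngelSpider)
    process.start()  # blocks until the crawl finishes
```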
Pipeline: subclassing FilesPipeline
```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import os

from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class AngelimgPipeline(object):
    def process_item(self, item, spider):
        return item


class DealFilePathPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Attach the item to each media request so file_path() can read dir_name
        return [Request(x, meta={'item': item}) for x in item.get(self.files_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        item = request.meta.get('item', {})
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]
        # Save each image under a per-gallery folder named after dir_name
        return 'full2/{}/{}{}'.format(item['dir_name'], media_guid, media_ext)

    # deprecated
    def file_key(self, url):
        return self.file_path(url)
    file_key._base = True
```
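On newer Scrapy releases (2.4 and later), `file_path()` also receives the item as a keyword argument, so passing it through `request.meta` in `get_media_requests()` should no longer be necessary. A simplified sketch under that assumption; verify it against the Scrapy version actually installed:

```python
import hashlib
import os

from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.python import to_bytes


class DealFilePathPipeline(FilesPipeline):
    # Scrapy 2.4+ passes the item in directly; no meta plumbing needed
    def file_path(self, request, response=None, info=None, *, item=None):
        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        media_ext = os.path.splitext(request.url)[1]
        return 'full2/{}/{}{}'.format(item['dir_name'], media_guid, media_ext)
```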
settings.py
```python
# -*- coding: utf-8 -*-

# Scrapy settings for angelImg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'angelImg'

SPIDER_MODULES = ['angelImg.spiders']
NEWSPIDER_MODULE = 'angelImg.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'angelImg (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Referer": "http://angelimg.spbeen.com/",
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'angelImg.pipelines.AngelimgPipeline': 300,
    'angelImg.pipelines.DealFilePathPipeline': 200,
    #'scrapy.pipelines.files.FilesPipeline': 2,
}

FILES_STORE = 'file_doload'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
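`FILES_STORE` is a relative path here, so it is resolved against the directory the crawl is started from, and with the settings above the images end up under `file_doload/full2/<dir_name>/`. A small sketch for sanity-checking the result after a crawl (paths taken from the pipeline and settings above):

```python
import os

FILES_STORE = 'file_doload'              # same value as in settings.py
galleries = os.path.join(FILES_STORE, 'full2')

# Print each gallery folder and how many images were saved into it
for dir_name in sorted(os.listdir(galleries)):
    count = len(os.listdir(os.path.join(galleries, dir_name)))
    print(dir_name, count)
```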