Some examples based on Scrapy
I. Crawling Douyu streamers
1. Spider file
# -*- coding: utf-8 -*-
import scrapy
import json
from Douyu.items import DouyuItem


class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    # allowed_domains = ['www.xxx.com']
    baseurl = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    # offset is the starting index of each page, counted from 0
    offset = 0
    start_urls = [baseurl + str(offset)]

    def parse(self, response):
        # Parse the response as JSON and check whether any data came back
        data = json.loads(response.text)['data']
        if len(data) == 0:
            return
        # Loop over the data list; each element is a dict describing one streamer
        for each in data:
            name = each['nickname']
            img_url = each['vertical_src']
            # Create an item instance to hold the extracted data
            item = DouyuItem()
            item['name'] = name
            item['img_url'] = img_url
            # Remember to yield the item, otherwise the pipelines never receive it
            yield item
        # Fetch the next page; the empty-data check above stops the recursion
        # once the interface has no more rooms to return
        self.offset += 20
        url = self.baseurl + str(self.offset)
        yield scrapy.Request(url=url, callback=self.parse)
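The parse method assumes the interface returns JSON containing a data list of room dicts with nickname and vertical_src keys. If in doubt, the structure can be checked directly before running the spider; a minimal sketch (assuming the capi.douyucdn.cn endpoint is still reachable, which may no longer be the case):

import json
import requests  # third-party: pip install requests

url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0'
resp = requests.get(url, timeout=10)
data = json.loads(resp.text)['data']
for room in data[:3]:
    # the spider only relies on these two fields
    print(room['nickname'], room['vertical_src'])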
2. Item
import scrapy


class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()      # stores the streamer's nickname
    img_url = scrapy.Field()   # stores the image url
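The field names declared here must match the keys the spider and the pipelines use ('name' and 'img_url'): a Scrapy item behaves like a dict that only accepts its declared fields. A small illustration with hypothetical values, just to show that behaviour:

from Douyu.items import DouyuItem

item = DouyuItem(name='some_streamer', img_url='http://example.com/a.jpg')
print(item['name'])      # some_streamer
# item['foo'] = 1        # would raise KeyError: DouyuItem does not support field: foo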
3. Pipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from Douyu.settings import IMAGES_STORE as images_store


# Text storage
class DouyuPipeline(object):
    f = None

    def open_spider(self, spider):
        self.f = open('./douyu.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        name = item['name']
        img_url = item['img_url']
        self.f.write(name + ":" + img_url + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()


# Image storage
class DouyuImagesPipeline(ImagesPipeline):
    # Take the image url from the item passed in by the spider and request it
    # so the picture gets downloaded. Images are saved under the IMAGES_STORE
    # path configured in settings.py, which you have to set yourself.
    def get_media_requests(self, item, info):
        img_url = item['img_url']
        yield scrapy.Request(img_url)

    # Rename the downloaded image
    def item_completed(self, results, item, info):
        # Standard pattern: collect the storage paths of the images that were
        # downloaded successfully; results holds the status and file info
        img_path = [x['path'] for ok, x in results if ok]
        os.rename(images_store + img_path[0], images_store + item['name'] + '.jpg')
        return item
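The os.rename call in item_completed works, but ImagesPipeline can also be told what file name to use up front by overriding file_path, which avoids the rename step entirely. A minimal sketch of that alternative (the keyword-only item parameter of file_path only exists in recent Scrapy releases, so the name is passed through request.meta instead; the class name is just an illustrative choice):

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class DouyuNamedImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # carry the nickname along with the request so file_path can use it
        yield scrapy.Request(item['img_url'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        # the returned path is relative to IMAGES_STORE
        return request.meta['name'] + '.jpg'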
4. Settings
# -*- coding: utf-8 -*-

# Scrapy settings for Douyu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Douyu'

SPIDER_MODULES = ['Douyu.spiders']
NEWSPIDER_MODULE = 'Douyu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'Douyu.middlewares.DouyuSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'Douyu.middlewares.DouyuDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuPipeline': 300,
    'Douyu.pipelines.DouyuImagesPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Storage path for the downloaded images.
# Setting names must be all uppercase and spelled exactly like this.
IMAGES_STORE = 'D:/scrapy/Douyu/imgs/'
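With the spider, item, pipelines and settings in place, the crawl is started from the project root in the usual way (assuming the standard layout created by scrapy startproject and scrapy genspider):

scrapy crawl douyu
# nicknames and image urls are written to ./douyu.txt by DouyuPipeline,
# and the renamed pictures end up under IMAGES_STORE (D:/scrapy/Douyu/imgs/)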