I. Scraping Douyu streamers

 1. Spider file

# -*- coding: utf-8 -*-
import scrapy
import json
from Douyu.items import DouyuItem


class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    # allowed_domains = ['www.xxx.com']
    baseurl = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    # The offset is the starting index of each page of results, beginning at 0
    offset = 0
    start_urls = [baseurl + str(offset)]

    def parse(self, response):
        # Parse the response as JSON and stop paginating once 'data' comes back empty
        data = json.loads(response.text)['data']
        if len(data) == 0:
            return
        # Loop over the data list; each element is a dict describing one streamer
        for each in data:
            name = each['nickname']
            img_url = each['vertical_src']
            # Instantiate an item object to hold the scraped fields
            item = DouyuItem()
            item['name'] = name
            item['img_url'] = img_url
            # Remember to yield the item here, or the pipelines never receive any data
            yield item

        # Request the next page of data. This cannot loop forever, because the
        # empty-data check above returns as soon as the API runs out of results.
        self.offset += 20
        url = self.baseurl + str(self.offset)
        yield scrapy.Request(url=url, callback=self.parse)
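
For orientation, each element of the data list returned by getVerticalRoom is a dict with one streamer's info. A rough sketch of the two fields the spider actually reads is below; the values are invented placeholders, not real API output:

# Hypothetical shape of one entry in 'data' (field names taken from the spider
# above; values are placeholders):
entry = {
    "nickname": "some_streamer",                     # streamer nickname
    "vertical_src": "http://example.com/cover.jpg",  # vertical cover image URL
    # ...plus other fields this spider does not use
}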

  2. Item

import scrapy


class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()      # stores the nickname
    img_url = scrapy.Field()   # stores the image URL
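
Declaring the fields up front means the item behaves like a dict with a fixed set of keys, which catches typos early. A minimal sketch with hypothetical values:

from Douyu.items import DouyuItem

item = DouyuItem()
item['name'] = 'some_streamer'                    # hypothetical value
item['img_url'] = 'http://example.com/cover.jpg'  # hypothetical value
# Assigning to a field that was never declared fails loudly instead of
# silently succeeding:
# item['nickname'] = '...'  # raises KeyError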

  3. Pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from Douyu.settings import IMAGES_STORE as images_store
import os
import scrapy


# Text storage
class DouyuPipeline(object):
    f = None

    def open_spider(self, spider):
        self.f = open('./douyu.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        name = item['name']
        img_url = item['img_url']
        self.f.write(name + ":" + img_url + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()


# Image storage
class DouyuImagesPipeline(ImagesPipeline):
    # Take the image URL from the item the spider passed in and request it.
    # Downloaded images are saved under IMAGES_STORE by default, which you
    # must set yourself in settings.py.
    def get_media_requests(self, item, info):
        img_url = item['img_url']
        yield scrapy.Request(img_url)

    # Rename the downloaded image
    def item_completed(self, results, item, info):
        # Standard pattern: results is a list of (success, file_info) tuples,
        # one per request; collect the paths of the successful downloads.
        img_path = [x['path'] for ok, x in results if ok]
        os.rename(images_store + img_path[0], images_store + item['name'] + '.jpg')
        return item
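
Renaming after the fact works, but as a side note, newer Scrapy versions (2.4+, where file_path receives the item) let you name the file directly and skip the os.rename step. A minimal alternative sketch, not part of the original project:

from scrapy.pipelines.images import ImagesPipeline
import scrapy


class DouyuImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_url'])

    def file_path(self, request, response=None, info=None, *, item=None):
        # Save each image directly as <nickname>.jpg under IMAGES_STORE
        return item['name'] + '.jpg'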

  4. Settings

# -*- coding: utf-8 -*-

# Scrapy settings for Douyu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Douyu'

SPIDER_MODULES = ['Douyu.spiders']
NEWSPIDER_MODULE = 'Douyu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuPipeline': 300,
    'Douyu.pipelines.DouyuImagesPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Image storage path. Setting names must be all uppercase and spelled exactly
# as Scrapy expects (IMAGES_STORE); the trailing slash matters because the
# pipeline above concatenates file names onto this string.
IMAGES_STORE = 'D:/scrapy/Douyu/imgs/'
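
With the four files in place, the crawl is started from the project root (the directory containing scrapy.cfg):

scrapy crawl douyu

Nicknames and image URLs end up in douyu.txt, and the renamed cover images under IMAGES_STORE.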
