scrapy框架基础

一、简介

Scrapy是一个为了爬取网站数据、提取结构性数据而编写的应用框架，非常出名，也非常强大。所谓框架，就是一个已经集成了各种功能（高性能异步下载、队列、分布式、解析、持久化等）且具有很强通用性的项目模板。学习框架时，重点是掌握框架的特性以及各个功能的用法。

二、下载安装

Linux:
- pip install scrapy
Windows:
- 1. pip3 install wheel
- 2. 下载twisted：http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
- 3.进入到下载目录，执行 pip3 install Twisted-18.9.0-cp36-cp36m-win_amd64.whl
- 4.pip3 install pywin32
- 5. pip3 install scrapy

三、简单使用

　　1.创建scrapy工程：scrapy startproject projectName （projectName 项目名）

　　2.创建爬虫文件：scrapy genspider spiderName www.xxx.com (爬虫文件的名称, 起始的url)

　　3.编写爬虫文件:在步骤2执行完毕后，会在项目的spiders中生成一个应用名的py爬虫文件，文件源码如下：

# -*- coding: utf-8 -*-

import scrapy

class FistSpider(scrapy.Spider):
    """Minimal spider generated by ``scrapy genspider``; it only prints each response."""

    # Name used on the command line: ``scrapy crawl fist``.
    name = 'fist'

    # Domain whitelist for outgoing requests. It is left commented out so
    # that no domain filtering is applied.
    # allowed_domains = ['www.xxx.com']

    # URLs requested when the crawl starts (URLs only).
    start_urls = ['https://baidu.com/']

    def parse(self, response):
        """Handle a downloaded page; here the response object is just printed."""
        label = "响应对象为："
        print(label, response)

4.设置修改settings.py配置文件相关配置:　

# 修改内容及其结果如下：

19行：USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' #伪装请求载体身份

22行：ROBOTSTXT_OBEY = False  #可以忽略或者不遵守robots协议

5. 运行爬虫文件

　　　　scrapy crawl 爬虫文件名

　　　　scrapy crawl 爬虫文件名 --nolog # 不打印日志运行

持久化存储

存储爬取到的糗事百科热点的内容与发布作者

1、基于终端指令的持久化存储：只可以将parse返回值进行本地文件的存储

class TwoSpider(scrapy.Spider):
    """Spider for the qiushibaike text board that returns its data from ``parse``.

    Persistence here is driven from the command line
    (``scrapy crawl qiubai -o qiubai.csv``), which stores whatever ``parse``
    returns.
    """

    name = 'qiubai'

    # Domain whitelist, disabled here.
    # allowed_domains = ['www.xxx.com']

    # URLs requested when the crawl starts (URLs only).
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        """Return a list of ``{'author', 'content'}`` dicts, one per post block."""
        records = []
        for post in response.xpath('//div[@id="content-left"]/div'):
            # extract_first() returns the first match directly; it is the
            # short form of ``xpath(...)[0].extract()``.
            writer = post.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # extract() returns the data of every matched selector as a list,
            # so the text fragments are joined into one string.
            fragments = post.xpath('./a/div/span//text()').extract()
            records.append({'author': writer, 'content': ''.join(fragments)})
        return records

scrapy crawl qiubai -o qiubai.csv # 运行qiubai爬虫文件并将返回内容存到qiubai.csv中

2.基于管道的持久化存储（通用性较强）

1.进行数据解析
2.在item类中定义相关的属性(为了存储解析到的数据)
3.在parse方法中实例化一个item类型的对象
4.将解析到的数据存储到item类型的对象中
5.使用yield item 将item对象提交给管道（process_item方法，提交一次调用一次）
6.在管道文件的process_item方法中接收item且对item中存储数据进行持久化存储
7.在settings配置文件中开启管道

    def parse(self, response):
        """Yield one ``FirstbloodItem`` per post block found in ``response``.

        Each yielded item is handed to the item pipelines, whose
        ``process_item`` method is called once per item.
        """
        div_list = response.xpath('//div[@id="content-left"]/div')

        for div in div_list:
            # extract_first() returns the first match directly; it is the
            # short form of ``xpath(...)[0].extract()``.
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()

            # extract() returns the data of every matched selector as a list,
            # so the text fragments are joined into one string.
            content = ''.join(div.xpath('./a/div/span//text()').extract())

            # Store the parsed values on a fresh item object.
            item = FirstbloodItem()
            item["author"] = author
            item["content"] = content

            # Submit the item to the pipelines.
            yield item

在item类中定义相关的属性 items.py

import scrapy

class FirstbloodItem(scrapy.Item):
    """Item holding the two values parsed for each post."""

    # Every key assigned on the item must be declared as a scrapy.Field().
    author = scrapy.Field()   # text taken from the post's author heading
    content = scrapy.Field()  # joined text of the post body

在管道文件的process_item方法中接收item且对item中存储数据进行持久化存储 pipelines.py :

可以存到磁盘，数据库中

import pymysql

from redis import Redis

class FirstbloodPipeline(object):
    """Item pipeline that writes every item to ``./qiubai.txt`` as ``author:content`` lines."""

    # File handle shared by all process_item() calls; set in open_spider().
    fp = None

    def open_spider(self, spider):
        """Open the output file for writing, truncating any previous content."""
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append one ``author:content`` line for ``item`` and return the item.

        Called once for every item the spider yields. The item is returned so
        that the next pipeline class receives it.
        """
        # extract_first() produces None when the XPath matched nothing, and
        # concatenating None would raise TypeError; write an empty field instead.
        author = item["author"] or ""
        content = item["content"] or ""

        self.fp.write(author + ":" + content + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        if self.fp is not None:
            self.fp.close()

class MysqlPileLine(object):
    """Item pipeline that inserts every item into the MySQL table ``qiubai``."""

    # Database connection, created in open_spider().
    conn = None

    # Cursor reused for all inserts; created on the first process_item() call.
    cusor = None

    def open_spider(self, spider):
        """Connect to the local MySQL server and print the connection object."""
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password="", db='spider')
        print(self.conn)

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it for the next pipeline class.

        A failed insert is printed and rolled back; the item is still returned.
        """
        # One cursor is enough for the whole crawl; creating a new one per
        # item left every earlier cursor unclosed.
        if self.cusor is None:
            self.cusor = self.conn.cursor()

        try:
            # Pass the values separately so the driver escapes them. Building
            # the statement with string formatting breaks on quotes in the
            # scraped text and allows SQL injection.
            self.cusor.execute(
                'insert into qiubai values(%s,%s)',
                (item['author'], item['content']),
            )
            self.conn.commit()
        except Exception as e:
            # Deliberately best-effort: one bad row must not stop the crawl.
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Release the cursor and the connection if they were created."""
        if self.cusor is not None:
            self.cusor.close()
        if self.conn is not None:
            self.conn.close()

class RedisPipeLine(object):
    """Item pipeline that pushes every item onto the Redis list ``qiubaiData``."""

    # Redis client, created in open_spider().
    conn = None

    def open_spider(self, spider):
        """Create the client for the local Redis server."""
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push ``item`` as a JSON string and return it for the next pipeline class."""
        import json  # local import keeps this block self-contained

        record = {
            'author': item['author'],
            'content': item['content'],
        }

        # redis-py 3.0+ accepts only bytes, str, int or float values and
        # raises DataError for a dict, so the record is serialized first.
        # ensure_ascii=False keeps non-ASCII text readable in Redis.
        self.conn.lpush('qiubaiData', json.dumps(record, ensure_ascii=False))

        # Without this return, the following pipeline classes receive None.
        return item

在settings配置文件中开启管道

# Enable the item pipelines. Each key is the import path of a pipeline class
# and must match the class name defined in pipelines.py exactly; the number
# is its priority (items pass through lower values first).
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,
    'firstBlood.pipelines.MysqlPileLine': 301,
    'firstBlood.pipelines.RedisPipeLine': 303,
}

使用scrapy爬取chouti的热点内容与发布者

（递归爬取解析多页页面数据）

import scrapy

from choutiPro.items import ChoutiproItem

class ChoutiSpider(scrapy.Spider):
    """Spider that walks the chouti "scoff/hot" listing page by page.

    ``parse`` yields one item per entry and then schedules the next page
    with itself as the callback, until ``maxPageNum`` has been requested.
    """

    name = 'chouti'

    # Domain whitelist, disabled here.
    # allowed_domains = ['www.xxx.com']

    start_urls = ['https://dig.chouti.com/r/scoff/hot/1']

    # URL template shared by all pages; %d is the page number.
    url = 'https://dig.chouti.com/r/scoff/hot/%d'

    # Number of the most recently requested page (page 1 comes from start_urls).
    pageNum = 1

    # Last page to request.
    maxPageNum = 120

    def parse(self, response):
        """Yield a ``ChoutiproItem`` per entry, then a request for the next page."""
        for div in response.xpath('//div[@id="content-list"]/div'):
            item = ChoutiproItem()
            item['author'] = div.xpath('./div[2]/div[2]/a[4]/b/text()').extract_first()
            item['content'] = div.xpath('./div[2]/div[1]/a/text()').extract_first()
            yield item

        # Send the follow-up request manually. The comparison is strict
        # because pageNum is incremented before use: with "<=" the spider
        # went one page past the limit (page 121).
        if self.pageNum < self.maxPageNum:
            self.pageNum += 1
            yield scrapy.Request(url=self.url % self.pageNum, callback=self.parse)