scrapy爬虫

a. 配置文件

#settings.py

DEPTH_LIMIT = 1    			#指定“递归”的层数

ROBOTSTXT_OBEY = False		#对方网站规定哪些网址可以爬，这个选项表示不遵循此规定

b. 选择器

.//  		#表示对象的子孙中

./    		#儿子

./div 		#儿子中的div标签

./div[@id='i1']		#儿子中的div标签且id='i1'

obj.extract()		#列表中每一个对象转换字符串 => []

obj.extract_first()	#列表中的每一个对象转换字符串 => 取列表第一个元素（没有匹配时返回None）

//div/text()            #获取某个标签的文本

#!/usr/bin/env python

# -*- coding:utf-8 -*-

from scrapy.selector import Selector, HtmlXPathSelector

from scrapy.http import HtmlResponse

# Sample HTML document used as the parsing target for the XPath examples
# that follow: a <ul> with three <li>/<a> entries (two of the links carry
# an id, one contains a <span>) and a <div> with one more link.
html = """<!DOCTYPE html>

<html>

    <head lang="en">

        <meta charset="UTF-8">

        <title></title>

    </head>

    <body>

        <ul>

            <li class="item-"><a id='i1' href="link.html">first item</a></li>

            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>

            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>

        </ul>

        <div><a href="llink2.html">second item</a></div>

    </body>

</html>

"""

# Wrap the markup in an HtmlResponse so it can be handed to
# Selector(response=...) without performing a real download.
response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')

# hxs = HtmlXPathSelector(response)

# print(hxs)

# hxs = Selector(response=response).xpath('//a')

# print(hxs)

# hxs = Selector(response=response).xpath('//a[2]')

# print(hxs)

# hxs = Selector(response=response).xpath('//a[@id]')

# print(hxs)

# hxs = Selector(response=response).xpath('//a[@id="i1"]')

# print(hxs)

# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')

# print(hxs)

# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')

# print(hxs)

# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')

# print(hxs)

# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]')

# print(hxs)

# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/text()').extract()

# print(hxs)

# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/@href').extract()

# print(hxs)

# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()

# print(hxs)

# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()

# print(hxs)

# ul_list = Selector(response=response).xpath('//body/ul/li')

# for item in ul_list:

#     v = item.xpath('./a/span')

#     # 或

#     # v = item.xpath('a/span')

#     # 或

#     # v = item.xpath('*/span')    # 本例中结果相同；注意 '*/a/span' 多了一层（匹配 li/*/a/span），与 './a/span' 不等价

#     print(v)

c. 结构化处理

settings.py

# Enable the Day96Pipeline item pipeline. The integer is the pipeline's
# ordering value in Scrapy's ITEM_PIPELINES convention.
ITEM_PIPELINES = {

   'day96.pipelines.Day96Pipeline': 300,

}

# Custom setting read by Day96Pipeline.from_crawler through
# crawler.settings.get('DB'); the pipeline passes the value to open() as
# the path of the file it appends items to.
DB = "....."

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exceptions import DropItem

class Day96Pipeline(object):
    """Item pipeline that appends each item's title and href to a text file.

    The file path is taken from the ``DB`` setting (see ``from_crawler``).
    The file is opened when the spider starts and closed when it finishes.
    """

    def __init__(self, conn_str):
        """Remember where items will be written.

        :param conn_str: path of the output file.
        """
        self.conn_str = conn_str
        # File handle; stays None until open_spider() runs.
        self.conn = None

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline object from the crawler's settings.

        :param crawler: object exposing ``settings.get``.
        :return: a pipeline configured with the value of the ``DB`` setting.
        """
        conn_str = crawler.settings.get('DB')
        return cls(conn_str)

    def open_spider(self, spider):
        """Open the output file in append mode when the spider starts.

        :param spider: the spider being opened (unused).
        """
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII titles and would raise UnicodeEncodeError.
        self.conn = open(self.conn_str, 'a', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Safe to call even if ``open_spider`` never ran or failed.

        :param spider: the spider being closed (unused).
        """
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
            self.conn = None

    def process_item(self, item, spider):
        """Append one item to the output file.

        :param item: mapping with ``'title'`` and ``'href'`` keys.
        :param spider: the spider that produced the item (unused).
        :return: the same item, so the next pipeline can process it.
        :raises KeyError: if ``item`` lacks ``'title'`` or ``'href'``.
        """
        # A per-spider filter could go here, e.g. ``if spider.name == 'chouti'``.
        tpl = "%s\n%s\n\n" % (item['title'], item['href'])
        self.conn.write(tpl)
        # Returning the item hands it to the next pipeline; to discard it
        # instead, raise scrapy.exceptions.DropItem().
        return item

pipelines.py

d. 常用命令

scrapy startproject sp1

cd sp1

scrapy genspider baidu baidu.com      #创建爬虫

scrapy crawl baidu

scrapy crawl baidu --nolog

e. 目录结构

sp1

	- scrapy.cfg		 #初始配置文件

	- sp1

		- spiders		 #目录

		- items.py		 #格式化

		- pipelines.py	 #持久化

		- middlewares.py #中间件

		- settings.py    #配置

示例

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that prints the title text of every item on the start page."""

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        """Print the stripped title of each item in the content list.

        :param response: the downloaded page to extract titles from.
        """
        items = Selector(response=response).xpath(
            '//div[@id="content-list"]/div[@class="item"]')
        for obj in items:
            title = obj.xpath(
                './/a[@class="show-content color-chag"]/text()').extract_first()
            # extract_first() returns None when the item has no matching
            # link; skip it instead of failing on None.strip().
            if title is None:
                continue
            print(title.strip())

获取抽屉新闻标题

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that prints the pagination links found on the start page."""

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    # MD5 digests of the hrefs that have already been printed.
    visited_urls = set()

    def parse(self, response):
        """Print every page-number href, flagging the ones seen before.

        :param response: the downloaded page containing the pager.
        """
        page_links = Selector(response=response).xpath(
            '//div[@id="dig_lcpage"]//a/@href').extract()
        for url in page_links:
            digest = self.md5(url)
            if digest not in self.visited_urls:
                # First time this href shows up: record it and print it.
                self.visited_urls.add(digest)
                print(url)
                continue
            print('已经存在',url)

    def md5(self,url):
        """Return the hexadecimal MD5 digest of ``url`` encoded as UTF-8."""
        import hashlib
        return hashlib.md5(bytes(url,encoding='utf-8')).hexdigest()

获取抽屉当前页的所有页码

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.http import Request

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that follows every pagination link, visiting each href once."""

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    # MD5 digests of the hrefs that have already been scheduled.
    visited_urls = set()

    def parse(self, response):
        """Schedule a request for each page-number link not seen before.

        :param response: the downloaded page containing the pager.
        :return: generator of ``Request`` objects whose callback is this
            same method, so pagination is followed recursively.
        """
        hrefs = Selector(response=response).xpath(
            '//div[@id="dig_lcpage"]//a/@href').extract()
        for href in hrefs:
            md5_url = self.md5(href)
            if md5_url in self.visited_urls:
                continue
            print(href)
            self.visited_urls.add(md5_url)
            # Resolve the href against the page URL instead of gluing it to
            # a hard-coded host, so absolute and relative hrefs both work.
            url = response.urljoin(href)
            # Hand the new URL to the scheduler.
            yield Request(url=url, callback=self.parse)

    def md5(self, url):
        """Return the hexadecimal MD5 digest of ``url`` encoded as UTF-8."""
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()

获取抽屉所有页码

a. 避免重复的url

settings.py

DUPEFILTER_CLASS = "day96.duplication.RepeatFilter"

class RepeatFilter(object):
    """Duplicate filter that remembers the URL of every request it is shown."""

    def __init__(self):
        # URLs of the requests already passed to request_seen().
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        """Build a filter instance; ``settings`` is accepted but not read."""
        print('...')
        return cls()

    def request_seen(self, request):
        """Report whether ``request.url`` was seen before, recording it if not.

        :param request: object with a ``url`` attribute.
        :return: ``True`` for a repeated URL, ``False`` the first time.
        """
        url = request.url
        if url not in self.visited_set:
            self.visited_set.add(url)
            return False
        return True

    def open(self):  # can return deferred
        """Hook called when the filter is opened; only prints a marker."""
        print('open')

    def close(self, reason):  # can return a deferred
        """Hook called when the filter is closed; only prints a marker."""
        print('close')

    def log(self, request, spider):  # log that a request has been filtered
        """Hook for reporting a filtered request; intentionally does nothing."""

duplication.py

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.http import Request

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that follows every pagination link it finds.

    It does no de-duplication of its own; every link is yielded as a
    request.
    """

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    # Scrapy's built-in duplicate filter, imported for reference. The module
    # was renamed from ``scrapy.dupefilter`` to ``scrapy.dupefilters`` in
    # Scrapy 1.0.
    from scrapy.dupefilters import RFPDupeFilter

    def parse(self, response):
        """Print the page URL and schedule a request for each pager link.

        :param response: the downloaded page containing the pager.
        :return: generator of ``Request`` objects whose callback is this
            same method.
        """
        print(response.url)
        hrefs = Selector(response=response).xpath(
            '//div[@id="dig_lcpage"]//a/@href').extract()
        for href in hrefs:
            # Resolve the href against the page URL instead of gluing it to
            # a hard-coded host, so absolute and relative hrefs both work.
            url = response.urljoin(href)
            # Hand the new URL to the scheduler.
            yield Request(url=url, callback=self.parse)

    def md5(self, url):
        """Return the hexadecimal MD5 digest of ``url`` encoded as UTF-8."""
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()

chouti.py

116

传智播客

#爬传智播客的老师的名称

#scrapy startproject mySpider

#cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/items.py

    import scrapy

    class MyspiderItem(scrapy.Item):
        """Scrapy item declaring three fields: ``name``, ``title`` and ``info``."""

        name = scrapy.Field()

        title = scrapy.Field()

        info = scrapy.Field()

#cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/spiders/itcastspider.py

    import scrapy

    # items.py in this project declares MyspiderItem; there is no ItcastItem.
    from mySpider.items import MyspiderItem


    class ItcastSpider(scrapy.Spider):
        """Spider that collects name, title and info for each teacher node."""

        # Spider name, used as ``scrapy crawl itcast``.
        name = "itcast"

        # Domains the spider is allowed to crawl. Scrapy reads the attribute
        # ``allowed_domains`` and expects bare domain names, not URLs.
        allowed_domains = ["www.itcast.cn"]

        # URL the crawl starts from.
        start_urls = ["http://www.itcast.cn/channel/teacher.shtml#"]

        def parse(self, response):
            """Build one item per ``<div class="li_txt">`` node.

            :param response: the downloaded teacher listing page.
            :return: list of ``MyspiderItem`` objects. A field is ``None``
                when its node has no matching text.
            """
            # Root nodes, one per teacher, matched with the built-in XPath
            # support of the response.
            teacher_list = response.xpath('//div[@class="li_txt"]')
            teacherItem = []

            for each in teacher_list:
                item = MyspiderItem()

                name = each.xpath('./h3/text()').extract()
                title = each.xpath('./h4/text()').extract()
                info = each.xpath('./p/text()').extract()

                print("--------------", type(name))

                # extract() returns a possibly empty list; guard the index
                # so a missing tag does not abort the whole page.
                item['name'] = name[0] if name else None
                item['title'] = title[0] if title else None
                item['info'] = info[0] if info else None

                teacherItem.append(item)

            return teacherItem

#保存到json文件

scrapy crawl itcast -o itcast.json

#保存到csv文件

scrapy crawl itcast -o itcast.csv

爬传智播客的老师的名称

沛齐

scrapy爬虫的更多相关文章

scrapy爬虫结果插入mysql数据库
1.通过工具创建数据库scrapy
Python之Scrapy爬虫框架安装及简单使用
题记:早已听闻python爬虫框架的大名.近些天学习了下其中的Scrapy爬虫框架,将自己理解的跟大家分享.有表述不当之处,望大神们斧正. 一.初窥Scrapy Scrapy是一个为了爬取网站数据,提 ...
Linux搭建Scrapy爬虫集成开发环境
安装Python 下载地址:http://www.python.org/, Python 有 Python 2 和 Python 3 两个版本, 语法有些区别,ubuntu上自带了python2.7. ...
Scrapy 爬虫
Scrapy 爬虫使用指南完全教程 scrapy note command 全局命令: startproject :在 project_name 文件夹下创建一个名为 project_name ...
[Python爬虫] scrapy爬虫系列 <一>.安装及入门介绍
前面介绍了很多Selenium基于自动测试的Python爬虫程序,主要利用它的xpath语句,通过分析网页DOM树结构进行爬取内容,同时可以结合Phantomjs模拟浏览器进行鼠标或键盘操作.但是,更 ...
同时运行多个scrapy爬虫的几种方法（自定义scrapy项目命令）
试想一下,前面做的实验和例子都只有一个spider.然而,现实的开发的爬虫肯定不止一个.既然这样,那么就会有如下几个问题:1.在同一个项目中怎么创建多个爬虫的呢?2.多个爬虫的时候是怎么将他们运行起来 ...
如何让你的scrapy爬虫不再被ban之二（利用第三方平台crawlera做scrapy爬虫防屏蔽）
我们在做scrapy爬虫的时候,爬虫经常被ban是常态.然而前面的文章如何让你的scrapy爬虫不再被ban,介绍了scrapy爬虫防屏蔽的各种策略组合.前面采用的是禁用cookies.动态设置use ...
如何让你的scrapy爬虫不再被ban
前面用scrapy编写爬虫抓取了自己博客的内容并保存成json格式的数据(scrapy爬虫成长日记之创建工程-抽取数据-保存为json格式的数据)和写入数据库(scrapy爬虫成长日记之将抓取内容写入 ...
scrapy爬虫成长日记之将抓取内容写入mysql数据库
前面小试了一下scrapy抓取博客园的博客(您可在此查看scrapy爬虫成长日记之创建工程-抽取数据-保存为json格式的数据),但是前面抓取的数据时保存为json格式的文本文件中的.这很显然不满足我 ...
【图文详解】scrapy爬虫与动态页面——爬取拉勾网职位信息（2）
上次挖了一个坑,今天终于填上了,还记得之前我们做的拉勾爬虫吗?那时我们实现了一页的爬取,今天让我们再接再厉,实现多页爬取,顺便实现职位和公司的关键词搜索功能. 之前的内容就不再介绍了,不熟悉的请一定要 ...

随机推荐

[JLOI2012]时间流逝树上高斯消元概率期望
题面题意:(感觉题面写的题意是错的?)有\(n\)种能量不同的圈,设当前拥有的圈的集合为\(S\),则: 1,每天有\(p\)概率失去一个能量最小的圈.特别的,如果\(S = \varnothing ...
使用Hexo写博客
首先,你需要搭建一个Hexo博客网站- 使用Hexo搭建GitHub博客(2018年Mac版) 原生方式新建文章 Hexo的项目结构是在网站根目录的source/_posts目录下存放你的博客文档,以 ...
C#调用GDI+1.1中的函数实现高斯模糊、USM锐化等经典效果。
http://www.cnblogs.com/Imageshop/archive/2012/12/13/2815712.html 在GDI+1.1的版本中,MS加入不少新的特性,其中的特效类Effec ...
Ubuntu在vncviewer下Tab键失效
打开命令行,运行如下命令即可解决: xfconf-query -c xfce4-keyboard-shortcuts -p /xfwm4/custom/'<'Super'>'Tab -r ...
mybatis基础犯错总结
1.关于mybatis的文件一般都是其mapper文件出错: 首先关于输入参数parameterType出错: (1)基本数据类型:如果输入参数只有一个,其数据类型可以是基本数据类型,也可以是自己定的 ...
C语言基本类型的字节数
OpenCV---Numpy数组的使用以及创建图片
一:对头像的所有像素进行访问,并UI图像进行像素取反 (一)for循环取反 import cv2 as cv import numpy as np def access_pixels(image): ...
vue-router的link样式设置问题
发现router-link添加上去后文字上会出现下划线,打开调试工具发现router-link其实是由a来实现的,在reset的时候 a { text-decoraction: none; } 至于点 ...
树形dp的进阶（一）
①树的重心的性质的运用 ②缩点以后寻找规律树的直径! ③树形dp上的公式转换 ④和期望有关的树形dp + 一点排列组合的知识 ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ 一:Codeforces Round #364 ...
Flexbox兼容性
.flex-container{ display: -webkit-box; /* Chrome 4+, Safari 3.1, iOS Safari 3.2+ */ display: -moz-bo ...

scrapy爬虫

事例

scrapy爬虫的更多相关文章

随机推荐

热门专题