scrapy爬虫

a. 配置文件

#settings.py

DEPTH_LIMIT = 1    			# number of "recursion" levels (link depth) the crawl may follow

ROBOTSTXT_OBEY = False		# robots.txt is where a site states which URLs may be crawled; False means those rules are not obeyed

b. 选择器

.//  		#表示对象的子孙中

./    		#儿子

./div 		#儿子中的div标签

./div[@id='i1']		#儿子中的div标签且id='i1'

obj.extract()		#列表中每一个对象转换字符串 => []

obj.extract_first()	#列表中的每一个对象转换字符串 => 列表第一个元素

//div/text()            #获取某个标签的文本

#!/usr/bin/env python

# -*- coding:utf-8 -*-

from scrapy.selector import Selector, HtmlXPathSelector

from scrapy.http import HtmlResponse

# Sample document queried by the XPath examples below.
html = """<!DOCTYPE html>

<html>

    <head lang="en">

        <meta charset="UTF-8">

        <title></title>

    </head>

    <body>

        <ul>

            <li class="item-"><a id='i1' href="link.html">first item</a></li>

            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>

            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>

        </ul>

        <div><a href="llink2.html">second item</a></div>

    </body>

</html>

"""

# Wrap the markup in an HtmlResponse so the examples can build a Selector from it.
response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')

# Every snippet below is an independent, commented-out example;
# uncomment one at a time to see what the expression selects.

# Alternative selector class imported above; the remaining examples use Selector.
# hxs = HtmlXPathSelector(response)
# print(hxs)

# Every <a> element anywhere in the document.
# hxs = Selector(response=response).xpath('//a')
# print(hxs)

# <a> elements that are the second <a> child of their own parent.
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)

# <a> elements that carry an id attribute.
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)

# <a> elements whose id equals "i1".
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)

# Several predicates in a row must all hold (href AND id).
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)

# <a> elements whose href contains the substring "link".
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)

# <a> elements whose href starts with "link".
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)

# <a> elements whose id matches the regular expression i\d+.
# NOTE(review): "\d" sits in a non-raw string here; when uncommenting,
# prefer a raw string (r'...') so the backslash is passed through as written.
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)

# Same match, then the text nodes of those links, extracted as a list of strings.
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)

# Same match, then the href attribute values, extracted as a list of strings.
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)

# Absolute path from the document root down to each link's href.
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)

# extract_first() gives only the first matching href instead of a list.
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# Relative queries: select each <li>, then query from that node.
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # NOTE(review): '*/a/span' first steps into any child element of the <li>,
#     # i.e. it looks for li/*/a/span, so it does not look equivalent to the
#     # two forms above — confirm the intent before relying on it.
#     # v = item.xpath('*/a/span')
#     print(v)

c. 结构化处理

settings.py

# Enables Day96Pipeline, referenced by its dotted path.
# NOTE(review): 300 is presumably the pipeline's ordering value — confirm against the Scrapy docs.
ITEM_PIPELINES = {

   'day96.pipelines.Day96Pipeline': 300,

}

# Read by Day96Pipeline.from_crawler through crawler.settings.get('DB');
# the pipeline hands the value to open(), so it is used as the output file path.
DB = "....."

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exceptions import DropItem

class Day96Pipeline(object):
    """Item pipeline that appends every item's title and href to a text file.

    The output path is taken from the ``DB`` setting (see ``from_crawler``).
    """

    def __init__(self,conn_str):
        """Remember where the items should be written.

        :param conn_str: path of the file that items are appended to.
        """
        self.conn_str = conn_str
        # File handle created by open_spider(); None while no file is open.
        self.conn = None

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline object from the crawler's settings.

        :param crawler: object whose ``settings.get('DB')`` yields the output path.
        :return: a new pipeline instance bound to that path.
        """
        conn_str = crawler.settings.get('DB')
        return cls(conn_str)

    def open_spider(self,spider):
        """Open the output file in append mode when the spider starts.

        :param spider: the spider being opened (unused).
        :return: None
        """
        # Explicit UTF-8 so non-ASCII titles are written the same way on every
        # platform instead of depending on the locale's default encoding.
        self.conn = open(self.conn_str, 'a', encoding='utf-8')

    def close_spider(self,spider):
        """Close the output file when the spider finishes.

        Safe to call even if ``open_spider`` never ran.

        :param spider: the spider being closed (unused).
        :return: None
        """
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def process_item(self, item, spider):
        """Persist one item as ``title``, ``href`` and a blank line.

        :param item: mapping with ``'title'`` and ``'href'`` keys.
        :param spider: the spider that produced the item (unused).
        :return: the same item, so the next pipeline can process it.
        :raises KeyError: if ``item`` lacks ``'title'`` or ``'href'``.
        """
        # if spider.name == 'chouti'
        tpl = "%s\n%s\n\n" %(item['title'],item['href'])
        self.conn.write(tpl)

        # Returning the item hands it to the next pipeline.
        return item
        # To discard the item instead of passing it on:
        # raise DropItem()

pipelines.py

d. 常用命令

scrapy startproject sp1

cd sp1

scrapy genspider baidu baidu.com      #创建爬虫

scrapy crawl baidu

scrapy crawl baidu --nolog

e. 目录结构

sp1

	- scrapy.cfg		 #初始配置文件

	- sp1

		- spiders		 #目录

		- items.py		 #格式化

		- pipelines.py	 #持久化

		- middlewares.py #中间件

		- settings.py    #配置

示例

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that prints the headline of each news entry on the start page."""

    name = 'chouti'

    allowed_domains = ['chouti.com']

    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        """Print the stripped title text of every news entry in *response*.

        :param response: the downloaded page for one of ``start_urls``.
        :return: None
        """
        # One selector per news entry inside the content list.
        entries = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')

        for entry in entries:
            title = entry.xpath('.//a[@class="show-content color-chag"]/text()').extract_first()

            # extract_first() gives None when the entry has no matching link;
            # skip such entries instead of failing on None.strip().
            if title is None:
                continue

            print(title.strip())

获取抽屉新闻标题

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that prints the pagination links found on the start page."""

    name = 'chouti'

    allowed_domains = ['chouti.com']

    start_urls = ['http://dig.chouti.com/']

    # MD5 digests of the hrefs that have already been printed.
    visited_urls = set()

    def parse(self, response):
        """Print each pagination href once; report repeats as already seen.

        :param response: the downloaded page for one of ``start_urls``.
        :return: None
        """
        # hrefs of all page-number links on the current page
        page_links = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()

        for link in page_links:
            digest = self.md5(link)

            if digest not in self.visited_urls:
                self.visited_urls.add(digest)
                print(link)
                continue

            print('已经存在',link)

    def md5(self,url):
        """Return the hexadecimal MD5 digest of *url* encoded as UTF-8.

        :param url: text to hash.
        :return: 32-character hex string.
        """
        import hashlib

        return hashlib.md5(url.encode('utf-8')).hexdigest()

获取抽屉当前页的所有页码

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.http import Request

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that follows the pagination links to visit every listing page."""

    name = 'chouti'

    allowed_domains = ['chouti.com']

    start_urls = ['http://dig.chouti.com/']

    # MD5 digests of the hrefs that have already been scheduled.
    visited_urls = set()

    def parse(self, response):
        """Schedule a request for every pagination href not seen before.

        :param response: a downloaded listing page.
        :return: generator of ``Request`` objects whose callback is this method.
        """
        # hrefs of all page-number links on the current page
        page_links = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()

        for href in page_links:
            digest = self.md5(href)

            if digest in self.visited_urls:
                continue

            print(href)
            self.visited_urls.add(digest)

            # Resolve the href against the page it came from rather than
            # gluing it onto a hard-coded host, so absolute hrefs also
            # produce a valid URL.
            next_url = response.urljoin(href)

            # Hand the new URL to the scheduler.
            yield Request(url=next_url,callback=self.parse)

    def md5(self,url):
        """Return the hexadecimal MD5 digest of *url* encoded as UTF-8.

        :param url: text to hash.
        :return: 32-character hex string.
        """
        import hashlib

        obj = hashlib.md5()
        obj.update(bytes(url,encoding='utf-8'))
        return obj.hexdigest()

获取抽屉所有页码

a. 避免重复的url

settings.py

# Dotted path of the duplicate-filter class to use: RepeatFilter from day96/duplication.py.
DUPEFILTER_CLASS = "day96.duplication.RepeatFilter"

class RepeatFilter(object):
    """Duplicate filter that remembers every request URL it has been shown."""

    def __init__(self):
        # URLs seen so far, held in memory for the lifetime of the filter.
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor; *settings* is accepted but not consulted.

        :param settings: settings object (unused).
        :return: a new, empty filter.
        """
        print('...')
        instance = cls()
        return instance

    def request_seen(self, request):
        """Report whether ``request.url`` was seen before, recording it if not.

        :param request: object exposing a ``url`` attribute.
        :return: True for a repeated URL, False the first time it is seen.
        """
        already_seen = request.url in self.visited_set
        if not already_seen:
            self.visited_set.add(request.url)
        return already_seen

    def open(self):  # can return deferred
        """Print ``open``; no other setup is performed."""
        print('open')

    def close(self, reason):  # can return a deferred
        """Print ``close``; *reason* is not used."""
        print('close')

    def log(self, request, spider):  # log that a request has been filtered
        """Do nothing; filtered requests are not logged."""
        # print('log....')
        return None

duplication.py

# -*- coding: utf-8 -*-

import scrapy

import sys

import io

from scrapy.http import Request

from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    """Spider that follows every pagination link without de-duplicating itself.

    ``parse`` yields a request for each href it finds and keeps no record of
    visited URLs.
    """

    name = 'chouti'

    allowed_domains = ['chouti.com']

    start_urls = ['http://dig.chouti.com/']

    # Scrapy's stock duplicate filter, imported for reference only. The module
    # is ``scrapy.dupefilters`` (plural); the old singular path no longer exists.
    from scrapy.dupefilters import RFPDupeFilter

    def parse(self, response):
        """Print the page URL and schedule a request for each pagination href.

        :param response: a downloaded listing page.
        :return: generator of ``Request`` objects whose callback is this method.
        """
        print(response.url)

        # hrefs of all page-number links on the current page
        page_links = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()

        for href in page_links:
            # Resolve against the current page instead of a hard-coded host so
            # absolute hrefs also yield a valid URL.
            next_url = response.urljoin(href)

            # Hand the new URL to the scheduler.
            yield Request(url=next_url,callback=self.parse)

    def md5(self,url):
        """Return the hexadecimal MD5 digest of *url* encoded as UTF-8.

        Not called by ``parse`` in this version.

        :param url: text to hash.
        :return: 32-character hex string.
        """
        import hashlib

        obj = hashlib.md5()
        obj.update(bytes(url,encoding='utf-8'))
        return obj.hexdigest()

chouti.py

116

传智播客

#爬传智播客的老师的名称

#scrapy startproject mySpider

#cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/items.py

    import scrapy

    class MyspiderItem(scrapy.Item):
        """Item holding three text fields for one scraped record: name, title and info."""

        # Declared field: name
        name = scrapy.Field()

        # Declared field: title
        title = scrapy.Field()

        # Declared field: info
        info = scrapy.Field()

#cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/spiders/itcastspider.py

    import scrapy

    # items.py defines MyspiderItem, so import that class name.
    from mySpider.items import MyspiderItem

    class ItcastSpider(scrapy.Spider):
        """Spider that collects name, title and info for each teacher block on the start page."""

        # Spider name, used as ``scrapy crawl itcast``.
        name = "itcast"

        # Domains the spider is allowed to crawl: bare domain names, no scheme.
        allowed_domains = ["itcast.cn"]

        # URL the crawl starts from.
        start_urls = ["http://www.itcast.cn/channel/teacher.shtml#"]

        def parse(self, response):
            """Build one item per ``div.li_txt`` node in *response*.

            :param response: the downloaded teacher-list page.
            :return: list of ``MyspiderItem`` objects; a field is ``None`` when
                its element is missing from the node.
            """
            teacher_items = []

            # Each div.li_txt is the root node of one teacher entry.
            for node in response.xpath('//div[@class="li_txt"]'):
                item = MyspiderItem()

                # extract_first() yields None rather than raising IndexError
                # when a node lacks the expected h3/h4/p text.
                item['name'] = node.xpath('./h3/text()').extract_first()
                item['title'] = node.xpath('./h4/text()').extract_first()
                item['info'] = node.xpath('./p/text()').extract_first()

                teacher_items.append(item)

            return teacher_items

#保存到json文件

scrapy crawl itcast -o itcast.json

#保存到csv文件

scrapy crawl itcast -o itcast.csv

爬传智播客的老师的名称

沛齐

scrapy爬虫的更多相关文章

scrapy爬虫结果插入mysql数据库
1.通过工具创建数据库scrapy
Python之Scrapy爬虫框架安装及简单使用
题记:早已听闻python爬虫框架的大名.近些天学习了下其中的Scrapy爬虫框架,将自己理解的跟大家分享.有表述不当之处,望大神们斧正. 一.初窥Scrapy Scrapy是一个为了爬取网站数据,提 ...
Linux搭建Scrapy爬虫集成开发环境
安装Python 下载地址:http://www.python.org/, Python 有 Python 2 和 Python 3 两个版本, 语法有些区别,ubuntu上自带了python2.7. ...
Scrapy 爬虫
Scrapy 爬虫使用指南完全教程 scrapy note command 全局命令: startproject :在 project_name 文件夹下创建一个名为 project_name ...
[Python爬虫] scrapy爬虫系列 <一>.安装及入门介绍
前面介绍了很多Selenium基于自动测试的Python爬虫程序,主要利用它的xpath语句,通过分析网页DOM树结构进行爬取内容,同时可以结合Phantomjs模拟浏览器进行鼠标或键盘操作.但是,更 ...
同时运行多个scrapy爬虫的几种方法（自定义scrapy项目命令）
试想一下,前面做的实验和例子都只有一个spider.然而,现实的开发的爬虫肯定不止一个.既然这样,那么就会有如下几个问题:1.在同一个项目中怎么创建多个爬虫的呢?2.多个爬虫的时候是怎么将他们运行起来 ...
如何让你的scrapy爬虫不再被ban之二（利用第三方平台crawlera做scrapy爬虫防屏蔽）
我们在做scrapy爬虫的时候,爬虫经常被ban是常态.然而前面的文章如何让你的scrapy爬虫不再被ban,介绍了scrapy爬虫防屏蔽的各种策略组合.前面采用的是禁用cookies.动态设置use ...
如何让你的scrapy爬虫不再被ban
前面用scrapy编写爬虫抓取了自己博客的内容并保存成json格式的数据(scrapy爬虫成长日记之创建工程-抽取数据-保存为json格式的数据)和写入数据库(scrapy爬虫成长日记之将抓取内容写入 ...
scrapy爬虫成长日记之将抓取内容写入mysql数据库
前面小试了一下scrapy抓取博客园的博客(您可在此查看scrapy爬虫成长日记之创建工程-抽取数据-保存为json格式的数据),但是前面抓取的数据时保存为json格式的文本文件中的.这很显然不满足我 ...
【图文详解】scrapy爬虫与动态页面——爬取拉勾网职位信息（2）
上次挖了一个坑,今天终于填上了,还记得之前我们做的拉勾爬虫吗?那时我们实现了一页的爬取,今天让我们再接再厉,实现多页爬取,顺便实现职位和公司的关键词搜索功能. 之前的内容就不再介绍了,不熟悉的请一定要 ...

随机推荐

HPP注入详解
###HPP参数污染的定义 HTTP Parameter Pollution简称HPP,所以有的人也称之为“HPP参数污染”,HPP是一种注入型的漏洞,攻击者通过在HTTP请求中插入特定的参数来发 ...
IntelJ 快捷键
1.在IntelJ中和Eclipse中稍有不同,在Eclipse中,输入main再按Alt+/即可自动补全main函数,但是在IntellJ中则是输入psvm,选中即可 2.在方法体内部有for循环, ...
最近遇到的DISCUZ一些问题解决方法
“抱歉,您的请求来路不正确或表单验证串不符,无法提交” 打开“source\class\helper\helper_form.php”, 然后把“$_GET[‘formhash’] == formha ...
Servlet3.0 新特性
Servlet3.0 的注解 Servlet 允许开发人员采用注解的方式来配置 Servlet.Filter.Listener. Servlet3.0 规范在 javax.servlet.annota ...
Lua弱表Weak table
定义:弱表的使用就是使用弱引用,很多程度上是对内存的控制. 1.weak表示一个表,它拥有metatable,并且metatable定义了__mode字段. 2.弱引用不会导致对象的引用计数变化.换言 ...
jinja2 中的 Template 批量替换json字符串中的内容
项目中用到elasticsearch,使用Json格式查询方式,一个查询语句中有好几个地方需要替换,且替换的值都相同.最开始把json转为字符串发方式,利用format函数处理,发现再转回json时无 ...
Java常量池详解之Integer缓存
一个Java question,求输出结果 public class IntegerTest { public static void main(String[] args) { objPoolT ...
horizon源码分析(二)
源码版本:H版一.简要回顾对于请求: 地址:/dashboard/admin/instances/ 方式:POST 参数: instances_filter_q: action:instances ...
Python学习笔记（三十九）— 内置模块（8）XML基础
摘抄自:https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001432002075 ...
CentOS部署.NetCore服务
1. 安装CentOs,可使用最小安装包镜像:http://isoredirect.centos.org/centos/7/isos/x86_64/CentOS-7-x86_64-Minimal-17 ...

scrapy爬虫

事例

scrapy爬虫的更多相关文章

随机推荐

热门专题