18.scrapy_maitian

ershoufang.py

# -*- coding: utf-8 -*-

import scrapy

class ErshoufangSpider(scrapy.Spider):

    name = 'ershoufang'

    allowed_domains = ['maitian.com']

    start_urls = ['http://maitian.com/']

    def parse(self, response):

        pass

zufang_spider.py

import scrapy

from maitian.items import MaitianItem

class MaitianSpider(scrapy.Spider):

    name = "zufang"

    start_urls = ['http://bj.maitian.cn/zfall/PG1']

    def parse(self, response):

        for zufang_itme in response.xpath('//div[@class="list_title"]'):

            yield {

                'title': zufang_itme.xpath('./h1/a/text()').extract_first().strip(),

                'price': zufang_itme.xpath('./div[@class="the_price"]/ol/strong/span/text()').extract_first().strip(),

                'area': zufang_itme.xpath('./p/span/text()').extract_first().replace('㎡', '').strip(),

                'district': zufang_itme.xpath('./p//text()').re(r'昌平|朝阳|东城|大兴|丰台|海淀|石景山|顺义|通州|西城')[0],

            }

        next_page_url = response.xpath(

            '//div[@id="paging"]/a[@class="down_page"]/@href').extract_first()

        if next_page_url is not None:

            yield scrapy.Request(response.urljoin(next_page_url))

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

# See documentation in:

# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class MaitianItem(scrapy.Item):

    # define the fields for your item here like:

    # name = scrapy.Field()

    title = scrapy.Field()

    price = scrapy.Field()

    area = scrapy.Field()

    district = scrapy.Field()

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware

#

# See documentation in:

# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

class MaitianSpiderMiddleware(object):

    # Not all methods need to be defined. If a method is not defined,

    # scrapy acts as if the spider middleware does not modify the

    # passed objects.

    @classmethod

    def from_crawler(cls, crawler):

        # This method is used by Scrapy to create your spiders.

        s = cls()

        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        return s

    def process_spider_input(self, response, spider):

        # Called for each response that goes through the spider

        # middleware and into the spider.

        # Should return None or raise an exception.

        return None

    def process_spider_output(self, response, result, spider):

        # Called with the results returned from the Spider, after

        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.

        for i in result:

            yield i

    def process_spider_exception(self, response, exception, spider):

        # Called when a spider or process_spider_input() method

        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict

        # or Item objects.

        pass

    def process_start_requests(self, start_requests, spider):

        # Called with the start requests of the spider, and works

        # similarly to the process_spider_output() method, except

        # that it doesn’t have a response associated.

        # Must return only requests (not items).

        for r in start_requests:

            yield r

    def spider_opened(self, spider):

        spider.logger.info('Spider opened: %s' % spider.name)

class MaitianDownloaderMiddleware(object):

    # Not all methods need to be defined. If a method is not defined,

    # scrapy acts as if the downloader middleware does not modify the

    # passed objects.

    @classmethod

    def from_crawler(cls, crawler):

        # This method is used by Scrapy to create your spiders.

        s = cls()

        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        return s

    def process_request(self, request, spider):

        # Called for each request that goes through the downloader

        # middleware.

        # Must either:

        # - return None: continue processing this request

        # - or return a Response object

        # - or return a Request object

        # - or raise IgnoreRequest: process_exception() methods of

        #   installed downloader middleware will be called

        return None

    def process_response(self, request, response, spider):

        # Called with the response returned from the downloader.

        # Must either;

        # - return a Response object

        # - return a Request object

        # - or raise IgnoreRequest

        return response

    def process_exception(self, request, exception, spider):

        # Called when a download handler or a process_request()

        # (from other downloader middleware) raises an exception.

        # Must either:

        # - return None: continue processing this exception

        # - return a Response object: stops process_exception() chain

        # - return a Request object: stops process_exception() chain

        pass

    def spider_opened(self, spider):

        spider.logger.info('Spider opened: %s' % spider.name)

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo

from scrapy.conf import settings

class MaitianPipeline(object):

    def __init__(self):

        host = settings['MONGODB_HOST']

        port = settings['MONGODB_PORT']

        db_name = settings['MONGODB_DBNAME']

        client = pymongo.MongoClient(host=host, port=port)

        db = client[db_name]

        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):

        zufang = dict(item)

        self.post.insert(zufang)

        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for maitian project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'maitian'

SPIDER_MODULES = ['maitian.spiders']

NEWSPIDER_MODULE = 'maitian.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

#USER_AGENT = 'maitian (+http://www.yourdomain.com)'

# Obey robots.txt rules

ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

#}

# Enable or disable spider middlewares

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'maitian.middlewares.MaitianSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'maitian.middlewares.MaitianDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#ITEM_PIPELINES = {

#    'maitian.pipelines.MaitianPipeline': 300,

#}

ITEM_PIPELINES = {

   'maitian.pipelines.DuplicatesPipeline': 301,

}

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

ITEM_PIPELINES = {'maitian.pipelines.MaitianPipeline': 300,}

MONGODB_HOST = '127.0.0.1'

MONGODB_PORT = 27017

MONGODB_DBNAME = 'maitian'

MONGODB_DOCNAME = 'zufang'

18.scrapy_maitian的更多相关文章

CSharpGL(18)分别处理glDrawArrays()和glDrawElements()两种方式下的拾取(ColorCodedPicking)
CSharpGL(18)分别处理glDrawArrays()和glDrawElements()两种方式下的拾取(ColorCodedPicking) 我在(Modern OpenGL用Shader拾取 ...
ABP(现代ASP.NET样板开发框架)系列之18、ABP应用层——权限验证
点这里进入ABP系列文章总目录 ABP(现代ASP.NET样板开发框架)系列之18.ABP应用层——权限验证 ABP是“ASP.NET Boilerplate Project (ASP.NET样板项目 ...
ASP.NET MVC5+EF6+EasyUI 后台管理系统（18）-权限管理系统-表数据
系列目录这一节,我们插入数据来看看数据流,让各位同学,知道这个权限表交互是怎么一个流程,免得大家后天雾里来雾里去首先我再解释一些表,SysUser和SysRole表不用解释了. SysRoleSys ...
C#开发微信门户及应用(18)-微信企业号的通讯录管理开发之成员管理
在上篇随笔<C#开发微信门户及应用(17)-微信企业号的通讯录管理开发之部门管理>介绍了通讯录的部门的相关操作管理,通讯录管理包括部门管理.成员管理.标签管理三个部分,本篇主要介绍成员的管 ...
[MySQL Reference Manual] 18 复制
18 复制 18 复制 18.1 复制配置 18.1.1 基于Binary Log的数据库复制配置 18.1.2 配置基于Binary log的复制 18.1.2.1 设置复制master的配置 18 ...
Hihocoder 太阁最新面经算法竞赛18
Hihocoder 太阁最新面经算法竞赛18 source: https://hihocoder.com/contest/hihointerview27/problems 题目1 : Big Plus ...
grep-2.26 sed-4.2.2 awk-4.1.4 wget-1.18 pcregrep-8.39 pcre2grep-10.22 for windows 最新版本静态编译
-------------------------------------------------------------------------------------------- grep (G ...
《C#本质论》读书笔记（18）多线程处理
.NET Framework 4.0 看(本质论第3版) .NET Framework 4.5 看(本质论第4版) .NET 4.0为多线程引入了两组新API:TPL(Task Parallel Li ...
Java随机生成18位身份证号
package com.ihome.data; import java.text.SimpleDateFormat; import java.util.Calendar; import java.ut ...

随机推荐

WPS Office for Mac如何修改Word文档文字排列？WPS office修改Word文档文字排列方向教程
Word文档如何改变文字的排列方向?最新版WPS Office for Mac修复了文字排版相关的细节问题,可以更快捷的进行Word编辑,WPS Office在苹果电脑中如何修改Word文档文字排列方 ...
PDO基础
//PDO:数据访问抽象层 $dsn = "mysql:dbname=mydb;host=localhost";//造PDO对象 $pdo = new PDO($dsn," ...
NTT数论变换
数论变换NTT 前置知识 FFT:NTT的思想和FFT一样(FFT介绍) 概述数论变换,即NTT(Number Theory Transformation?),是基于数论域的FFT,一般我们默认FF ...
OpenStack与KVM的区别与联系
转:https://www.aliyun.com/zixun/content/2_6_280418.html OpenStack与KVM都是目前IT界比较热门的两个词汇.它们都是开源的,都与Linux ...
faster-rcnn代码阅读-proposal层
这一节讲述proposal层,和这一层有关的结构图如下: proposal层的prototxt定义如下: layer { name: 'proposal' type: 'Python' bottom: ...
bootstrap中container和container-fluid的区别
container和container-fluid 在bootstrap中,两者都是设置文本居中,但是它们还是有很大差别的 container 是随屏幕宽度的变化而变化的,是阶段性变化,有一个随浏览器 ...
scrapy爬取cnblogs文章列表
scrapy爬取cnblogs文章目标任务安装爬虫创建爬虫编写 items.py 编写 spiders/cnblogs.py 编写 pipelines.py 编写 settings.py 运行 ...
Openstack nova-scheduler 源码分析 — Filters/Weighting
目录目录前言调度器 FilterScheduler调度器的工作流程 Filters 过滤器 Filters 类型 Weighting 权重源码实现关键文件及其意义阶段一nova-sched ...
HIVE文件
注册表的本地实体文件, 察看位置,以及映射本地文件到注册表中的位置, HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\hivelist 在这里写 ...
std::unorder_set你插入元素的顺序不一定就是元素在里面的元素
去看了下cppreference,里面写了根据哈希值排序了

18.scrapy_maitian

18.scrapy_maitian的更多相关文章

随机推荐

热门专题