ershoufang.py

# -*- coding: utf-8 -*-
import scrapy


class ErshoufangSpider(scrapy.Spider):
    name = 'ershoufang'
    allowed_domains = ['maitian.com']
    start_urls = ['http://maitian.com/']

    def parse(self, response):
        pass
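This file is just the untouched stub that Scrapy's generator produces; its parse() method is never filled in. As a sketch, it could be regenerated inside the project with:

scrapy genspider ershoufang maitian.com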

zufang_spider.py

import scrapy
from maitian.items import MaitianItem


class MaitianSpider(scrapy.Spider):
    name = "zufang"
    start_urls = ['http://bj.maitian.cn/zfall/PG1']

    def parse(self, response):
        for zufang_item in response.xpath('//div[@class="list_title"]'):
            yield {
                'title': zufang_item.xpath('./h1/a/text()').extract_first().strip(),
                'price': zufang_item.xpath('./div[@class="the_price"]/ol/strong/span/text()').extract_first().strip(),
                'area': zufang_item.xpath('./p/span/text()').extract_first().replace('㎡', '').strip(),
                'district': zufang_item.xpath('./p//text()').re(r'昌平|朝阳|东城|大兴|丰台|海淀|石景山|顺义|通州|西城')[0],
            }

        next_page_url = response.xpath(
            '//div[@id="paging"]/a[@class="down_page"]/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
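With the project set up, the spider is started with the standard Scrapy commands; the second form also exports the scraped items to a file for quick inspection (the filename is arbitrary):

scrapy crawl zufang
scrapy crawl zufang -o zufang.json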

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MaitianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    area = scrapy.Field()
    district = scrapy.Field()
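The spider above yields plain dicts, so MaitianItem is imported but never actually used. A minimal sketch of how the parse() loop could populate the item class instead, using the same fields and XPaths:

                item = MaitianItem()
                item['title'] = zufang_item.xpath('./h1/a/text()').extract_first().strip()
                item['price'] = zufang_item.xpath('./div[@class="the_price"]/ol/strong/span/text()').extract_first().strip()
                item['area'] = zufang_item.xpath('./p/span/text()').extract_first().replace('㎡', '').strip()
                item['district'] = zufang_item.xpath('./p//text()').re(r'昌平|朝阳|东城|大兴|丰台|海淀|石景山|顺义|通州|西城')[0]
                yield item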

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class MaitianSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        #
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        #
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        #
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        #
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MaitianDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        #
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        #
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        #
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
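These two classes are the unmodified project-template middlewares and do nothing by default; they only take effect if enabled in settings.py, for example by uncommenting the corresponding entries shown there:

SPIDER_MIDDLEWARES = {
    'maitian.middlewares.MaitianSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'maitian.middlewares.MaitianDownloaderMiddleware': 543,
}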

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.conf import settings


class MaitianPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        zufang = dict(item)
        self.post.insert(zufang)
        return item
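Note that scrapy.conf and pymongo's Collection.insert() are both deprecated in current releases. A minimal sketch of the same pipeline against newer APIs (assuming Scrapy 1.x+ and pymongo 3.x+), reading the settings through from_crawler() and writing with insert_one():

import pymongo


class MaitianPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        # Open the connection once and reuse the target collection.
        self.client = pymongo.MongoClient(host=host, port=port)
        self.post = self.client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MongoDB settings defined in settings.py from the crawler.
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.get('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def process_item(self, item, spider):
        # insert_one() replaces the deprecated insert().
        self.post.insert_one(dict(item))
        return item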

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for maitian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'maitian'

SPIDER_MODULES = ['maitian.spiders']
NEWSPIDER_MODULE = 'maitian.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'maitian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'maitian.middlewares.MaitianSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'maitian.middlewares.MaitianDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'maitian.pipelines.MaitianPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection settings read by MaitianPipeline
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'maitian'
MONGODB_DOCNAME = 'zufang'
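After a crawl finishes, the stored documents can be checked directly with pymongo (a minimal sketch, assuming MongoDB is running locally with the settings above and pymongo 3.7+ for count_documents):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['maitian']['zufang']

print(collection.count_documents({}))   # total number of stored listings
for doc in collection.find().limit(3):  # peek at a few documents
    print(doc['title'], doc['price'], doc['district'])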
