爬取链家网租房图使用ImagesPipeline保存图片

# 爬虫文件

# -*- coding: utf-8 -*-

import scrapy

import os

from urllib import request

from lianjia.items import LianjiaItem

class LianjiaspiderSpider(scrapy.Spider):
    """Crawl Beijing rental listings on bj.lianjia.com.

    ``parse`` walks a listing page, extracts title / location / price for
    every entry and follows the entry's detail link.  ``parse_detail``
    collects the thumbnail image URLs from the detail page and yields one
    ``LianjiaItem`` per listing, with the URLs stored in ``image_urls``.
    """

    name = 'lianjiaSpider'

    # allowed_domains = ['www.xxx.com']

    start_urls = ['https://bj.lianjia.com/zufang/l1rp5/#contentList']

    def parse(self, response):
        """Yield a detail-page request per listing and, once, pages 2-100.

        Args:
            response: response of a listing page.

        Yields:
            scrapy.Request: detail-page requests carrying
            ``(title, location, price, detail_url)`` in ``meta['info']``,
            followed by the listing-page requests for pages 2 to 100 when
            ``response`` is not itself one of those pages.
        """
        item_main = './/div[@class="content__list--item--main"]'
        div_list = response.xpath(
            '//div[@class="content__list"]/div[@class="content__list--item"]'
        )

        for div in div_list:
            title = div.xpath(item_main + '/p[1]/a/text()').get()
            detail_url = div.xpath(item_main + '/p[1]/a/@href').get()
            if not title or not detail_url:
                # Entry without a title link: nothing to follow, skip it
                # instead of failing on None.strip() / str + None.
                continue

            title = title.strip()
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute hrefs untouched.
            detail_url = response.urljoin(detail_url)

            location = div.xpath(item_main + '/p[2]//text()').getall()
            location = "".join(
                x.replace("\n", "").replace("-", "").replace("/", "").strip()
                for x in location
            )

            price = div.xpath(item_main + '/span//text()').getall()
            # First two text nodes joined; tolerates fewer than two nodes.
            price = "".join(price[:2])

            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'info': (title, location, price, detail_url)},
            )

        # Only a response that was not produced by the pagination below
        # schedules pages 2-100, so the same 99 URLs are not re-yielded from
        # every listing page.
        if response.meta.get('paginated'):
            return

        for i in range(2, 101):
            next_url = "https://bj.lianjia.com/zufang/pg%dl1rp5/#contentList" % i
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,
                meta={'paginated': True},
            )

    def parse_detail(self, response):
        """Build the item for one listing from its detail page.

        Args:
            response: detail-page response whose ``meta['info']`` holds the
                ``(title, location, price, detail_url)`` tuple set in ``parse``.

        Yields:
            LianjiaItem: the listing data plus the thumbnail image URLs.
        """
        title, location, price, detail_url = response.meta.get("info")

        pic_srcs = response.xpath(
            "//div[@class='content__thumb--box']/ul//img/@src"
        ).getall()

        self.logger.info('房源链接: %s', detail_url)

        item = LianjiaItem()
        item["title"] = title
        item["location"] = location
        item["price"] = price
        item['detail_url'] = detail_url
        item['image_urls'] = pic_srcs

        yield item

# 管道文件

# 保存图片

# 普通方法保存图片

import os

from urllib import request

class LianjiaPipeline(object):
    """Item pipeline that downloads a listing's pictures with ``urllib``.

    Pictures are written to ``<images dir>/<item location>/<name>.png`` where
    the images directory is ``images`` inside the parent of the directory that
    contains this file.
    """

    def __init__(self):
        # os.path.dirname(__file__) is the directory of this pipeline file;
        # applying dirname again gives its parent, where "images" is created.
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')

        if not os.path.exists(self.path):
            print("images文件夹不存在")
            # exist_ok avoids a crash if the directory appears in between.
            os.makedirs(self.path, exist_ok=True)

    def process_item(self, item, spider):
        """Download every picture of ``item`` into a per-location folder.

        Args:
            item: mapping with a ``location`` entry and the picture URLs in
                ``pic_srcs`` (``image_urls`` is accepted as a fallback because
                that is the key the spider fills).
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``.
        """
        location = item['location']
        urls = item.get('pic_srcs') or item.get('image_urls') or []

        # os.path.join already uses the separator of the running OS, so the
        # path must not be rewritten by hand.
        per_house_pic_path = os.path.join(self.path, location)
        print('每一个户型图的保存路径:', per_house_pic_path)
        os.makedirs(per_house_pic_path, exist_ok=True)

        for index, url in enumerate(urls):
            # Request the larger rendition of the picture.
            url = url.replace('126x86.jpg', '780x439.jpg')
            pic_name = self._picture_name(url, index)
            # NOTE(review): the URLs look like .jpg files but are saved with a
            # .png suffix; kept so existing output file names stay the same.
            request.urlretrieve(
                url=url,
                filename=os.path.join(per_house_pic_path, pic_name + '.png'),
            )

        return item

    @staticmethod
    def _picture_name(url, index):
        """Return a file-name stem for ``url`` so pictures do not overwrite each other."""
        parts = url.split('.')
        # Same slice as before when the URL has enough dot-separated parts.
        stem = parts[2][-9:-1] if len(parts) > 2 else ''
        # A stem must never contain path separators.
        stem = stem.replace('/', '_').replace('\\', '_')
        return stem or 'image_%d' % index

# item文件

class LianjiaItem(scrapy.Item):
    """Fields of one rental listing, as read by ``LianjiaPipeline``."""

    # define the fields for your item here like:

    # Plain scraped fields

    title = scrapy.Field()

    detail_url = scrapy.Field()

    location = scrapy.Field()

    price = scrapy.Field()

    # Picture URLs; LianjiaPipeline.process_item iterates over this field
    pic_srcs = scrapy.Field()

# setting中

# Enable the urllib-based download pipeline with order value 300.
ITEM_PIPELINES = {'lianjia.pipelines.LianjiaPipeline': 300}

# 使用 Scrapy 自带的 ImagesPipeline 保存图片

import os

from urllib import request

from scrapy.pipelines.images import ImagesPipeline

from lianjia import settings

class LjImagesPipeline(ImagesPipeline):
    """``ImagesPipeline`` that files each picture under its listing's location.

    The stored path becomes ``<location>/<default file name>`` instead of the
    default ``full/<default file name>``.
    """

    def get_media_requests(self, item, info):
        """Return the download requests for ``item`` with the item attached.

        Called before the downloads are sent.  The item is bound to every
        request so that ``file_path`` can still reach it when it is not handed
        the item directly.
        """
        # Materialise first: iterating a lazy iterable here would otherwise
        # hand an exhausted iterator back to the caller.
        request_objs = list(
            super(LjImagesPipeline, self).get_media_requests(item, info)
        )
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path of a picture, relative to the image store.

        Args:
            request: the download request of the picture.
            response: the download response, if available.
            info: pipeline bookkeeping object passed by Scrapy.
            item: the item being processed, when the caller supplies it;
                otherwise the item bound in ``get_media_requests`` is used.

        Returns:
            ``"<location>/<file name>"`` when the item has a location, else
            the parent class's default path.
        """
        path = super(LjImagesPipeline, self).file_path(request, response, info)

        if item is None:
            item = getattr(request, 'item', None)
        location = item.get('location') if item is not None else None
        if not location:
            # No folder name available: keep the default location.
            return path

        # Drop the default "full/" prefix, keeping only the file name.
        image_name = path.replace('full/', '')

        # The path is returned relative to the store and with "/" separators;
        # the store resolves it against IMAGES_STORE and creates the folder,
        # so no absolute path or manual mkdir is needed here.
        return '%s/%s' % (location, image_name)

# 对应的item文件

class LianjiaItem(scrapy.Item):
    """Fields of one rental listing, for use with the Images Pipeline."""

    # define the fields for your item here like:

    # Plain scraped fields

    title = scrapy.Field()

    detail_url = scrapy.Field()

    location = scrapy.Field()

    price = scrapy.Field()

    # pic_srcs = scrapy.Field()

    # Fields required when using the Images Pipeline

    image_urls=scrapy.Field()

    images = scrapy.Field()

#settings文件

# Only the customised images pipeline (with its two overridden methods) is
# enabled.  The alternatives stay switched off:
#   'lianjia.pipelines.LianjiaPipeline': 300
#   'scrapy.pipelines.images.ImagesPipeline': 1  (would bypass the pipeline file)
ITEM_PIPELINES = {'lianjia.pipelines.LjImagesPipeline': 1}

# 图片下载的路径 供image.pipelines使用

import os

# Image storage path: the "images" folder inside the parent of this file's directory

IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images') # path of the images folder

# 总结:

1.  在 process_item() 方法中, self.path 保存的是 images 文件夹的路径, 每一个房源的户型图都要保存在这个文件夹下面各自的子文件夹里.

2.  在 Windows 系统中拼接路径时, 应统一使用 os.path.join(), 它会自动使用当前系统的路径分隔符. print 打印出来的单个反斜杠 \ 和字符串字面量里写的双反斜杠 \\ 表示的是同一个字符; 如果手动混用 / 和 \ 来拼接或替换路径, 就可能因为找不到路径而报错.

爬取链家网租房图使用ImagesPipeline保存图片的更多相关文章

Scrapy实战篇（一）之爬取链家网成交房源数据（上）
今天,我们就以链家网南京地区为例,来学习爬取链家网的成交房源数据. 这里推荐使用火狐浏览器,并且安装firebug和firepath两款插件,你会发现,这两款插件会给我们后续的数据提取带来很大的方便. ...
Python的scrapy之爬取链家网房价信息并保存到本地
因为有在北京租房的打算,于是上网浏览了一下链家网站的房价,想将他们爬取下来,并保存到本地. 先看链家网的源码..房价信息都保存在 ul 下的li 里面爬虫结构: 其中封装了一个数据库处理模 ...
Scrapy实战篇（九）之爬取链家网天津租房数据
以后有可能会在天津租房子,所以想将链家网上面天津的租房数据抓下来,以供分析使用. 思路: 1.以初始链接https://tj.lianjia.com/zufang/rt200600000001/?sh ...
Scrapy实战篇（二）之爬取链家网成交房源数据（下）
在上一小节中,我们已经提取到了房源的具体信息,这一节中,我们主要是对提取到的数据进行后续的处理,以及进行相关的设置. 数据处理我们这里以把数据存储到mongo数据库为例.编写pipelines.py ...
适合初学者的Python爬取链家网教程
前言文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: TinaLY PS:如有需要Python学习资料的小伙伴可以加点击下 ...
python - 爬虫入门练习爬取链家网二手房信息
import requests from bs4 import BeautifulSoup import sqlite3 conn = sqlite3.connect("test.db&qu ...
Python——Scrapy爬取链家网站所有房源信息
用scrapy爬取链家全国以上房源分类的信息: 路径: items.py # -*- coding: utf-8 -*- # Define here the models for your scrap ...
Python爬虫项目--爬取链家热门城市新房
本次实战是利用爬虫爬取链家的新房(声明: 内容仅用于学习交流, 请勿用作商业用途) 环境 win8, python 3.7, pycharm 正文 1. 目标网站分析通过分析, 找出相关url, 确 ...
python爬虫：爬取链家深圳全部二手房的详细信息
1.问题描述: 爬取链家深圳全部二手房的详细信息,并将爬取的数据存储到CSV文件中 2.思路分析: (1)目标网址:https://sz.lianjia.com/ershoufang/ (2)代码结构 ...

随机推荐

VLC-FM PLAYLIST
VLC-FM-PLAYLIST.xspf <?xml version="1.0" encoding="UTF-8"?> <playlist x ...
C# 获取系统Icon、获取文件相关的Icon
原文:C# 获取系统Icon.获取文件相关的Icon 1.获取系统Icon工具下载SystemIcon.exe using System; using System.Collections.Gener ...
Emgu-WPF学习使用-识别二维码的位置
原文:Emgu-WPF学习使用-识别二维码的位置参考链接:http://blog.csdn.net/gaobobo138968/article/details/47663607 我完全参 ...
WPF编游戏系列之八银行界面及金额校验
原文:WPF编游戏系列之八银行界面及金额校验在前面<WPF编游戏系列之四用户控件>一文中通过用户控件创建了"My Shop"中物品列表框.本篇继 ...
《Docker 实战》第三章 Docker Hub 寻宝游戏
# 秘密仓库和密码 docker run --rm -it --name password dockerinaction/ch3_ex2_huntanswer
基于Netbeans的安卓Android开发环境配置 - CSDN博客
原文:基于Netbeans的安卓Android开发环境配置 - CSDN博客基于Netbeans的安卓Android开发环境配置一.准备工作 NetBeans 勾选网页中的Accept-选择对应系 ...
零元学Expression Blend 4 - Chapter 23 Deep Zoom Composer与Deep Zoom功能
原文:零元学Expression Blend 4 - Chapter 23 Deep Zoom Composer与Deep Zoom功能最近有机会在工作上用到Deep Zoom这个功能,我就顺便介绍 ...
改善C#程序的建议7：正确停止线程
原文:改善C#程序的建议7:正确停止线程开发者总尝试对自己的代码有更多的控制.“让那个还在工作的线程马上停止下来”就是诸多要求中的一种.然而事与愿违,这里面至少存在两个问题: 第一个问题是:正如线程 ...
Qt 下快速读写Excel指南（尘中远）
Qt Windows 下快速读写Excel指南很多人搜如何读写excel都会看到用QAxObject来进行操作,很多人试了之后都会发现一个问题,就是慢,非常缓慢!因此很多人得出结论是QAxObjec ...
SQLite的使用（包括编译安装的步骤）
SQLite官网http://www.sqlite.org/ SQLite简介 SQLite是一款轻型的数据库,是遵守ACID(原子性.一致性.隔离性和持久性)的关系式数据库管理系统.SQLite实现 ...

爬取链家网租房图 使用ImagesPipeline保存图片

爬取链家网租房图 使用ImagesPipeline保存图片的更多相关文章

随机推荐

热门专题

爬取链家网租房图使用ImagesPipeline保存图片

爬取链家网租房图使用ImagesPipeline保存图片的更多相关文章