scrapy全栈抓xpc练习
# spider文件
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
import json
import string
import random
from xpc.items import PostItem, CommentItem, CopyItem # 多个item def strip(s):
def strip(s):
    """Return ``s`` without surrounding whitespace, or ``""`` when ``s`` is falsy.

    Args:
        s: A string, or ``None`` / empty value (for example the result of an
            XPath ``.get()`` that matched nothing).

    Returns:
        ``s.strip()`` when ``s`` is truthy, otherwise the empty string.
    """
    if s:
        return s.strip()
    return ""


# When requests are sent with scrapy.Request / scrapy.FormRequest, the cookies
# are saved by default.
# 模拟登录的时候不用scrapy框架,直接使用request模块
# Module-level cookie mapping; it carries the Authorization token and is
# passed along with the list-page requests issued by the spider.
cookies = {
    'Authorization': '4F635191B0602B5D3B06024483B0602AAF8B06023C2F6259656D',
}
# 上面的cookies是网站返回的,需要先登陆的一下把这个cookies找到 # 生成26个字母+数字
def gen_sessionid():
    """Return a random 26-character string of lowercase letters and digits."""
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=26))


class XinpianchangSpider(scrapy.Spider):
    """Crawl xinpianchang.com list pages, post details, videos, comments and creators.

    Flow: ``parse`` (list page) -> ``parse_post`` (detail page) ->
    ``parse_video`` (video JSON) / ``parse_comment`` (comment JSON) /
    ``parse_composer`` (creator page).
    """

    name = 'XinPianChang'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']
    # Suppose crawling starts from page 21: that request must carry cookies, and
    # the cookies configured at module level are no longer enough because the
    # site returns four cookies. start_requests has to be overridden for that.
    # start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-21']

    # Number of list pages handled since the PHPSESSID cookie was last rotated.
    page_count = 0

    # Override of the parent start_requests method, which by default sends a GET
    # request for every url in start_urls.
    # def start_requests(self):
    #     for url in self.start_urls:
    #         # data = {
    #         #     "kw": "cat"
    #         # }
    #         # To send a POST request, use FormRequest:
    #         # yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
    #
    #         c = cookies.copy()
    #         c.update(PHPSESSID=gen_sessionid(),
    #                  SERVER_ID='b52601c8-285bdd26',
    #                  channel_page='apU%3D')
    #         yield Request(url, cookies=c, dont_filter=True)

    def parse(self, response):
        """Handle a list page: follow every post and every pagination link.

        Yields:
            Requests for post detail pages (``meta['pid']`` set) and for the
            other list pages linked from the pager.
        """
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        self.page_count += 1
        if self.page_count >= 100:
            # Rotate the session id in the shared module-level cookies after
            # every 100 list pages.
            cookies.update(PHPSESSID=gen_sessionid())
            self.page_count = 0

        url_list = response.xpath('//ul[@class="video-list"]/li/@data-articleid').extract()
        for pid in url_list:
            detail_url = 'https://www.xinpianchang.com/a{}?from=ArticleList'.format(pid)
            request = response.follow(detail_url, callback=self.parse_post)
            request.meta['pid'] = pid
            yield request  # request the detail page of the post

        pages = response.xpath('//div[@class="page"]/a/@href').extract()
        for page_url in pages:
            # page_url is a relative path; response.follow resolves it.
            yield response.follow(page_url, self.parse, cookies=cookies)

    def parse_post(self, response):
        """Handle a post detail page.

        Builds the ``PostItem``, then schedules the video-resource request (the
        item travels in ``meta['post']``), the first comment page, and one
        request per creator. A ``CopyItem`` is yielded for every creator.

        Yields:
            Requests for video / comments / creators, ``CopyItem`` objects, and
            the ``PostItem`` itself when the page exposes no video id.
        """
        pid = response.meta['pid']
        post = PostItem()
        post['pid'] = pid
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        cates = response.xpath('//span[@class="cate v-center"]/a/text()').extract()
        post['category'] = ''.join([cate.strip() for cate in cates])
        post['create_time'] = response.xpath('//span[contains(@class,"update-time")]/i/text()').get()
        post['play_count'] = response.xpath('//i[contains(@class,"play-counts")]/text()').get()
        desc_lst = response.xpath('//p[contains(@class,"desc")]//text()').extract()
        post['desc'] = ' '.join([i.strip() for i in desc_lst])

        # video_url = 'https://openapi-vtom.vmovier.com/v3/video/5E34203E92450?expand=resource&usage=xpc_web'
        # response.text is the page source; the video id is embedded in it.
        vid_match = re.search(r'vid: "(.*?)",', response.text)
        if vid_match:
            video_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web'.format(
                vid_match.group(1))
            # One extra hop: the video url comes from a separate JSON endpoint,
            # so the partially filled post is handed over through meta.
            request = Request(video_url, callback=self.parse_video)
            request.meta['post'] = post
            yield request
        else:
            # Without a vid there is nothing to request; emit the post as-is
            # instead of aborting the whole callback with an IndexError.
            self.logger.warning("no vid found on %s; post %s has no video_url", response.url, pid)
            yield post

        # Comment endpoint, e.g.
        # https://app.xinpianchang.com/comments?resource_id=10664352&type=article&page=1&per_page=24
        comment_url = "https://app.xinpianchang.com/comments?resource_id={}&type=article&page=1&per_page=24".format(
            pid)
        request = Request(comment_url, callback=self.parse_comment)
        request.meta['pid'] = pid
        yield request

        # Creator (author) pages.
        creator_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        for creator in creator_list:
            cid = creator.xpath('./a/@data-userid').get()
            if not cid:
                # No user id on this entry: neither a profile url nor the
                # pid + cid key can be built.
                continue
            composer_url = 'https://www.xinpianchang.com/u{}?from=articleList'.format(cid)
            request = response.follow(composer_url, self.parse_composer)
            request.meta['cid'] = cid
            # Keep updated cookies from piling up in the request headers.
            request.meta['dont_merge_cookies'] = True
            yield request

            # Relation between the creator and the post.
            cr = CopyItem()
            cr['pid'] = pid
            cr['cid'] = cid
            cr['pcid'] = pid + cid
            cr['role'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            yield cr

    def parse_video(self, response):
        """Handle the video-resource JSON and emit the finished ``PostItem``."""
        post = response.meta['post']
        # The response body is JSON; decode it first.
        result = json.loads(response.text)
        post['video_url'] = result['data']['resource']['default']['url']
        # Hand the completed item to the pipelines.
        yield post

    def parse_comment(self, response):
        """Handle one page of the comment JSON.

        Yields:
            A ``CommentItem`` per comment, plus a request for the next page
            (carrying the same ``meta['pid']``) when one is advertised.
        """
        pid = response.meta.get('pid')
        result = json.loads(response.text)
        for c in result['data']['list']:
            comment = CommentItem()
            comment['pid'] = pid
            comment['uname'] = c['userInfo']['username']
            comment['user_id'] = c['userInfo']['id']
            # comment['user_page'] = c['userInfo']['web_url']
            comment['content'] = c['content']
            comment['content_id'] = c['id']
            self.logger.debug("comment item: %s", comment)
            yield comment

        # Follow the next page when there is one.
        if result['data']['next_page_url']:
            next_page = 'https://app.xinpianchang.com' + result['data']['next_page_url']
            request = response.follow(next_page, self.parse_comment)
            request.meta['pid'] = pid
            yield request

    def parse_composer(self, response):
        """Handle a creator page (not implemented yet)."""
        pass
# settings文件
# -*- coding: utf-8 -*-

# Scrapy settings for xpc project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xpc'

SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'xpc (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# When using custom cookies, set COOKIES_ENABLED to True.
# When using the cookie from settings, set COOKIES_ENABLED to False.
COOKIES_ENABLED = True
COOKIES_DEBUG = True  # prints detailed cookie information

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'xpc.middlewares.XpcSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'xpc.middlewares.XpcDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,  # higher priority
    # 'xpc.pipelines.MysqlPipeline': 301,
    # 'xpc.pipelines.RedisPipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False  # when True, visited pages are cached and no real request is sent
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Log level: INFO DEBUG ERROR
LOG_LEVEL = 'DEBUG'
# item文件
# -*- coding: utf-8 -*-
import scrapy


class PostItem(scrapy.Item):
    """Video (post) information."""

    # Custom attribute, not a data field: names the target table, needed
    # because the project stores several kinds of items.
    table_name = 'posts'

    # Data fields.
    pid = scrapy.Field()
    title = scrapy.Field()
    category = scrapy.Field()
    create_time = scrapy.Field()
    play_count = scrapy.Field()
    desc = scrapy.Field()
    video_url = scrapy.Field()


class CommentItem(scrapy.Item):
    """Comment information."""

    table_name = 'comments'

    content_id = scrapy.Field()
    pid = scrapy.Field()
    cid = scrapy.Field()
    uname = scrapy.Field()
    user_id = scrapy.Field()
    content = scrapy.Field()
    user_page = scrapy.Field()


class CopyItem(scrapy.Item):
    """Relation between a post and one of its creators."""

    table_name = 'copyrights'

    pcid = scrapy.Field()  # primary key of the table
    pid = scrapy.Field()
    cid = scrapy.Field()
    role = scrapy.Field()
# pipeline文件
# -*- coding: utf-8 -*-
import csv
import os

import pymysql
from redis import Redis

from xpc.items import PostItem, CommentItem, CopyItem


class XpcPipeline(object):
    """Print every scraped item by type and keep a CSV file open next to this module."""

    def __init__(self):
        # xpc.csv lives in the directory that contains this file. os.path.join
        # avoids producing "/xpc.csv" when dirname() is empty.
        store_file = os.path.join(os.path.dirname(__file__), 'xpc.csv')
        # Open the output file.
        self.file = open(store_file, 'w', newline="")
        # csv writer bound to that file.
        self.writer = csv.writer(self.file)

    def open_spider(self, spider):
        """Called by Scrapy when the spider starts."""
        print("pipeline 开始爬虫......")

    def process_item(self, item, spider):
        """Report the item according to its type and pass it on unchanged."""
        # Dispatch on the item class, since several item types flow through here.
        if isinstance(item, PostItem):
            print("这是发布信息:", item)
        elif isinstance(item, CommentItem):
            print("这是评论信息:", item)
        elif isinstance(item, CopyItem):
            print("这是版权信息:", item)
        return item  # hand over to the next pipeline class

    def close_spider(self, spider):
        """Called by Scrapy when the spider finishes; releases the CSV file."""
        print("pipeline 结束爬虫......")
        # The handle opened in __init__ was never closed before.
        self.file.close()


# Connect to the database
class MysqlPipeline(object):
    """Store items in MySQL through pymysql.

    ``conn`` is opened in ``open_spider``; ``cursor`` is created on the first
    item and reused until ``close_spider``.
    """

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='',
            db='test_db',
            charset='utf8'
        )
        print("数据库连接成功")

    def process_item(self, item, spider):
        """Insert ``item['author']`` and ``item['content']``; always return the item.

        Any failure is printed and the transaction is rolled back, so one bad
        item does not stop the crawl.
        """
        if self.cursor is None:
            # One cursor for the whole run instead of a new, never-closed
            # cursor per item.
            self.cursor = self.conn.cursor()
        try:
            # Values go in as query parameters so the driver escapes them;
            # formatting them into the SQL text broke on quotes and allowed
            # SQL injection.
            self.cursor.execute(
                'insert into test_db values(%s, %s)',
                (item['author'], item['content']),
            )
            self.conn.commit()
        except Exception as e:
            print("数据库插入异常:", e)
            print("数据库执行回滚")
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Close cursor and connection, skipping whichever was never opened."""
        print("断开数据库连接")
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()


# Connect to the database
class RedisPipeline(object):
    """Push items onto a Redis list."""

    conn = None
    # Kept for backward compatibility; this class never assigns it.
    cursor = None

    def open_spider(self, spider):
        """Open the Redis connection when the spider starts."""
        self.conn = Redis(
            host='127.0.0.1',
            port=6379
        )
        print("数据库连接成功")

    def process_item(self, item, spider):
        """Push the item's author and content to the list and return the item."""
        import json  # local import: the pipelines module does not import json

        dic = {
            "author": item["author"],
            "content": item["content"]
        }
        # Serialise to a JSON string before pushing instead of passing the
        # raw dict to lpush.
        self.conn.lpush("队列名字", json.dumps(dic, ensure_ascii=False))
        # Returning the item keeps later pipelines working.
        return item

    def close_spider(self, spider):
        """Close the Redis connection if it was opened."""
        print("断开数据库连接")
        # self.cursor is always None here, so only the connection is closed.
        if self.conn is not None:
            self.conn.close()
scrapy全栈抓xpc练习的更多相关文章
- 爬虫系列---scrapy全栈数据爬取框架(Crawlspider)
		一 简介 crawlspider 是Spider的一个子类,除了继承spider的功能特性外,还派生了自己更加强大的功能. LinkExtractors链接提取器,Rule规则解析器. 二 强大的链接 ... 
- 大数据全栈式开发语言 – Python
		前段时间,ThoughtWorks在深圳举办一次社区活动上,有一个演讲主题叫做“Fullstack JavaScript”,是关于用JavaScript进行前端.服务器端,甚至数据库(MongoDB) ... 
- 为什么说Python 是大数据全栈式开发语言
		欢迎大家访问我的个人网站<刘江的博客和教程>:www.liujiangblog.com 主要分享Python 及Django教程以及相关的博客 交流QQ群:453131687 原文链接 h ... 
- 《从零开始做一个MEAN全栈项目》(2)
		欢迎关注本人的微信公众号"前端小填填",专注前端技术的基础和项目开发的学习. 上一节简单介绍了什么是MEAN全栈项目,这一节将简要介绍三个内容:(1)一个通用的MEAN项目的技 ... 
- 《web全栈工程师的自我修养》阅读笔记
		在买之前以为这本书是教你怎么去做一个web全栈工程师,以及介绍需要掌握的哪些技术的书,然而看的过程中才发现,是一本方法论的书.读起来的感觉有点像红衣教主的<我的互联网方法论>,以一些自己的 ... 
- Win10构建Python全栈开发环境With WSL
		目录 Win10构建Python全栈开发环境With WSL 启动WSL 总结 对<Dev on Windows with WSL>的补充 Win10构建Python全栈开发环境With ... 
- python全栈开发中级班全程笔记(第二模块、第四章)(常用模块导入)
		python全栈开发笔记第二模块 第四章 :常用模块(第二部分) 一.os 模块的 详解 1.os.getcwd() :得到当前工作目录,即当前python解释器所在目录路径 impor ... 
- 学习笔记之Python全栈开发/人工智能公开课_腾讯课堂
		Python全栈开发/人工智能公开课_腾讯课堂 https://ke.qq.com/course/190378 https://github.com/haoran119/ke.qq.com.pytho ... 
- Python全栈面试题
		Mr.Seven 博客园 首页 新随笔 联系 订阅 管理 随笔-132 文章-153 评论-516 不吹不擂,你想要的Python面试都在这里了[315+道题] 写在前面 近日恰逢学生毕 ... 
随机推荐
- python语法基础-函数-进阶-长期维护
			############### 函数的命名空间和作用域 ############## """ # 函数进阶 命名空间和作用域 命名空间 全局命名空间——我们自 ... 
- Weave实现跨主机容器互联
			安装与启动 直接从github下载二进制文件安装. docker_host1(服务器1): 下载weave # sudo wget -O /usr/local/bin/weave https://ra ... 
- 使用dtree构建框架导航
			前言: 该例子就是个框架导航 , 左边包含dtree的框架,点击上面的节点右边框架显示 说明步骤: 1. 首先获得dtree http://www.destroydrop.com/javascrip ... 
- REVIT 卸载工具,完美彻底卸载清除干净revit各种残留注册表和文件
			一些同学安装revit出错了,也有时候想重新安装revit的时候会出现这种本电脑windows系统已安装revit,你要是不留意直接安装,只会安装revit的附件,revit是不会安装上的.这种原因呢 ... 
- 吴裕雄--天生自然python学习笔记:Python3 XML 解析
			什么是 XML? XML 指可扩展标记语言(eXtensible Markup Language),标准通用标记语言的子集,是一种用于标记电子文件使其具有结构性的标记语言. XML 被设计用来传输和存 ... 
- JVM笔记(二)
			内存分配1)对象的内存分配,往大的方向讲,就是在堆上分配2)对象优先在Eden分3)大对象直接进入老年代4)长期存活的对象进入老年代:对象在Survivor区每“熬过”一次Minor GC,年数加1, ... 
- mongoDB连接信息及生成对应的collection生成代码
			.net,个人封装MONGODDB的操作. using System; using System.Collections.Generic; using System.Linq; using Syste ... 
- 关于JavaScript中bind、applay、call的区别
			在JavaScript中this的指向一直是一个困扰我们的问题,在JavaScript中this的指向是不固定的,但是我们可以通过使用bind().call().apply()来改变this的指向,但 ... 
- 吴裕雄--天生自然 python开发学习笔记:pycharm无法使用ctrl+c/v复制粘贴的问题
			在使用pycharm的时候发现不能正常使用ctrl+c/v进行复制粘贴,也无法使用tab键对大段代码进行整体缩进.后来发现是因为安装了vim插件的问题,在setting里找到vim插件,取消勾选即可解 ... 
- 接口自动化测试平台 http://120.79.232.23
			接口自动化测试平台 http://120.79.232.23 T Name Latest commit message Commit time .idea 修改自动化用例修改接口时,其他接口信息被删的 ... 
