scrapy爬取《坏蛋是怎样炼成的4》

scrapy具体介绍就不用说了，自己百度一下。或者参考以下文档

https://blog.csdn.net/u011054333/article/details/70165401

直接在cmd里运行

scrapy startproject huaidan

scrapy genspider huaidan huaidan4.com

然后贴代码放到spiders文件夹里

 1 # -*- coding: utf-8 -*-

 2 import scrapy

 3 from scrapy.http import Request

 4 from urllib import parse

 5 import re

 6

 7 class huaidan(scrapy.Spider):

 8     name = "huaidan"

 9     allowed_domains = ["www.huaidan4.com"]

10     start_urls = ["http://www.huaidan4.com/di-yi-fen-lei.html",

11                   "http://www.huaidan4.com/di-er-juan.html",

12                   "http://www.huaidan4.com"]

13

14

15     #提取下一页文章url交给scrpy进行下载

16     def parse(self, response):

17         #获取文章url

18         all_article=response.css('.container ul li a::attr(href)').extract()

19         all_url=[]

20         for article_url in all_article:

21             if article_url in all_url:

22                 pass

23             else:

24                 all_url.append(article_url)

25                 yield Request(url=article_url,encoding='utf-8',callback=self.parse_detail)

26

27

28

29

30     #提取文章的具体字段

31     def parse_detail(self,response):

32         #获取文章标题

33         article_title = response.xpath('//*[@id="content"]/div[1]/div[1]/h2/text()').extract_first()

34

35         #获取创建时间

36         create_time = response.xpath('//*[@id="content"]/div[1]/div[1]/span/text()[2]').extract_first().strip()

37

38         #获取文章正文

39         article_text = response.css('.post_entry,p::text').extract_first()

40         #处理正文标点符号和无用的信息

41         article_text = re.sub('</?\w+[^>]*>','',article_text)

42         article_text = article_text.replace("\', \'","")

43         article_text = article_text.replace("\\u3000","").strip()

44         article_text = article_text.replace("\\xa0\\xa0\\xa0\\xa0","")

45         article_text = article_text.replace("(新书上传，求收藏，推荐!!!!!!!!!!!!!!!!!!!!)","")

46         article_text = article_text.replace("\\r\\n", "\n")

47         article_text = article_text.replace("免费小说", "")

48         article_text = article_text.replace("www.huaidan4.com", "")

49         article_text = article_text.replace("neirong_2();", "")

50         article_text = article_text.replace("dibutuijian();", "")

51         article_text = article_text.replace("◎欢迎参与讨论，请在这里发表您的看法、交流您的观点。", "")

52         article_text = article_text.replace("《坏蛋是怎样炼成的4》是继曹三少坏蛋是怎样炼成的3的又一作品，作者是曹三少，如果你喜欢坏蛋是怎样炼成的4，请收藏本站以便下次阅读。","")

53         article_text = re.sub('/?\s+', '', article_text)

54

55         #保存文件

56         self.save_article(article_title,create_time,str(article_text))

57

58     #保存文件的方法

59     def save_article(self,article_title,create_time,article_text):

60         biaoti = re.sub('\W+','-',article_title)

61         with open(biaoti+'.txt','w',encoding='utf-8') as file:

62             neirong = (article_title+'\n'+create_time+'\n'+article_text)

63             file.write(neirong)

64             file.close()

以上内容初步完成了把文章保存在本地

---------------------------------------------------------------------------------------------------------------------------------------------------------------

下面内容完成把文章保存到mysql数据库

items.py负责存放爬取节点数据

import scrapy

class HuaidanItem(scrapy.Item):
    """Container for one scraped chapter as consumed by the pipelines."""

    # Name of the table the row is inserted into by the MySQL pipelines.
    catalogues = scrapy.Field()
    # Numeric chapter identifier; the pipelines convert it with int().
    id = scrapy.Field()
    # Chapter title.
    article_title = scrapy.Field()
    # Chapter body text.
    article_text = scrapy.Field()
    # Creation time string of the chapter.
    create_time = scrapy.Field()

pipelines.py负责处理items里的内容

# -*- coding: utf-8 -*-

import logging

import pymysql
from twisted.enterprise import adbapi

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#from scrapy.pipelines.images import ImagesPipeline

#from scrapy.pipelines.files import FilesPipeline

class HuaidanPipeline(object):
    """Project-template pipeline that performs no processing."""

    def process_item(self, item, spider):
        """Return *item* unchanged so it reaches the next pipeline stage."""
        return item

# Synchronous (blocking) insertion into MySQL.
class MysqlPiplines(object):
    """Item pipeline that writes each chapter to MySQL on the reactor thread.

    A row is inserted into the table named by ``item["catalogues"]`` unless
    a row with the same id already exists in one of the four chapter tables.
    All values are passed as query parameters, so quotes inside the scraped
    text can neither break the statement nor inject SQL.
    """

    # Looks the id up in every chapter table; one %s placeholder per table.
    _SELECT_SQL = ("select id from diyijuan where id = %s"
                   " union select id from dierjuan where id = %s"
                   " union select id from disanjuan where id = %s"
                   " union select id from other where id = %s")

    def __init__(self, host="192.168.7.5", user="huaidan",
                 password="huaidan123", database="huaidan"):
        """Open the connection.

        The defaults are the values the original code hard-coded, so
        ``MysqlPiplines()`` behaves as before.
        """
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  database=database, charset='utf8')
        self.cursor = self.db.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MYSQL_* settings, when they are set."""
        settings = crawler.settings
        return cls(
            host=settings.get("MYSQL_HOST", "192.168.7.5"),
            user=settings.get("MYSQL_USER", "huaidan"),
            password=settings.get("MYSQL_PASSWORD", "huaidan123"),
            database=settings.get("MYSQL_DBNAME", "huaidan"),
        )

    def process_item(self, item, spider):
        """Store *item* and return it for the next pipeline stage."""
        self.insert(item["catalogues"], int(item["id"]), item["article_title"],
                    item["create_time"], item["article_text"])
        return item

    def insert(self, catalogues, id, article_title, create_time, article_text):
        """Insert one chapter unless its id is already stored.

        Args:
            catalogues: name of the destination table.
            id: numeric chapter id.
            article_title: chapter title.
            create_time: creation time string.
            article_text: chapter body.
        """
        row_id = int(id)
        self.cursor.execute(self._SELECT_SQL, (row_id,) * 4)
        if self.cursor.fetchone() is not None:
            return

        # A table name cannot be a query parameter, so it is quoted as an
        # identifier; doubling embedded backticks is MySQL's escaping rule.
        table = "`" + str(catalogues).replace("`", "``") + "`"
        insertsql = "insert into " + table + " values (%s, %s, %s, %s)"
        try:
            self.cursor.execute(
                insertsql, (row_id, article_title, create_time, article_text))
            self.db.commit()
        except pymysql.MySQLError:
            # Still best-effort (the crawl goes on), but no longer silent.
            logging.getLogger(__name__).exception(
                "Insert of chapter %s into %s failed", row_id, table)
            self.db.rollback()

    def close_spider(self, spider):
        """Close the connection; Scrapy calls this hook when the spider ends."""
        self.db.close()

    def spider_closed(self, spider):
        """Backward-compatible alias of ``close_spider``."""
        self.close_spider(spider)

# Asynchronous insertion into MySQL through Twisted's adbapi pool.
class MysqlTwisted(object):
    """Item pipeline that inserts chapters via a Twisted connection pool.

    The insert runs in a pool thread, so the crawl is not blocked while
    MySQL is busy. Values are passed as query parameters; unlike the
    previous string-built statement they are stored without extra quote
    characters wrapped around them.
    """

    # Looks the id up in every chapter table; one %s placeholder per table.
    _SELECT_SQL = ("select id from diyijuan where id = %s"
                   " union select id from dierjuan where id = %s"
                   " union select id from disanjuan where id = %s"
                   " union select id from other where id = %s")

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline with a pool configured from the MYSQL_* settings."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            db=settings["MYSQL_DBNAME"],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert of *item* and return the item.

        The item must be returned; otherwise any pipeline that runs after
        this one receives ``None``.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, faileure):
        """Log a failure reported by the asynchronous insert."""
        logging.getLogger(__name__).error("MySQL insert failed: %s", faileure)

    def do_insert(self, cursor, item):
        """Insert *item* unless its id is already stored.

        Runs inside a transaction managed by ``runInteraction``.

        Args:
            cursor: database cursor supplied by the pool.
            item: mapping with the keys ``catalogues``, ``id``,
                ``article_title``, ``create_time`` and ``article_text``.
        """
        row_id = int(item["id"])
        cursor.execute(self._SELECT_SQL, (row_id,) * 4)
        # A row came back, so the id is already stored: nothing to do.
        if cursor.fetchone() is not None:
            return

        # A table name cannot be a query parameter, so it is quoted as an
        # identifier; doubling embedded backticks is MySQL's escaping rule.
        table = "`" + str(item["catalogues"]).replace("`", "``") + "`"
        insertsql = "insert into " + table + " values (%s, %s, %s, %s)"
        cursor.execute(insertsql, (row_id, item["article_title"],
                                   item["create_time"], item["article_text"]))

    def close_spider(self, spider):
        """Shut the connection pool down when the spider finishes."""
        self.dbpool.close()

class myarticlepipline(object):
    """Placeholder pipeline that forwards items without touching them."""

    def process_item(self, item, spider):
        """Return *item* as received."""
        return item

settings.py负责存放整体设置

# -*- coding: utf-8 -*-

import os

# Scrapy settings for huaidan project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project/bot.
BOT_NAME = 'huaidan'

# Package(s) in which spiders are looked up.
SPIDER_MODULES = ['huaidan.spiders']

# Package in which newly generated spiders are created.
NEWSPIDER_MODULE = 'huaidan.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

#USER_AGENT = 'huaidan (+http://www.yourdomain.com)'

# robots.txt handling: set to False here, i.e. robots.txt rules are NOT obeyed.
# NOTE(review): confirm that ignoring robots.txt is intended for this site.

ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

#}

# Enable or disable spider middlewares

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'huaidan.middlewares.HuaidanSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'huaidan.middlewares.HuaidanDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Enabled item pipelines and their order: the smaller the number, the
# earlier the pipeline runs.

ITEM_PIPELINES = {

   #'huaidan.pipelines.HuaidanPipeline': 300,

    #'scrapy.pipelines.files.FilesPipeline':2,

    #'huaidan.pipelines.myarticlepipline':1,

    #'huaidan.pipelines.MysqlPiplines':2,    # synchronous insert into MySQL

    'huaidan.pipelines.MysqlTwisted':1,     # asynchronous insert into MySQL

}

# Directory that contains this settings file.
project_dir = os.path.abspath(os.path.dirname(__file__))

# Chained assignment: binds BOTH the name FILES_URLS and the name FIELD to "".
# NOTE(review): possibly a typo for the single setting FILES_URLS_FIELD -- confirm.
FILES_URLS=FIELD =""

# Download target directory: "<project_dir>/files".
FILES_STORE = os.path.join(project_dir,'files')

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Database connection settings, kept here so the pipelines can read them
# from Scrapy's settings object. Each value can be overridden through a
# HUAIDAN_MYSQL_* environment variable so that real credentials do not have
# to be committed; the literals below are the original values and remain
# the defaults.
MYSQL_HOST = os.environ.get("HUAIDAN_MYSQL_HOST", "192.168.7.5")
MYSQL_DBNAME = os.environ.get("HUAIDAN_MYSQL_DBNAME", "huaidan")
MYSQL_USER = os.environ.get("HUAIDAN_MYSQL_USER", "huaidan")
MYSQL_PASSWORD = os.environ.get("HUAIDAN_MYSQL_PASSWORD", "huaidan123")

scrapy爬取《坏蛋是怎样练成的4》的更多相关文章

【转载】教你分分钟学会用python爬虫框架Scrapy爬取心目中的女神
原文:教你分分钟学会用python爬虫框架Scrapy爬取心目中的女神本博文将带领你从入门到精通爬虫框架Scrapy,最终具备爬取任何网页的数据的能力.本文以校花网为例进行爬取,校花网:http:/ ...
第三百三十四节，web爬虫讲解2—Scrapy框架爬虫—Scrapy爬取百度新闻，爬取Ajax动态生成的信息
第三百三十四节,web爬虫讲解2—Scrapy框架爬虫—Scrapy爬取百度新闻,爬取Ajax动态生成的信息 crapy爬取百度新闻,爬取Ajax动态生成的信息,抓取百度新闻首页的新闻rul地址有多 ...
Scrapy爬取美女图片续集 (原创)
上一篇咱们讲解了Scrapy的工作机制和如何使用Scrapy爬取美女图片,而今天接着讲解Scrapy爬取美女图片,不过采取了不同的方式和代码实现,对Scrapy的功能进行更深入的运用.(我的新书< ...
python scrapy爬取HBS 汉堡南美航运公司柜号信息
下面分享个scrapy的例子利用scrapy爬取HBS 船公司柜号信息 1.前期准备查询提单号下的柜号有哪些,主要是在下面的网站上,输入提单号,然后点击查询 https://www.hamburg ...
scrapy 爬取纵横网实战
前言闲来无事就要练练代码,不知道最近爬取什么网站好,就拿纵横网爬取我最喜欢的雪中悍刀行练手吧准备 python3 scrapy 项目创建: cmd命令行切换到工作目录创建scrapy项目两条命 ...
如何提升scrapy爬取数据的效率
在配置文件中修改相关参数: 增加并发默认的scrapy开启的并发线程为32个,可以适当的进行增加,再配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. ...
提高Scrapy爬取效率
1.增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. 2.降低 ...
scrapy爬取效率提升配置
增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. 降低日志级别 ...
提高scrapy爬取效率配置
提高scrapy爬取效率配置 #增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发 ...
Scrapy爬取美女图片 (原创)
有半个月没有更新了,最近确实有点忙.先是华为的比赛,接着实验室又有项目,然后又学习了一些新的知识,所以没有更新文章.为了表达我的歉意,我给大家来一波福利... 今天咱们说的是爬虫框架.之前我使用pyt ...

随机推荐

Solon 开发
Solon 开发一.注入或手动获取配置二.注入或手动获取Bean 三.构建一个Bean的三种方式四.Bean 扫描的三种方式五.切面与环绕拦截六.提取Bean的函数进行定制开发七.自定义注 ...
【刷题-LeetCode】228. Summary Ranges
Summary Ranges Given a sorted integer array without duplicates, return the summary of its ranges. Ex ...
golang取地址操作采坑：for idx,item := range arr中的item是个独立对象
先看代码: package main import "fmt" func main() { type s struct { A string B int32 } arr := [] ...
Unable to open 'free_base.cpp': Unable to read file 'c:\Program Files\Microsoft VS Code\minkernel\crts\ucrt\src\appcrt\heap\free_base.cpp'
问题 vscode编写C++程序,使用microsoft C++ Unable to open 'cvt.cpp': Unable to read file 'c:\Program Files\Mic ...
Cesium入门7 - Adding Terrain - 添加地形
Cesium入门7 - Adding Terrain - 添加地形 Cesium中文网:http://cesiumcn.org/ | 国内快速访问:http://cesium.coinidea.com ...
CMake语法—普通变量与函数（Normal Variable And Function）
目录 CMake语法-普通变量与函数(Normal Variable And Function) 1 CMake普通变量与函数示例 1.1 CMakeLists.txt 1.2 执行CMake配置脚本 ...
Maven常用设置
1,maven属性设置 <properties>设置maven的常用属性 <properties> 属性设置  ...
集合框架-Map集合-HashMap存储自定义对象
1 package cn.itcast.p6.hashmap.demo; 2 3 import java.util.HashMap; 4 import java.util.Iterator; 5 im ...
009 Linux 文件大小统计与排序( du于df和sort)
@ 目录 01 du 与 df 作用与区别? du(disk usage) df(disk free) 02 du 常用命令示例 03 sort 常用参数 04 常用组合 du + sort + he ...
TypeScript入门文档
typescript入门文档链接d地址:https://ts.xcatliu.com/basics/type-of-function.html 博主个人站点:www.devloper.top

scrapy爬取《坏蛋是怎样练成的4》

scrapy爬取《坏蛋是怎样练成的4》的更多相关文章

随机推荐

热门专题