scrapy具体介绍就不用说了,自己百度一下。或者参考以下文档

https://blog.csdn.net/u011054333/article/details/70165401

直接在cmd里运行

scrapy startproject huaidan

scrapy genspider huaidan huaidan4.com

然后贴代码放到spiders文件夹里

 1 # -*- coding: utf-8 -*-
2 import scrapy
3 from scrapy.http import Request
4 from urllib import parse
5 import re
6
7 class huaidan(scrapy.Spider):
8 name = "huaidan"
9 allowed_domains = ["www.huaidan4.com"]
10 start_urls = ["http://www.huaidan4.com/di-yi-fen-lei.html",
11 "http://www.huaidan4.com/di-er-juan.html",
12 "http://www.huaidan4.com"]
13
14
15 #提取下一页文章url交给scrpy进行下载
16 def parse(self, response):
17 #获取文章url
18 all_article=response.css('.container ul li a::attr(href)').extract()
19 all_url=[]
20 for article_url in all_article:
21 if article_url in all_url:
22 pass
23 else:
24 all_url.append(article_url)
25 yield Request(url=article_url,encoding='utf-8',callback=self.parse_detail)
26
27
28
29
30 #提取文章的具体字段
31 def parse_detail(self,response):
32 #获取文章标题
33 article_title = response.xpath('//*[@id="content"]/div[1]/div[1]/h2/text()').extract_first()
34
35 #获取创建时间
36 create_time = response.xpath('//*[@id="content"]/div[1]/div[1]/span/text()[2]').extract_first().strip()
37
38 #获取文章正文
39 article_text = response.css('.post_entry,p::text').extract_first()
40 #处理正文标点符号和无用的信息
41 article_text = re.sub('</?\w+[^>]*>','',article_text)
42 article_text = article_text.replace("\', \'","")
43 article_text = article_text.replace("\\u3000","").strip()
44 article_text = article_text.replace("\\xa0\\xa0\\xa0\\xa0","")
45 article_text = article_text.replace("(新书上传,求收藏,推荐!!!!!!!!!!!!!!!!!!!!)","")
46 article_text = article_text.replace("\\r\\n", "\n")
47 article_text = article_text.replace("免费小说", "")
48 article_text = article_text.replace("www.huaidan4.com", "")
49 article_text = article_text.replace("neirong_2();", "")
50 article_text = article_text.replace("dibutuijian();", "")
51 article_text = article_text.replace("◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。", "")
52 article_text = article_text.replace("《坏蛋是怎样炼成的4》是继曹三少坏蛋是怎样炼成的3的又一作品,作者是曹三少,如果你喜欢坏蛋是怎样炼成的4,请收藏本站以便下次阅读。","")
53 article_text = re.sub('/?\s+', '', article_text)
54
55 #保存文件
56 self.save_article(article_title,create_time,str(article_text))
57
58 #保存文件的方法
59 def save_article(self,article_title,create_time,article_text):
60 biaoti = re.sub('\W+','-',article_title)
61 with open(biaoti+'.txt','w',encoding='utf-8') as file:
62 neirong = (article_title+'\n'+create_time+'\n'+article_text)
63 file.write(neirong)
64 file.close()

以上内容初步完成了把文章保存在本地

---------------------------------------------------------------------------------------------------------------------------------------------------------------

下面内容完成把文章保存到mysql数据库

items.py负责存放爬取节点数据

import scrapy

class HuaidanItem(scrapy.Item):
    """Container for the data scraped from one article page."""

    # Used by the MySQL pipelines as the name of the table to insert into.
    catalogues = scrapy.Field()
    # Article id; the pipelines convert it with int() and use it to skip
    # articles that are already stored.
    id = scrapy.Field()
    article_title = scrapy.Field()
    article_text = scrapy.Field()
    create_time = scrapy.Field()

pipelines.py负责处理items里的内容

# -*- coding: utf-8 -*-

import pymysql
from twisted.enterprise import adbapi # Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#from scrapy.pipelines.images import ImagesPipeline
#from scrapy.pipelines.files import FilesPipeline


class HuaidanPipeline(object):
    """Default pass-through pipeline: returns every item unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received so later pipelines can process it."""
        return item


# Insert directly into the MySQL database
class MysqlPiplines(object):
    """Item pipeline that stores each article in MySQL with blocking queries."""

    def __init__(self):
        """Open the database connection and a cursor that is reused for all items."""
        self.db = pymysql.connect(host="192.168.7.5", user="huaidan", password="huaidan123", database="huaidan", charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` if its id is not stored yet, then return it unchanged."""
        self.insert(item["catalogues"], int(item["id"]), item["article_title"], item["create_time"], item["article_text"])
        return item

    def insert(self, catalogues, id, article_title, create_time, article_text):
        """Insert one article into table ``catalogues`` unless ``id`` already exists.

        The id is looked up in the tables diyijuan, dierjuan, disanjuan and
        other. All values are passed as query parameters so quotes in the
        scraped text cannot break or alter the statement. A failed insert is
        logged and rolled back rather than raised.
        """
        import logging  # local import: this block must not depend on file-level changes

        logger = logging.getLogger(__name__)

        # A table name cannot be sent as a query parameter, so only a plain
        # identifier is accepted before it is placed into the SQL text.
        if not str(catalogues).isidentifier():
            logger.error("Refusing to insert into invalid table name %r", catalogues)
            return

        id = int(id)
        selectsql = ("select id from diyijuan where id = %s"
                     " union select id from dierjuan where id = %s"
                     " union select id from disanjuan where id = %s"
                     " union select id from other where id = %s")
        self.cursor.execute(selectsql, (id, id, id, id))
        if self.cursor.fetchone() is not None:
            return  # already stored

        insertsql = "insert into `{}` values (%s, %s, %s, %s)".format(catalogues)
        try:
            self.cursor.execute(insertsql, (id, article_title, create_time, article_text))
            self.db.commit()
        except pymysql.MySQLError:
            logger.exception("Insert of article %s into %s failed", id, catalogues)
            self.db.rollback()

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.db.close()

    # Previous method name, kept so existing callers keep working.
    spider_closed = close_spider


# Insert into the MySQL database asynchronously
class MysqlTwisted(object):
    """Item pipeline that inserts articles into MySQL through a Twisted connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* entries of the project settings."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            db=settings["MYSQL_DBNAME"],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert on the connection pool and return the item.

        The item is returned so that pipelines running after this one still
        receive it.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # report failures of the insert
        return item

    def handle_error(self, faileure):
        """Log a failure raised while running ``do_insert``."""
        import logging  # local import: this block must not depend on file-level changes

        logging.getLogger(__name__).error("MySQL insert failed: %s", faileure)

    def do_insert(self, cursor, item):
        """Insert ``item`` into table ``item["catalogues"]`` unless its id exists.

        The id is looked up in the tables diyijuan, dierjuan, disanjuan and
        other. All values are passed as query parameters so quotes in the
        scraped text cannot break or alter the statement.

        Raises:
            ValueError: if ``item["catalogues"]`` is not a plain identifier.
        """
        id = int(item["id"])
        table = str(item["catalogues"])
        # A table name cannot be sent as a query parameter, so only a plain
        # identifier is accepted before it is placed into the SQL text.
        if not table.isidentifier():
            raise ValueError("invalid table name: %r" % (table,))

        # Check whether the id is already stored
        selectsql = ("select id from diyijuan where id = %s"
                     " union select id from dierjuan where id = %s"
                     " union select id from disanjuan where id = %s"
                     " union select id from other where id = %s")
        cursor.execute(selectsql, (id, id, id, id))
        # No row found means the article is new, so insert it
        if cursor.fetchone() is None:
            insertsql = "insert into `{}` values (%s, %s, %s, %s)".format(table)
            cursor.execute(
                insertsql,
                (id, item["article_title"], item["create_time"], item["article_text"]),
            )


class myarticlepipline(object):
    """Pass-through pipeline: returns every item unchanged."""

    def process_item(self, item, spider):
        return item

  

settings.py负责存放整体设置

# -*- coding: utf-8 -*-
import os
# Scrapy settings for huaidan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Project name and the packages Scrapy searches for spiders.
BOT_NAME = 'huaidan'

SPIDER_MODULES = ['huaidan.spiders']
NEWSPIDER_MODULE = 'huaidan.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'huaidan (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default)
#COOKIES_ENABLED = False # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False # Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#} # Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'huaidan.middlewares.HuaidanSpiderMiddleware': 543,
#} # Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'huaidan.middlewares.HuaidanDownloaderMiddleware': 543,
#} # Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#} # Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#下面内容代表执行piplines动作顺序,数字越小,越先执行。
ITEM_PIPELINES = {
    #'huaidan.pipelines.HuaidanPipeline': 300,
    #'scrapy.pipelines.files.FilesPipeline':2,
    #'huaidan.pipelines.myarticlepipline':1,
    #'huaidan.pipelines.MysqlPiplines':2,  # synchronous insert into MySQL
    'huaidan.pipelines.MysqlTwisted': 1,  # asynchronous insert into MySQL
}

# Directory containing this settings file; downloaded files go below it.
project_dir = os.path.abspath(os.path.dirname(__file__))
# Scrapy's setting is spelled FILES_URLS_FIELD; the previous
# "FILES_URLS=FIELD" spelling only created two unrelated variables.
FILES_URLS_FIELD = ""
FILES_STORE = os.path.join(project_dir, 'files')

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Database connection details are kept in the settings so pipelines can read
# them. Each value can be overridden by an environment variable of the same
# name; the literals are the defaults.
MYSQL_HOST = os.environ.get("MYSQL_HOST", "192.168.7.5")
MYSQL_DBNAME = os.environ.get("MYSQL_DBNAME", "huaidan")
MYSQL_USER = os.environ.get("MYSQL_USER", "huaidan")
MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "huaidan123")

  

scrapy爬取《坏蛋是怎样炼成的4》的更多相关文章

  1. 【转载】教你分分钟学会用python爬虫框架Scrapy爬取心目中的女神

    原文:教你分分钟学会用python爬虫框架Scrapy爬取心目中的女神 本博文将带领你从入门到精通爬虫框架Scrapy,最终具备爬取任何网页的数据的能力.本文以校花网为例进行爬取,校花网:http:/ ...

  2. 第三百三十四节,web爬虫讲解2—Scrapy框架爬虫—Scrapy爬取百度新闻,爬取Ajax动态生成的信息

    第三百三十四节,web爬虫讲解2—Scrapy框架爬虫—Scrapy爬取百度新闻,爬取Ajax动态生成的信息 crapy爬取百度新闻,爬取Ajax动态生成的信息,抓取百度新闻首页的新闻rul地址 有多 ...

  3. Scrapy爬取美女图片续集 (原创)

    上一篇咱们讲解了Scrapy的工作机制和如何使用Scrapy爬取美女图片,而今天接着讲解Scrapy爬取美女图片,不过采取了不同的方式和代码实现,对Scrapy的功能进行更深入的运用.(我的新书< ...

  4. python scrapy爬取HBS 汉堡南美航运公司柜号信息

    下面分享个scrapy的例子 利用scrapy爬取HBS 船公司柜号信息 1.前期准备 查询提单号下的柜号有哪些,主要是在下面的网站上,输入提单号,然后点击查询 https://www.hamburg ...

  5. scrapy 爬取纵横网实战

    前言 闲来无事就要练练代码,不知道最近爬取什么网站好,就拿纵横网爬取我最喜欢的雪中悍刀行练手吧 准备 python3 scrapy 项目创建: cmd命令行切换到工作目录创建scrapy项目  两条命 ...

  6. 如何提升scrapy爬取数据的效率

    在配置文件中修改相关参数: 增加并发 默认的scrapy开启的并发线程为32个,可以适当的进行增加,再配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. ...

  7. 提高Scrapy爬取效率

    1.增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. 2.降低 ...

  8. scrapy爬取效率提升配置

    增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. 降低日志级别 ...

  9. 提高scrapy爬取效率配置

    提高scrapy爬取效率配置 #增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发 ...

  10. Scrapy爬取美女图片 (原创)

    有半个月没有更新了,最近确实有点忙.先是华为的比赛,接着实验室又有项目,然后又学习了一些新的知识,所以没有更新文章.为了表达我的歉意,我给大家来一波福利... 今天咱们说的是爬虫框架.之前我使用pyt ...

随机推荐

  1. Sharding Sphere的分库分表

    什么是 ShardingSphere? 1.一套开源的分布式数据库中间件解决方案 2.有三个产品:Sharding-JDBC 和 Sharding-Proxy 3.定位为关系型数据库中间件,合理在分布 ...

  2. Flowable实战(六)集成JPA

      上文提到,Flowable所有的表单数据都保存在一张表(act_hi_varinst)中,随着时间的推移,表中数据越来越多,再加上数据没有结构优化,查询使用效率会越来越低.   在Flowable ...

  3. 学习javaScript必知必会(6)~类、类的定义、prototype 原型、json对象

    一.定义类:使用的是funciton,因为在js中没有定义类的class语句,只有function. ■ 举例: //定义一个Person类(通过类的无参构造函数定义类) function Perso ...

  4. Ajax_同源策略以及跨域问题

    Ajax_同源策略 同源策略是浏览器的一种安全策略, 同源指的是:协议.域名.端口.必须完全相同. 违背同源策略就是跨域. 而AJAX是默认遵循同源策略的: 同源说通俗一点呢就是页面跟获取请求的接口是 ...

  5. JavaScript之 函数节流(Throttling) 与 防抖(Debounce)

    Throttling:在监听鼠标移动事件.盒子滚动事件等使用节流技术能节省性能消耗 /** * 节流函数(每隔t执行一次) */ function Throttling(fn, t) { const ...

  6. PostgreSQL源码编译

    环境:Ubuntu 16.04+PostgresQL13.2 1.指定安装路径 ./configure --prefix=/opt/postgresql 2.编译安装 sudo make sudo m ...

  7. 聊一聊如何用C#轻松完成一个TCC分布式事务

    背景 银行跨行转账业务是一个典型分布式事务场景,假设 A 需要跨行转账给 B,那么就涉及两个银行的数据,无法通过一个数据库的本地事务保证转账的 ACID ,只能够通过分布式事务来解决. 在 聊一聊如何 ...

  8. freeswitch插件式模块接口实现方式

    概述 freeswitch的外围模块是插件式的,可以动态的加载和卸载,使用起来非常的灵活和方便. 如果我们自己来设计一个开源的代码框架,相信这种插件式的模块结构是非常适合多人合作的模式. 本文对fs的 ...

  9. [USACO18DEC]Sort It Out P

    初看本题毫无思路,只能从特殊的 \(K = 1\) 出发. 但是直接考虑构造一组字典序最小的方案还是不好构造,可以考虑先手玩一下样例.通过自己手玩的样例可以发现,貌似没有被选出来的数在原排列中都是递增 ...

  10. java中args是什么意思?

    1. 字符串变量名(args)属于引用变量,名字代号而已,可以自己取的. 2.总的来说就是个存放字符串数组用的, 去掉就不知道 "args" 声明的变量是什么类型了. 3.如果有 public sta ...