【url ---lib___】笔趣阁(抓取斗罗大陆完整)和(三寸天堂)

 # coding=gbk  # GBK is declared because the script is run in the console ("black screen") window

 # Index URL of a novel on biquge.info. respon() declares it global but does
 # not read it, and main() passes its own literal URL instead.
 url='http://www.biquge.info/10_10218/'

 # Request headers (browser User-Agent only) sent by source() when fetching the index page.
 UA={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}

 # Request headers sent by spider() for each chapter page: the same User-Agent
 # plus Host and Referer values for www.xxbiquge.com.
 UA1={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",

     'Host':'www.xxbiquge.com',

     'Referer':'https://www.xxbiquge.com/2_2278/'}

 import time,lxml,pymysql

 from lxml import etree

 from urllib.request import Request

 from urllib.request import urlopen

 import os,sys,io

 # Re-wrap stdout so that everything print() emits is encoded as GB18030.
 sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') 

 def source(url):
     """Download *url* and return the raw response body as bytes.

     The request carries the module-level ``UA`` headers and uses a
     5-second timeout. Errors raised by ``urlopen`` (HTTP errors, timeouts)
     propagate to the caller.
     """
     # The context manager closes the HTTP response (and its socket) even if
     # read() raises; previously the response object was never closed.
     with urlopen(Request(url, None, UA), timeout=5) as response:
         return response.read()

 def respon(text):
     """Parse the index-page HTML and return the chapter link hrefs.

     text: HTML of the chapter index page (as returned by ``source``).
     Returns the list of ``href`` values of every ``<a>`` found under
     ``<dl>/<dd>`` inside the element whose id is ``list``.
     """
     return etree.HTML(text).xpath("//*[@id='list']/dl/dd/a/@href")

 def spider(url):
     """Fetch every chapter page listed in *url* and store it via ``mysqlw``.

     url: iterable of site-relative chapter hrefs (the list returned by
          ``respon``); each one is appended to ``https://www.xxbiquge.com``.

     For each page, the text nodes of the element with id ``content`` are
     joined into the chapter body, and the part of ``<title>`` before the
     first ``-`` is used as the chapter name. The function sleeps 3 seconds
     after each page. Network errors propagate to the caller.
     """
     for href in url:
         page_url = 'https://www.xxbiquge.com' + href
         # Close the response as soon as the body has been read.
         with urlopen(Request(page_url, None, UA1), timeout=5) as response:
             html = response.read()
         seletor = etree.HTML(html)
         # join() instead of repeated "+" concatenation of the text nodes.
         body = ''.join(seletor.xpath('//*[@id="content"]/text()'))
         titles = seletor.xpath('//html/head/title/text()')
         # A page without a <title> used to raise IndexError and abort the
         # whole crawl; fall back to an empty chapter name instead.
         chapter = titles[0].split('-')[0] if titles else ''
         mysqlw(body, page_url, chapter)
         time.sleep(3)

 #c=os.path.join(os.path.abspath(os.path.dirname(__name__)),'2.html')

 #with open(c,'r') as f:

 #   a=f.read()

 def mysqlw(text,url,chapter):
     """Insert one chapter row into the ``douludalu`` table.

     text:    chapter body, stored in column ``souce``.
     url:     chapter page URL, stored in column ``html``.
     chapter: chapter title, stored in column ``chapter``.

     Database errors are printed and the transaction is rolled back; they
     are not re-raised, so the crawl continues with the next chapter.
     """
     started = time.time()
     # host is passed by keyword: PyMySQL 1.0+ accepts keyword arguments only.
     conn = pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
     print(url,chapter,'w')
     # Placeholders are filled in by the driver, which escapes the values.
     # Interpolating the scraped text into the statement broke the insert
     # whenever a chapter contained a quote character (and allowed injection).
     sql = "insert into douludalu(souce,html,chapter) values(%s,%s,%s)"
     # Prints the statement template; the values are bound separately below.
     print(sql)
     try:
         with conn.cursor() as cur:
             cur.execute(sql, (text, url, chapter))
         conn.commit()
         print("插入成功")
     except Exception as e:
         # Deliberately best-effort: report the failure and keep crawling.
         print(e)
         conn.rollback()
     finally:
         # Close the connection even if commit/rollback itself fails.
         conn.close()
     print("关闭",'耗时',time.time()-started)

 def mysqlr(text):
     """Print columns 0 and 3 of every ``douludalu`` row whose ``html`` equals *text*.

     text: chapter page URL to look up.

     The original statement was built with ``% (',text,')``, which is a plain
     string literal rather than the argument, so the formatting raised
     TypeError and *text* was never used. The value is now bound as a query
     parameter.
     """
     conn = pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
     sql = 'select * from douludalu where html=%s'
     try:
         with conn.cursor() as cur:
             cur.execute(sql, (text,))
             print(sql)
             # Separate row variable: the old loop rebound the connection name.
             for row in cur.fetchall():
                 print(row[0], row[3])
     finally:
         conn.close()

 #a='2唐三已经挥出了八千余锤，铁坨不断的变小，已经不到最初时三分'

 #mysqlw(a,'1.html','第一章')

 def main():
     """Crawl the novel: download the index page, extract the chapter links, store every chapter."""
     index_html = source('https://www.xxbiquge.com/2_2278/')
     spider(respon(index_html))

 #mysqlr('https://www.xxbiquge.com/2_2278/1036550.html')

 main()

——————————————————————————————————————————————————————————————————

三寸天堂

 # coding=gbk

 # Index URL of a novel on biquge.info. respon() declares it global but does
 # not read it, and main() passes its own literal URL instead.
 url='http://www.biquge.info/10_10218/'

 # Request headers (browser User-Agent only) sent by source() when fetching the index page.
 UA={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}

 # Request headers sent by spider() for each chapter page: the same User-Agent
 # plus Host, Referer and Accept values for www.biquge.com.tw.
 UA1={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",

     'Host':'www.biquge.com.tw',

     'Referer':'http://www.biquge.com.tw/14_14055/',

     'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}

 import time,lxml,pymysql,threading

 from lxml import etree

 from urllib.request import Request

 from urllib.request import urlopen

 import os,sys,io

 # Re-wrap stdout so that everything print() emits is encoded as GB18030.
 sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') 

 def source(url):
     """Download *url* and return the raw response body as bytes.

     The request carries the module-level ``UA`` headers and uses a
     5-second timeout. Errors raised by ``urlopen`` (HTTP errors, timeouts)
     propagate to the caller.
     """
     # The context manager closes the HTTP response (and its socket) even if
     # read() raises; previously the response object was never closed.
     with urlopen(Request(url, None, UA), timeout=5) as response:
         return response.read()

 def respon(text):
     """Parse the index-page HTML and return the chapter link hrefs.

     text: HTML of the chapter index page (as returned by ``source``).
     Returns the list of ``href`` values of every ``<a>`` found under
     ``<dl>/<dd>`` inside the element whose id is ``list``.
     """
     return etree.HTML(text).xpath("//*[@id='list']/dl/dd/a/@href")

 def spider(url):
     """Fetch one chapter page and store it via ``mysqlw``.

     url: a single chapter href (one element of the list returned by
          ``respon``); it is appended to ``http://www.biquge.com.tw/``.

     The text nodes of the element with id ``content`` are joined into the
     chapter body and the full ``<title>`` text is used as the chapter name.
     The function sleeps 3 seconds before returning. Network errors
     propagate to the caller.
     """
     page_url = 'http://www.biquge.com.tw/' + url
     print(page_url)
     # Close the response as soon as the body has been read.
     with urlopen(Request(page_url, None, UA1), timeout=5) as response:
         html = response.read()
     # read() returns bytes, never None, so the old "is None" test could not
     # fire; and if it had, the code after it would have raised NameError
     # because the body and title were never assigned. Skip empty bodies here.
     if html:
         seletor = etree.HTML(html)
         # join() instead of repeated "+" concatenation of the text nodes.
         body = ''.join(seletor.xpath('//*[@id="content"]/text()'))
         titles = seletor.xpath('//html/head/title/text()')
         # A page without a <title> used to raise IndexError; fall back to
         # an empty chapter name instead.
         chapter = titles[0] if titles else ''
         print(chapter)
         mysqlw(body, page_url, chapter)
     # Pause between requests whether or not the page was stored.
     time.sleep(3)

 def mysqlw(text,url,chapter):
     """Insert one chapter row into the ``suibian`` table.

     text:    chapter body, stored in column ``souce``.
     url:     chapter page URL, stored in column ``html``.
     chapter: chapter title, stored in column ``chapter``.

     Database errors are printed and the transaction is rolled back; they
     are not re-raised, so the crawl continues with the next chapter.
     """
     started = time.time()
     # host is passed by keyword: PyMySQL 1.0+ accepts keyword arguments only.
     conn = pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
     print(url,chapter,'')
     # Placeholders are filled in by the driver, which escapes the values.
     # Interpolating the scraped text into the statement broke the insert
     # whenever a chapter contained a quote character (and allowed injection).
     sql = "insert into suibian(souce,html,chapter) values(%s,%s,%s)"
     try:
         with conn.cursor() as cur:
             cur.execute(sql, (text, url, chapter))
         conn.commit()
         print("插入成功")
     except Exception as e:
         # Deliberately best-effort: report the failure and keep crawling.
         print(e)
         conn.rollback()
     finally:
         # Close the connection even if commit/rollback itself fails.
         conn.close()
     print("关闭",'耗时',time.time()-started)

 def mysqlr(text):
     """Print columns 0 and 3 of every ``douludalu`` row whose ``html`` equals *text*.

     text: chapter page URL to look up.

     Returns False when the last matching row has NULL in column 3, and True
     otherwise (including when no row matches). The original computed this
     flag but discarded it, and raised NameError when no row matched.

     The original statement was built with ``% (',text,')``, which is a plain
     string literal rather than the argument, so the formatting raised
     TypeError and *text* was never used. The value is now bound as a query
     parameter.

     NOTE(review): this script's mysqlw() writes to ``suibian`` while this
     query reads ``douludalu`` - confirm which table is intended.
     """
     ok = True
     conn = pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
     sql = 'select * from douludalu where html=%s'
     try:
         with conn.cursor() as cur:
             cur.execute(sql, (text,))
             print(sql)
             rows = cur.fetchall()
     finally:
         conn.close()
     # Separate row variable: the old loop rebound the connection name.
     for row in rows:
         print(row[0], row[3])
     # Guard against an empty result before looking at the last row.
     if rows and rows[-1][3] is None:
         ok = False
     return ok

 def main():
     """Crawl the novel chapter by chapter and report the total elapsed time."""
     print(threading.current_thread().name)
     started = time.time()
     print('开始时间%s'%started)
     chapter_links = respon(source('http://www.biquge.com.tw/14_14055/'))
     for link in chapter_links:
         spider(link)
     elapsed = time.time() - started
     print('完成耗时%s'%elapsed)

 #c=os.path.join(os.path.abspath(os.path.dirname(__name__)),'1.html')

 #with open(c,'r') as f:

 #   a=f.read()

 main()

特别需要注意的是：通过 Request 传入请求头（UA）时，如果其中的字段设置不当，请求就会出错，这时需要耐心地逐项排查，把问题解决。

容易出现的错误【

　　1，HTTP 请求头中的 Referer、Host 填写错误（与实际抓取的站点不一致）

　　2，网页的 XPath 写错；不过目测此网站的页面结构还是比较规则的

】

【url ---lib___】笔趣阁(抓取斗罗大陆完整)和(三寸天堂)的更多相关文章

Jsoup-基于Java实现网络爬虫-爬取笔趣阁小说
注意!仅供学习交流使用,请勿用在歪门邪道的地方!技术只是工具!关键在于用途! 今天接触了一款有意思的框架,作用是网络爬虫,他可以像操作JS一样对网页内容进行提取初体验Jsoup <!-- Ma ...
scrapy框架爬取笔趣阁
笔趣阁是很好爬的网站了,这里简单爬取了全部小说链接和每本的全部章节链接,还想爬取章节内容在biquge.py里在加一个爬取循环,在pipelines.py添加保存函数即可 1 创建一个scrapy项目 ...
python入门学习之Python爬取最新笔趣阁小说
Python爬取新笔趣阁小说,并保存到TXT文件中我写的这篇文章,是利用Python爬取小说编写的程序,这是我学习Python爬虫当中自己独立写的第一个程序,中途也遇到了一些困难,但是最后 ...
bs4爬取笔趣阁小说
参考链接:https://www.cnblogs.com/wt714/p/11963497.html 模块:requests,bs4,queue,sys,time 步骤:给出URL--> 访问U ...
免app下载笔趣阁小说
第一次更新:发现一个问题,就是有时候网页排版有问题的话容易下载到多余章节,如下图所示: 网站抽风多了一个正文一栏,这样的话就会重复下载1603--1703章节. 解决办法: 于是在写入内容前加了一个章 ...
HttpClients+Jsoup抓取笔趣阁小说，并保存到本地TXT文件
前言首先先介绍一下Jsoup:(摘自官网) jsoup is a Java library for working with real-world HTML. It provides a very ...
Python爬取笔趣阁小说，有趣又实用
上班想摸鱼?为了摸鱼方便,今天自己写了个爬取笔阁小说的程序.好吧,其实就是找个目的学习python,分享一下. 1. 首先导入相关的模块 import os import requests from ...
python应用：爬虫框架Scrapy系统学习第四篇——scrapy爬取笔趣阁小说
使用cmd创建一个scrapy项目: scrapy startproject project_name (project_name 必须以字母开头,只能包含字母.数字以及下划线<undersco ...
scrapycrawl 爬取笔趣阁小说
前言第一次发到博客上..不太会排版见谅最近在看一些爬虫教学的视频,有感而发,大学的时候看盗版小说网站觉得很能赚钱,心想自己也要搞个,正好想爬点小说能不能试试做个网站(网站搭建啥的都不会...) 站 ...

随机推荐

RAID概念记录
之前对RAID概念有一些基本的认知,这次同事培训k8s 的持久卷,提到了RAID的一些概念和用法,记录一下. RAID ( Redundant Array of Independent Disks ) ...
delphi7 treeview + 数据库实现动态节点维护
首先说下树节点对应的表的基本结构,必需要有的字段(节点编号,父节点编号,节点名称),其他字段根据你开发的需要添加从添加节点开始,一开始就取出表中最大节点编号,每次添加节点的时候,该节点编号增加1;添加 ...
跨域form下载方式批量下载
downloadFileForm:function(fid) { var url = "https://file.xxxx.com/fileDownload.do"; var in ...
CSS 自适应技巧
DIV的内容垂直居中不再MARGINT-TOP多少来居中显示 display:table-cell; #block-1{ width:100%; height:80px; display:tabl ...
JQ也要面向对象~在JQ中扩展静态方法和实例方法(jq扩展方法)
JQ也要面向对象,事实上,无论哪种开发语言,在开发功能时,都要把面向对象拿出来,用它的思想去干事,去理解事,面向对象会使问题简单化,清晰化,今天说两个概念“静态方法”与“实现方法”,这个在面向对象的语 ...
申请 Let's Encrypt 通配符 HTTPS 证书
目录一.背景知识 1.1.什么是通配符证书 1.2.什么是 Let's Encrypt 二.证书申请(certbot) 2.1.系统确定 2.2.工具安装 2.3.证书申请 2.4.证书查看 2.5 ...
Flutter 页面下拉刷新和上拉加载
flutter_easyrefresh 正如名字一样,EasyRefresh很容易就能在Flutter应用上实现下拉刷新以及上拉加载操作,它支持几乎所有的Flutter控件.它的功能与Android的 ...
Oracle拼接同一个字段多行的值
本文引用自- https://www.cnblogs.com/qianyuliang/p/6649983.html https://blog.csdn.net/defonds/article/de ...
C#使用MPI进行高性能计算
MPI.NET是用于Microsoft.NET环境的高性能.易于使用的消息传递接口(MPI)实现.mpi是编写在分布式内存系统(如计算集群)上运行的并行程序的事实上的标准,并且得到了广泛的实现.大多数 ...
vue中 :style 与 :class 三元运算符使用
参考链接:https://www.jianshu.com/p/31664974303d

【url ---lib___】笔趣阁(抓取斗罗大陆完整)和(三寸天堂)

【url ---lib___】笔趣阁(抓取斗罗大陆完整)和(三寸天堂)的更多相关文章

随机推荐

热门专题