node爬虫进阶版

手写了一个方便爬虫的小库：

const url = require('url')

const glib = require('zlib')

// Default request headers, merged underneath any caller-supplied headers.
// Only `gzip` is advertised because gzip is the only content-encoding the
// response handler in this module decodes; advertising `deflate` or `br`
// lets the server pick an encoding whose bytes would be returned undecoded.
const _default_headers = {
    'Accept-Encoding': 'gzip',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}

//options(url,method,header)--http头部信息 isDebug--是否开启调试状态

module.exports = function(options, isDebug) {

    if(typeof options === "string") {

        options = {

            url: options,

            method: 'GET',

            headers: {}

        }

    } else {

        options = options || {}

        options.method = options.method || 'GET'

        options.headers = options.headers || {}

    }

    options.headers = Object.assign(_default_headers, options.headers)

    function debug(msg) {

        if(isDebug) {

            console.log(msg)

        }

    }

    return new Promise((resolve, reject) => {

        req(options)

        function req(options) {

            //判断是http还是https

            let urlObj = url.parse(options.url)

            let mod = null

            port = 0

            if(urlObj.protocol == 'https:') {

                mod = require('https')

                port = 443

            } else {

                mod = require('http')

                port = 80

            }

            let _req_options = {

                hostname: urlObj.hostname,

                port,

                path: urlObj.path,

                method: options.method,

                headers: options.headers

            }

            //开始模拟，爬取信息

            let req_obj = mod.request(_req_options, (res) => {

                if(res.statusCode!==200) {

                    //如果是重定向则重新在请求

                    if(res.statusCode == 301 || res.statusCode === 302) {

                        options.url = res.headers.location

                        debug('重定向: '+res.headers.location)

                        req(options)

                    } else {

                        reject(res.statusCode)

                    }

                } else {

                    //statusCode是200时接受data buffer

                    let data = []

                    res.on('data', buffer => {

                        data.push(buffer)

                    })

                    res.on('end', () =>{

                        let buffer = Buffer.concat(data)

                        //判断是否传输有误

                        if (res.headers['content-length'] != buffer.length) {

                            debug('收到数据有误，正在重新获取')

                            req(options)

                        }

                        //判断是否有用gzip

                        else if (res.headers['content-encoding'] && res.headers['content-encoding'].includes('gzip')) {

                           buffer = glib.gunzip(buffer, (err,data) => {

                               debug('gzip解压完成并成功返回')

                               resolve(data)

                           })

                        } else {

                            debug('成功返回')

                            resolve(buffer)

                        }

                    })

                }

            })

            req_obj.on('error', err => {

                debug('爬虫失败')

                reject(err)

            })

            req_obj.end()

        }

    })

}

将该模块 require 进来，然后传入 url 字符串或 options 对象，就可以得到一个 Promise，爬取成功后它会 resolve 出响应内容（Buffer）。

举个例子：

我要爬个bilibili的视频：

const url = require('url')

const fs = require('fs')

/**
 * Downloads a resource to `${__dirname}/${fileName}`, logging progress per chunk.
 *
 * @param {string|{url: string, method?: string, headers?: Object, timeout?: number}} options
 *        Either the URL itself or a request description (copied, not mutated).
 *        `timeout` is the socket idle timeout in milliseconds, default 2000.
 * @param {Object} [headers] Request headers; when given they replace
 *        `options.headers`, when omitted `options.headers` is used.
 * @param {string} fileName Name of the output file, created next to this script.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 *          Rejects with an Error on network failure, idle timeout, aborted
 *          response, file-system failure, or a non-2xx status (the Error then
 *          carries a `statusCode` property).
 */
function getVideo(options, headers, fileName) {
    if (typeof options === 'string') {
        options = { url: options }
    }
    // Work on a copy so the caller's object is left untouched.
    options = Object.assign({}, options)
    options.method = options.method || 'GET'
    options.timeout = options.timeout || 2000
    // An explicit `headers` argument wins; omitting it no longer wipes options.headers.
    options.headers = headers || options.headers || {}

    return new Promise((resolve, reject) => {
        // Pick the transport module from the URL scheme.
        const urlObj = url.parse(options.url)
        const isHttps = urlObj.protocol === 'https:'
        const mod = isHttps ? require('https') : require('http')

        const _req_options = {
            hostname: urlObj.hostname,
            // Honour an explicit port in the URL; fall back to the scheme default.
            port: urlObj.port || (isHttps ? 443 : 80),
            path: urlObj.path,
            method: options.method,
            headers: options.headers,
            timeout: options.timeout
        }

        const req_obj = mod.request(_req_options, (res) => {
            // Do not save an error page as the video.
            if (res.statusCode < 200 || res.statusCode >= 300) {
                res.resume()
                const statusError = new Error(`unexpected status code ${res.statusCode}`)
                statusError.statusCode = res.statusCode
                reject(statusError)
                return
            }

            const filePath = `${__dirname}/${fileName}`
            // NaN when the server sends no content-length; the percentage is then omitted.
            const total = Number(res.headers['content-length'])
            // Default 'w' flag truncates an existing file, replacing the old exists/unlink step.
            const file = fs.createWriteStream(filePath)
            let size = 0

            const fail = err => {
                file.destroy()
                reject(err)
            }

            res.on('data', buffer => {
                // Count bytes in memory instead of stat-ing the file on every chunk.
                size += buffer.length
                const downloaded = `已下载${(size / 1024 / 1024).toFixed(2)}MB`
                console.log(total > 0 ? `${downloaded},完成${(size / total * 100).toFixed(2)}%` : downloaded)
            })
            // A dropped connection may surface as 'aborted' or 'error' depending on the Node version.
            res.on('aborted', () => fail(new Error('response aborted before completion')))
            res.on('error', fail)
            file.on('error', fail)
            // Resolve only after everything has been flushed to disk.
            file.on('finish', () => {
                resolve()
            })
            res.pipe(file)
        })

        // The `timeout` option only emits an event; the request must be destroyed by hand.
        req_obj.on('timeout', () => {
            req_obj.destroy(new Error(`request timed out after ${options.timeout}ms`))
        })
        // The original called an undefined `debug` helper here, which threw
        // instead of rejecting; the error is now handed straight to the caller.
        req_obj.on('error', err => {
            reject(err)
        })
        req_obj.end()
    })
}

// Output file name (written next to this script)
const fileName = '1.flv'

// Video URL. NOTE(review): the `expires` query parameter suggests this is a
// time-limited signed link — presumably it must be replaced with a fresh one
// before running; confirm.
const videoUrl = 'https://cn-sdyt-cu-v-05.acgvideo.com/upgcxcode/66/83/34548366/34548366-1-64.flv?expires=1545405600&platform=pc&ssig=ElhY4A2e-U4R2m8EI1eiGQ&oi=1928611810&nfa=uTIiNt+AQjcYULykM2EttA==&dynamic=1&hfa=2116953847&hfb=Yjk5ZmZjM2M1YzY4ZjAwYTMzMTIzYmIyNWY4ODJkNWI=&trid=45c5fdc464354b71bf599c224b7df8ea&nfb=maPYqpoel5MI3qOUX6YpRA==&nfc=1';

// Request headers sent with the download
const header = {
    'Origin': 'https://www.bilibili.com',
    'Referer': 'https://www.bilibili.com/video/av21061574',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

// Start the download; report failure instead of leaving an unhandled rejection.
getVideo(videoUrl, header, fileName).then(() => {
    console.log('写入成功');
}).catch(err => {
    console.error('下载失败', err);
})

node爬虫进阶版的更多相关文章

node爬虫（简版）
做node爬虫,首先要想如何去做这个爬虫,先想下思路,我这里要爬取一个页面的数据,要调取网页的数据,转换成页面格式(html+div)格式,然后提取里面独特的属性值,再把你提取的值,传送给你的页面 ...
Nodejs爬虫进阶教程之异步并发控制
Nodejs爬虫进阶教程之异步并发控制之前写了个现在看来很不完美的小爬虫,很多地方没有处理好,比如说在知乎点开一个问题的时候,它的所有回答并不是全部加载好了的,当你拉到回答的尾部时,点击加载更多,回 ...
webpack4打包nodejs项目进阶版——多页应用模板
前段时间我写了个打包nodejs项目的文章,点击前往但是,问题很多.因为之前的项目是个历史遗留项目,重构起来可能会爆炸,当时又比较急所以就写个的适用范围很小的webpack的打包方法. 最近稍微得空 ...
node爬虫的几种简易实现方式
说到爬虫大家可能会觉得很NB的东西,可以爬小电影,羞羞图,没错就是这样的.在node爬虫方面,我也是个新人,这篇文章主要是给大家分享几种实现node 爬虫的方式.第一种方式,采用node,js中的 s ...
高效能团队的Java研发规范(进阶版)
目前大部分团队是使用的阿里巴巴Java开发规范,不过在日常开发中难免遇到覆盖不到的场景,本文在阿里巴巴Java开发规范基础上,补充一些常用的规范,用于提升代码质量及增强代码可读性. 编程规约 1.基础 ...
zip伪加密文件分析（进阶版）
作者近日偶然获得一misc题,本来以为手到擒来,毕竟这是个大家都讨论烂了的题,详情访问链接http://blog.csdn.net/ETF6996/article/details/51946250.既 ...
继续node爬虫 — 百行代码自制自动AC机器人日解千题攻占HDOJ
前言不说话,先猛戳 Ranklist 看我排名. 这是用 node 自动刷题大概半天的 "战绩",本文就来为大家简单讲解下如何用 node 做一个 "自动AC机 ...
Node爬虫
Node爬虫参考 http://www.cnblogs.com/edwardstudy/p/4133421.html 所谓的爬虫就是发送请求,并将响应的数据做一些处理只不过不用浏览器来发送请求需 ...
python--代码统计（进阶版）
在上一篇的随笔中发表了代码统计小程序,但是发表后,我发现,以前写的代码怎么办写了那么多,怎么就从0开始了呢,,,,我还是个孩子啊,不能这么残忍于是,代码统计进阶版:统计当前目录下所有指定文件类型的 ...

随机推荐

Ubuntu16.04LTS +Qt+boost1.66编译错误：consuming_buffers.hpp: parse error in template argument list
升级gcc版本至 6 以上.. 安装gcc-6系列与安装boost (Ubuntu16.04LTS)
【LG3768】简单的数学题
[LG3768]简单的数学题题面求 \[ (\sum_{i=1}^n\sum_{j=1}^nij\text{gcd}(i,j))\text{mod}p \] 其中\(n\leq 10^{10},5 ...
[CF966F]May Holidays[分块+虚树]
题意给定 \(n\) 个点的树,初始所有颜色都是 \(0\) ,每个点有一个阈值 \(t\) ,每次可能会让一个点的颜色异或1,问每次操作之后有多少个点满足子树内的颜色为 \(1\) 的点的个数 \ ...
8、Dockerfile介绍和最佳实践
一.Dockerfile 概念 1.Dockerfile是什么 Docker 镜像是一个特殊的文件系统,除了提供容器运行时所需的程序.库.资源.配置等文件外,还包含了一些为运行时准备的一些配置参数(如 ...
stl源码剖析详细学习笔记空间配置器
//---------------------------15/04/05---------------------------- /* 空间配置器概述: 1:new操作包含两个阶段操作 1>调 ...
冒泡排序算法的C++,Java和Python实现和冒泡排序算法三种语言效率的比较
冒泡排序原理: 这一篇百度经验讲得很好,我不多说了 https://jingyan.baidu.com/article/6525d4b13f920bac7d2e9484.html 他讲的是C语言,没有 ...
tcp ,http .udp
三次握手,四次挥手要知道,记住. 计算机协议常见面试题,学会了,记住.会运用.
Teaching Machines to Understand Us 让机器理解我们之二深度学习的历史
Deep history 深度学习的历史 The roots of deep learning reach back further than LeCun’s time at Bell Labs. H ...
Keras学习笔记。
1. keras.layers.Dense (Fully Connected Neural NetWork),所实现的运算是output = activation(dot(input, kernel) ...
GoldNumber游戏比赛成绩公布
比赛介绍:http://www.cnblogs.com/xinz/p/3347418.html 黄金点游戏: N个同学(N通常大于10),每人写一个0~100之间的有理数 (不包括0或100),交给裁 ...

node爬虫进阶版

node爬虫进阶版的更多相关文章

随机推荐

热门专题