go并发版爬虫
并发版爬虫
代码实现
/crawler/main.go
package main
import (
"learn/crawler/engine"
"learn/crawler/scheduler"
"learn/crawler/zhenai/parser"
)
// main starts the concurrent crawler at the zhenai.com city-list page, using
// a queued scheduler and 20 workers.
func main() {
	seed := engine.Request{
		Url:       "http://www.zhenai.com/zhenghun",
		ParseFunc: parser.ParseCityList,
	}

	crawler := engine.ConcurrentEngine{
		Scheduler:   &scheduler.QueuedScheduler{},
		WorkerCount: 20,
	}
	crawler.Run(seed)

	// To test a single city (Shanghai) instead:
	//crawler.Run(engine.Request{
	//	Url:       "http://www.zhenai.com/zhenghun/shanghai",
	//	ParseFunc: parser.ParseCity,
	//})
}
/crawler/engine/simple.go
package engine
import (
"learn/crawler/fetcher"
"log"
)
// SimpleEngine is a field-less engine type; its Run method processes requests
// one at a time in the calling goroutine.
type SimpleEngine struct {
}
// Run crawls sequentially, starting from seeds. Requests are taken from the
// front of a FIFO queue, requests discovered by a parser are appended to the
// back, and every parsed item is logged. A request whose worker call fails is
// skipped. Run returns once the queue is empty.
func (e SimpleEngine) Run(seeds ...Request) {
	queue := append([]Request(nil), seeds...)
	for len(queue) > 0 {
		next := queue[0]
		queue = queue[1:]

		result, err := worker(next)
		if err != nil {
			continue
		}
		queue = append(queue, result.Requests...)
		for i := range result.Items {
			log.Printf("Got item %v", result.Items[i])
		}
	}
}
// worker fetches r.Url and runs r.ParseFunc on the fetched body.
//
// It logs the URL before fetching. If the fetch fails, the error is logged
// and returned together with an empty ParseResult; otherwise the parser's
// result is returned with a nil error.
func worker(r Request) (ParseResult, error) {
	log.Printf("Fetching %s", r.Url)
	body, err := fetcher.Fetch(r.Url)
	if err != nil {
		// The original format string was built from two concatenated
		// literals with no separating space ("errorfetching").
		log.Printf("Fetcher: error fetching url %s: %v", r.Url, err)
		return ParseResult{}, err
	}
	return r.ParseFunc(body), nil
}
/crawler/engine/concurrent.go
package engine
import (
"log"
)
// ConcurrentEngine runs the crawl with WorkerCount worker goroutines that are
// fed through Scheduler.
type ConcurrentEngine struct {
	// Scheduler receives submitted requests and supplies the channels workers read from.
	Scheduler Scheduler
	// WorkerCount is the number of workers Run creates.
	WorkerCount int
}
// Scheduler is the set of operations ConcurrentEngine needs to route requests
// to its workers.
type Scheduler interface {
	ReadyNotifier
	// Submit hands a request to the scheduler.
	Submit(Request)
	// WorkerChan returns the channel a newly created worker will receive requests on.
	WorkerChan() chan Request
	// Run starts the scheduler; ConcurrentEngine.Run calls it before creating any worker.
	Run()
}
// ReadyNotifier is told, via WorkerReady, each time a worker is about to wait
// for its next request.
type ReadyNotifier interface {
	// WorkerReady is passed the channel the calling worker will receive its next request on.
	WorkerReady(chan Request)
}
// Run starts the scheduler, creates e.WorkerCount workers, submits the seed
// requests, and then loops forever: each parse result received from a worker
// has its items logged (numbered from 0) and its follow-up requests submitted
// back to the scheduler. Run never returns.
func (e *ConcurrentEngine) Run(seeds ...Request) {
	results := make(chan ParseResult)
	e.Scheduler.Run()

	for n := e.WorkerCount; n > 0; n-- {
		createWork(e.Scheduler.WorkerChan(), results, e.Scheduler)
	}
	for i := range seeds {
		e.Scheduler.Submit(seeds[i])
	}

	seen := 0
	for {
		parsed := <-results
		for _, item := range parsed.Items {
			log.Printf("Got item #%d: %v", seen, item)
			seen++
		}
		for _, next := range parsed.Requests {
			e.Scheduler.Submit(next)
		}
	}
}
// createWork starts one worker goroutine. On every iteration the goroutine
// announces itself through ready.WorkerReady(in), receives a request from in,
// processes it with worker, and sends the result on out. A request whose
// processing fails produces no output. The goroutine never exits.
func createWork(in chan Request, out chan ParseResult, ready ReadyNotifier) {
	loop := func() {
		for {
			ready.WorkerReady(in)
			req := <-in
			if parsed, err := worker(req); err == nil {
				out <- parsed
			}
		}
	}
	go loop()
}
/crawler/engine/typers.go
package engine
// Request is one unit of crawl work: a URL plus the function used to parse
// the body fetched from it.
type Request struct {
	// Url is the address to fetch.
	Url string
	// ParseFunc turns the fetched body into a ParseResult.
	ParseFunc func([]byte) ParseResult
}
// ParseResult is the value a Request's ParseFunc returns.
type ParseResult struct {
	// Requests are follow-up requests; the engines queue them for fetching.
	Requests []Request
	// Items are the extracted values; the engines log each one.
	Items []interface{}
}
// NilParser ignores its input and returns an empty ParseResult.
func NilParser([]byte) ParseResult {
	var empty ParseResult
	return empty
}
/crawler/fetcher/fetcher.go
package fetcher
import (
"bufio"
"fmt"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"io/ioutil"
"log"
"net/http"
"time"
)
// rateLimiter ticks every 100ms. Fetch waits for one tick per call, which
// caps the combined request rate of all callers.
var rateLimiter = time.Tick(100 * time.Millisecond)

// httpClient is shared by every Fetch call so connections can be reused. The
// timeout ensures an unresponsive server cannot block a caller indefinitely.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// Fetch downloads url with a GET request and returns the response body,
// decoded using the encoding detected from the start of the body.
//
// Each call first waits for the shared rate limiter. An error is returned if
// the request cannot be built or sent, if the response status is not 200 OK,
// or if reading the body fails.
func Fetch(url string) ([]byte, error) {
	<-rateLimiter

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	// Send a browser User-Agent with every request.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("Wrong status code: %d", resp.StatusCode)
	}

	// Buffer the body so determineEncoding can peek at its beginning without
	// consuming it.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}
// determineEncoding guesses the character encoding of the data behind r by
// peeking at up to its first 1024 bytes; nothing is consumed from r.
//
// Peek reports an error whenever fewer than 1024 bytes are available, which
// is the normal case for a short body. The bytes it did return are still
// used for detection. Only when no bytes at all could be peeked is the error
// logged and UTF-8 returned as the fallback.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
	head, err := r.Peek(1024)
	if err != nil && len(head) == 0 {
		log.Printf("Fetcher error: %v", err)
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}
/crawler/zhenai/parser/citylist.go
package parser
import (
"learn/crawler/engine"
"regexp"
)
// cityListRe matches one city link on the city-list page: group 1 is the
// city URL, group 2 is the link text.
const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)" [^>]*>([^<]+)</a>`

// cityListRegexp is cityListRe compiled once at package initialization, so
// ParseCityList does not recompile it on every call.
var cityListRegexp = regexp.MustCompile(cityListRe)

// ParseCityList extracts every city link from contents. For each match it
// adds an item "City: <link text>" and a request for the city URL whose body
// will be parsed by ParseCity.
func ParseCityList(contents []byte) engine.ParseResult {
	matches := cityListRegexp.FindAllSubmatch(contents, -1)
	result := engine.ParseResult{}
	for _, m := range matches {
		result.Items = append(result.Items, "City: "+string(m[2]))
		result.Requests = append(result.Requests, engine.Request{
			Url:       string(m[1]),
			ParseFunc: ParseCity,
		})
	}
	return result
}
/crawler/zhenai/parser/city.go
package parser
import (
"learn/crawler/engine"
"regexp"
)
var (
	// profileRe matches a profile link: group 1 is the album.zhenai.com profile URL, group 2 is the link text.
	profileRe = regexp.MustCompile(`<a href="(http://album.zhenai.com/u/[0-9]+)" [^>]*>([^<]+)</a>`)
	// cityUrlRe matches any href pointing under www.zhenai.com/zhenghun/; group 1 is the URL.
	cityUrlRe = regexp.MustCompile(`href="(http://www.zhenai.com/zhenghun/[^"]+)"`)
)
// ParseCity parses a city page. Every profile link yields an item
// "User <link text>" and a request for the profile URL, parsed by
// ParseProfile with "name:<link text>". Every further link under
// www.zhenai.com/zhenghun/ yields a request parsed by ParseCity itself.
func ParseCity(contents []byte) engine.ParseResult {
	var result engine.ParseResult

	for _, sub := range profileRe.FindAllSubmatch(contents, -1) {
		profileURL, name := string(sub[1]), string(sub[2])
		result.Items = append(result.Items, "User "+name)
		result.Requests = append(result.Requests, engine.Request{
			Url: profileURL,
			// The closure carries this profile's link text into ParseProfile.
			ParseFunc: func(c []byte) engine.ParseResult {
				return ParseProfile(c, "name:"+name)
			},
		})
	}

	for _, sub := range cityUrlRe.FindAllSubmatch(contents, -1) {
		result.Requests = append(result.Requests, engine.Request{
			Url:       string(sub[1]),
			ParseFunc: ParseCity,
		})
	}
	return result
}
/crawler/zhenai/parser/profile.go
package parser
import (
"learn/crawler/engine"
"learn/crawler/model"
"regexp"
)
// all matches one attribute tag on a profile page; group 1 is the tag text.
const all = `<div class="m-btn purple" data-v-8b1eac0c>([^<]+)</div>`

// profileFieldRegexp is the pattern all compiled once at package
// initialization, so ParseProfile does not recompile it on every call.
var profileFieldRegexp = regexp.MustCompile(all)

// ParseProfile builds a model.Profile from a profile page. The profile's User
// list starts with name, followed by the text of every attribute tag found in
// contents. The result carries that profile as its only item and no follow-up
// requests.
func ParseProfile(contents []byte, name string) engine.ParseResult {
	profile := model.Profile{}
	profile.User = append(profile.User, name)
	// Ranging over a nil match slice is a no-op, so no nil check is needed.
	for _, m := range profileFieldRegexp.FindAllSubmatch(contents, -1) {
		profile.User = append(profile.User, string(m[1]))
	}
	return engine.ParseResult{
		Items: []interface{}{profile},
	}
}
/crawler/model/profile.go
package model
// Profile holds the strings collected for a single user.
type Profile struct {
	// User is filled by ParseProfile: the supplied name first, then each captured page field.
	User []string
}
/crawler/scheduler/queued.go
package scheduler
import "learn/crawler/engine"
// QueuedScheduler queues submitted requests and ready worker channels and
// pairs them up in arrival order. Both channels are created by Run.
type QueuedScheduler struct {
	// requestChan carries requests from Submit to the goroutine started by Run.
	requestChan chan engine.Request
	// workChan carries ready workers' channels from WorkerReady to the goroutine started by Run.
	workChan chan chan engine.Request
}
// WorkerChan returns a new unbuffered request channel on every call, so each
// worker gets a channel of its own.
func (s *QueuedScheduler) WorkerChan() chan engine.Request {
	own := make(chan engine.Request)
	return own
}
// Submit sends r on the scheduler's request channel, blocking until the send
// completes.
func (s *QueuedScheduler) Submit(r engine.Request) {
	incoming := s.requestChan
	incoming <- r
}
// WorkerReady sends the worker's channel w on the scheduler's worker channel,
// blocking until the send completes.
func (s *QueuedScheduler) WorkerReady(w chan engine.Request) {
	ready := s.workChan
	ready <- w
}
// Run creates the scheduler's two channels and starts the dispatch goroutine.
// The goroutine keeps a FIFO queue of pending requests and a FIFO queue of
// ready worker channels. Whenever both are non-empty it offers the oldest
// request to the oldest worker, while continuing to accept new requests and
// new ready workers. The goroutine never exits.
func (s *QueuedScheduler) Run() {
	s.workChan = make(chan chan engine.Request)
	s.requestChan = make(chan engine.Request)

	dispatch := func() {
		var (
			pending []engine.Request
			idle    []chan engine.Request
		)
		for {
			// nextWorker stays nil unless both queues have an entry. A send
			// on a nil channel never proceeds, so the third select case is
			// disabled until there is a request/worker pair to match.
			var (
				nextRequest engine.Request
				nextWorker  chan engine.Request
			)
			if len(pending) > 0 && len(idle) > 0 {
				nextRequest, nextWorker = pending[0], idle[0]
			}

			select {
			case req := <-s.requestChan:
				pending = append(pending, req)
			case ready := <-s.workChan:
				idle = append(idle, ready)
			case nextWorker <- nextRequest:
				// Delivered: drop the matched pair from both queues.
				pending, idle = pending[1:], idle[1:]
			}
		}
	}
	go dispatch()
}
/crawler/scheduler/simple.go
package scheduler
import "learn/crawler/engine"
// SimpleScheduler funnels every submitted request into a single channel that
// all workers share.
type SimpleScheduler struct {
	// workerChan is the shared request channel; it is created by Run.
	workerChan chan engine.Request
}
// WorkerChan returns the scheduler's single shared request channel.
func (s *SimpleScheduler) WorkerChan() chan engine.Request {
	shared := s.workerChan
	return shared
}
// WorkerReady does nothing; this scheduler keeps no record of ready workers.
func (s *SimpleScheduler) WorkerReady(chan engine.Request) {
}
// Run creates the unbuffered request channel shared by all workers.
func (s *SimpleScheduler) Run() {
	shared := make(chan engine.Request)
	s.workerChan = shared
}
// Submit sends r on the shared channel from a new goroutine, so the caller
// never blocks waiting for a worker to receive it.
func (s *SimpleScheduler) Submit(r engine.Request) {
	send := func() {
		s.workerChan <- r
	}
	go send()
}
完整项目
https://gitee.com/FenYiYuan/golang-cpdcrawler.git
go并发版爬虫的更多相关文章
- Go语言之进阶篇爬百度贴吧并发版
1.爬百度贴吧并发版 示例: package main import ( "fmt" "net/http" "os" "strco ...
- Go HelloWorld 网络版和并发版
网络版 package main import ( "net/http" "fmt" ) func main() { http.HandleFunc(" ...
- go-爬虫-百度贴吧(并发版)
爬取百度贴吧的网页 非并发版 package main import ( "fmt" "io" "net/http" "os" ...
- go单任务版爬虫
go单任务版爬虫(爬取珍爱网) 爬虫总体算法 单任务版爬虫架构 任务 获取并打印所在城市第一页用户的详细信息 代码实现 /crawler/main.go package main import ( & ...
- 区划代码 node 版爬虫尝试
前言 对于区划代码数据,很多人都不会陌生,大多公司数据库都会维护一份区划代码,包含省市区等数据.区划信息跟用户信息息息相关,往往由于历史原因很多数据都是比较老的数据,且不会轻易更改.网上也有很多人提供 ...
- python链家网高并发异步爬虫asyncio+aiohttp+aiomysql异步存入数据
python链家网二手房异步IO爬虫,使用asyncio.aiohttp和aiomysql 很多小伙伴初学python时都会学习到爬虫,刚入门时会使用requests.urllib这些同步的库进行单线 ...
- 最新IP地址数据库Dat格式-高性能高并发版(2019年3月)
最新IP地址数据库->Dat 二进制文件 高性能高并发-qqzeng-ip.dat 格式 全球IP数据库-20190301-Dat 版 国内IP数据库-20190 ...
- python链家网高并发异步爬虫and异步存入数据
python链家网二手房异步IO爬虫,使用asyncio.aiohttp和aiomysql 很多小伙伴初学python时都会学习到爬虫,刚入门时会使用requests.urllib这些同步的库进行单线 ...
- python学习_新闻联播文字版爬虫(V 1.0版)
python3的爬虫练习,爬取的是新闻联播文字版网站 #!/usr/bin/env python # -*- coding: utf-8 -*- ''' __author__ = 'wyf349' _ ...
随机推荐
- Day6-Python3基础-面向对象编程
面向过程 VS 面向对象 编程范式 编程是 程序 员 用特定的语法+数据结构+算法组成的代码来告诉计算机如何执行任务的过程 , 一个程序是程序员为了得到一个任务结果而编写的一组指令的集合,正所谓条条大 ...
- Web前端-HTML、CSS、JS
概述 HTML是英文Hyper Text Mark-up Language(超文本标记语言)的缩写,它是一种制作万维网页面标准语言(标记).相当于定义统一的一套规则,大家都来遵守他, 这样就可以让浏览 ...
- excle 写入数据库
龙龙博客:https://www.cnblogs.com/meilong/p/cao-zuoexcel-mo-kuaiopenpyxl.html 1 安装 pip install openpyxl 如 ...
- CTF--HTTP服务--PUT上传漏洞
开门见山 1. 扫描靶机ip,发现PCS 192.168.31.48 2. 用nmap扫描靶机的开放服务和版本信息 3. 再扫描全部信息 4. 用nikto探测靶机http服务敏感信息 5. 再用di ...
- HTML简介介绍
网页概述 网页:纯文本格式的文件:(以纯文本格式编写,后缀名改为HTML的文本文件) ---- 网站:多个网页的集合: ---- 主页:打开网站后显示的第一个页面: ---- 浏览器:将纯文本格式的文 ...
- Arduino系列之智能家居蓝牙语音遥控灯(四)
用到的材料 Arduino uno hc-05 蓝牙模块 安卓手机 安卓APP AMR—voice 通过安卓手机连接Arduino的蓝牙模块Hc-05,通过语音识别软件AMR-voice识别语音, ...
- 西门子PLC在自动浇灌系统中的应用
西门子PLC在自动浇灌系统中的应用(鸿控整理) 2020-02-07 22:50:48 1 自动浇灌系统简介 系统采用自行研制的湿度传感器监测土壤的湿度情况,当土壤湿度低于所要求的值后,自动开启水泵电 ...
- 分享数百个 HT 工业互联网 2D 3D 可视化应用案例之 2019 篇
继《分享数百个 HT 工业互联网 2D 3D 可视化应用案例》2018 篇,图扑软件定义 2018 为国内工业互联网可视化的元年后,2019 年里我们与各行业客户进行了更深度合作,拓展了H ...
- tomcat-windows10环境搭建
1.进入Tomcat官网Apache Tomcat® - Welcome! 2.根据操作系统选择合适的版本下载 zip用于windows操作系统, tar.gz用于unix和linux操作系统 Bin ...
- POP and IMAP - Post Office Protocol and Internet Message Access Protocol
POP and IMAP - Post Office Protocol and Internet Message Access Protocol 用来从 SMTP Server 上下载邮件的协议. P ...