Colly provides a clean interface to write any kind of crawler/scraper/spider

Scraping Framework for Golang http://go-colly.org/

https://github.com/gocolly/colly

package main

import (
   "fmt"

   "github.com/gocolly/colly"
   "time"
   "regexp"
   "strings"
)

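/*
Task

Sites and sections to crawl:
http://www.cnhan.com/hyzx/
http://www.cnhan.com/shantui/
http://www.cnhan.com/pinfo/

http://www.heze.cn/info
http://www.heze.cn/qiye/

Goal: collect the customer contact details from the entries that each site
updated today.

*/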
// URL filters and matchers used by the crawls below. They are compiled once at
// package scope instead of on every link callback.
//
// Each page filter accepts the section's landing page plus its pagination
// pages. The optional group wraps the whole page name so that both ends of the
// pattern stay anchored; the previous "prefix(.{0}$)|(suffix$)" form left the
// second alternative unanchored and therefore matched any URL on the allowed
// domain that merely ended with the page name.
var (
	// http://www.cnhan.com/hyzx/ and http://www.cnhan.com/hyzx/index-all-N.html.
	// Hard-coded limit: at most 99 pages are expected to be updated per day.
	cnhanHyzxPages = regexp.MustCompile(`^http://www\.cnhan\.com/hyzx/(index-all-[1-9][0-9]?\.html)?$`)

	// http://www.cnhan.com/pinfo/ and http://www.cnhan.com/pinfo/index-N.html
	// (same 99-page limit).
	cnhanPinfoPages = regexp.MustCompile(`^http://www\.cnhan\.com/pinfo/(index-[1-9][0-9]?\.html)?$`)

	// http://www.heze.cn/qiye/ and its list pages. Both "list-8.html" (the
	// example given for this section) and "list-8-2.html" are accepted.
	hezeQiyePages = regexp.MustCompile(`^http://www\.heze\.cn/qiye/(list-\d+(-\d+)?\.html)?$`)

	// Detail pages such as http://www.heze.cn/qiye/hongfei688/show-44-14825619.html.
	hezeQiyeDetail = regexp.MustCompile(`^http://www\.heze\.cn/qiye/[0-9a-zA-Z]+/show-\d+-\d+\.html$`)
)

// getTodayUrls crawls the configured sections of www.cnhan.com and www.heze.cn
// and returns the detail-page URLs it selected, in crawl order:
//
//  1. cnhan /shantui/: every link inside ".showSort" on the landing page.
//  2. cnhan /hyzx/:    links whose href contains today's date (YYYYMMDD).
//  3. cnhan /pinfo/:   links whose ".span2" text contains today's date (YYYY.MM.DD).
//  4. heze  /info/:    every link inside ".news_list_r" on the landing page.
//  5. heze  /qiye/:    links that look like company "show" detail pages.
//
// All returned URLs are absolute (resolved against the page they were found
// on). Progress is printed to standard output. The result is nil when nothing
// was found.
func getTodayUrls() []string {
	// Take the date once so that every link of a run is compared against the
	// same day, even if the crawl crosses midnight.
	now := time.Now()
	dateCompact := now.Format("20060102")  // as in /hyzx/20180827/7109076.html
	dateDotted := now.Format("2006.01.02") // as in the "2018.08.27" list text

	var todayUrls []string

	todayUrls = append(todayUrls,
		crawlListing("www.cnhan.com", ".showSort a[href]", "http://www.cnhan.com/shantui/")...)

	todayUrls = append(todayUrls,
		crawlPaged("www.cnhan.com", "http://www.cnhan.com/hyzx/", cnhanHyzxPages,
			func(e *colly.HTMLElement, absURL string) bool {
				// Today's entries are recognised by the date in their URL.
				return strings.Contains(e.Attr("href"), dateCompact)
			})...)

	todayUrls = append(todayUrls,
		crawlPaged("www.cnhan.com", "http://www.cnhan.com/pinfo/", cnhanPinfoPages,
			func(e *colly.HTMLElement, absURL string) bool {
				// Text filter: the date is looked up in ".span2" elements
				// below the anchor.
				// NOTE(review): confirm the date element really is a
				// descendant of the <a>; otherwise this never matches.
				return strings.Contains(e.ChildText(".span2"), dateDotted)
			})...)

	todayUrls = append(todayUrls,
		crawlListing("www.heze.cn", ".news_list_r a[href]", "http://www.heze.cn/info/")...)

	// Design note for www.heze.cn (/info/ and /qiye/): detail pages can be
	// reached either per parent section, e.g.
	//   http://www.heze.cn/qiye/18240670888/show-37-1367148.html
	//   http://www.heze.cn/info/LEbigong/show-1-13931879.html
	// or by crawling the whole site. The whole-site approach keeps the filter
	// rules and the code simple, but it is slow and makes the results hard to
	// classify (product type, publish time). The per-section approach has the
	// opposite trade-offs and is the one used here.
	//
	// The /qiye/ landing page and its main sub-pages change their content on
	// every refresh.
	todayUrls = append(todayUrls,
		crawlPaged("www.heze.cn", "http://www.heze.cn/qiye/", hezeQiyePages,
			func(e *colly.HTMLElement, absURL string) bool {
				// Match the resolved URL so relative hrefs are recognised too.
				return hezeQiyeDetail.MatchString(absURL)
			})...)

	return todayUrls
}

// crawlListing visits startURL only and returns the absolute URL of every
// element matched by selector (which is expected to select a[href] elements).
func crawlListing(domain, selector, startURL string) []string {
	var found []string

	c := colly.NewCollector(
		colly.AllowedDomains(domain),
	)
	c.OnHTML(selector, func(e *colly.HTMLElement) {
		href := e.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", e.Text, href)
		// AbsoluteURL returns "" for links it cannot resolve; skip those.
		if absURL := e.Request.AbsoluteURL(href); absURL != "" {
			found = append(found, absURL)
		}
	})
	logRequests(c)
	visitStart(c, startURL)

	return found
}

// crawlPaged starts at startURL, follows every link that passes the pages
// filter (the landing page and its pagination pages), and returns the absolute
// URL of each link on those pages for which keep reports true. keep receives
// the anchor element and the link's resolved absolute URL.
func crawlPaged(domain, startURL string, pages *regexp.Regexp, keep func(e *colly.HTMLElement, absURL string) bool) []string {
	var found []string

	c := colly.NewCollector(
		colly.AllowedDomains(domain),
		// A request is only made when its URL matches one of the filters.
		colly.URLFilters(pages),
	)
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", e.Text, href)

		absURL := e.Request.AbsoluteURL(href)
		if absURL == "" {
			return
		}

		// Try to follow the link. The collector rejects already-visited URLs
		// and URLs outside the domain/filter with an error; that is the
		// expected outcome for most links, so the error is deliberately
		// ignored here.
		_ = c.Visit(absURL)

		if keep(e, absURL) {
			fmt.Printf("Link found: %q -> %s\n", e.Text, absURL)
			found = append(found, absURL)
		}
	})
	logRequests(c)
	visitStart(c, startURL)

	return found
}

// logRequests prints "Visiting <url>" before every request made by c.
func logRequests(c *colly.Collector) {
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})
}

// visitStart begins a crawl at startURL and reports a failure to start it
// instead of silently dropping the error.
func visitStart(c *colly.Collector, startURL string) {
	if err := c.Visit(startURL); err != nil {
		fmt.Printf("Visiting %s failed: %v\n", startURL, err)
	}
}

// main runs the crawl and prints the collected URLs followed by how many
// were found.
func main() {
	urls := getTodayUrls()
	total := len(urls)

	fmt.Println(urls)
	fmt.Println(total)
}

Colly provides a clean interface to write any kind of crawler/scraper/spider的更多相关文章

使用composer安装php的相关框架
使用composer来安装php的相关框架,不需要事先准备composer.json以及composer.lock以及composer.phar等文件: 直接在项目根目录下使用composer r ...
Your First ASP.NET 5 Application on a Mac
Your First ASP.NET 5 Application on a Mac By Daniel Roth, Steve Smith, Rick Anderson ASP.NET 5 is cr ...
Scott Hanselman's 2014 Ultimate Developer and Power Users Tool List for Windows -摘自网络
Everyone collects utilities, and most folks have a list of a few that they feel are indispensable. ...
【翻译】在Mac上使用VSCode创建你的第一个Asp.Net Core应用
Setting Up Your Development Environment 设置你的开发环境 To setup your development machine download and inst ...
开源蜘蛛集合（转自haizhiguang博客，链接：http://blog.csdn.net/haizhiguang/article/details/20209573）
各种蜘蛛: Heritrix 点击次数:1458 Heritrix是一个开源,可扩展的web爬虫项目.Heritrix设计成严格按照robots.txt文件的排除指示和META robots标签. ...
Less is exponentially more
Less is exponentially more (原文出处:rob pike 博客,https://commandcenter.blogspot.jp/2012/06/less-is-expo ...
[转] h5上传视频或文件编写
Html5 finally solves an age old problem of being able to upload files while also showing the upload ...
getting-started-with-mqtt
来自:https://dzone.com/refcardz/getting-started-with-mqtt SECTION 1 Why MQTT? The Internet of Things ( ...
csredis base usage
Basic usage Whenever possible, server responses are mapped to the appropriate CLR type. using (var r ...

随机推荐

VS2010 + winxp 无法定位程序输入点GetTickCount64 在动态链接库kernel32.dll上错误
winxp系统,使用VS2010, 在使用boost中的thread中的sleep的时候出现 “无法定位程序输入点GetTickCount64 在动态链接库kernel32.dll上”的错误, 在网上 ...
hdu 5455(字符串处理)
Fang Fang Time Limit: 1500/1000 MS (Java/Others) Memory Limit: 65535/32768 K (Java/Others)Total S ...
Java 实现随机验证码
许多系统的注册.登录或者发布信息模块都添加的随机码功能,就是为了避免自动注册程序或者自动发布程序的使用. 验证码实际上就是随机选择一些字符以图片的形式展现在页面上,如果进行提交操作的同时需要将图片上的 ...
Java实验--关于课上找“水王”问题分析
问题的表述就是说有那么一个人,他在一个论坛上发帖,然后每贴必回,自己也发帖.那么这个人在发帖的数目上就超过了整个论坛的帖子数目的一半以上. 我对这个问题一开始的思路是,用SQL语句获取整个列表中的数据 ...
chrome mac 快捷键
⌘-N 打开新窗口. ⌘-T 打开新标签页. ⌘-Shift-N 在隐身模式下打开新窗口. 按 ⌘-O,然后选择文件. 在 Google Chrome 浏览器中打开计算机中的文件. 按住 ⌘ 键,然后 ...
JDK1.8中的Lambda表达式和Stream
1.lambda表达式 Java8最值得学习的特性就是Lambda表达式和Stream API,如果有python或者javascript的语言基础,对理解Lambda表达式有很大帮助,因为Java正 ...
Java线程池的内部实现
一.线程池介绍线程是稀缺资源,如果无限制的创建,不仅会消耗系统资源,还会降低系统的稳定性,合理的使用线程池可以对线程进行统一的分配.调优和监控,并有以下好处: (1)降低资源消耗. (2)提高响应速 ...
第四期coding_group笔记_用CRF实现分词-词性标注
一.背景知识 1.1 什么是分词? NLP的基础任务分为三个部分,词法分析.句法分析和语义分析,其中词法分析中有一种方法叫Tokenization,对汉字以字为单位进行处理叫做分词. Example ...
Objective-C 协议（protocol）二
我们前面提到了OOP的继承,我们说Objective-C不像C++可以有多重继承,Objective-C是单一继承的,如果想要做到一个类别同时拥有多种型别的能力,我们就可以通过协议来实现.Object ...
android_浅析canvas的save()和restore()方法
<span style="font-size:18px;"> </span> <span style="font-size:18px;&qu ...

Colly provides a clean interface to write any kind of crawler/scraper/spider

Colly provides a clean interface to write any kind of crawler/scraper/spider的更多相关文章

随机推荐

热门专题