1、找出url汇总页，过滤出满足条件的详情页url；2、去详情页采集信息

package main

import (
   "fmt"

   "github.com/gocolly/colly"
   "regexp"
   "strings"

   "github.com/mongodb/mongo-go-driver/mongo"
   "github.com/mongodb/mongo-go-driver/bson"

   "context"
   "log"
   "gopkg.in/mgo.v2"
   mgoBson "gopkg.in/mgo.v2/bson"
   "time"
   "math/rand"
)

/*
task
http://www.cnhan.com/hyzx/
http://www.cnhan.com/shantui/
http://www.cnhan.com/pinfo/
http://www.heze.cn/info
http://www.heze.cn/qiye/
http://cn.sonhoo.com/wukong/
采集站点当日更新数据的客户联系方式

20180828
站点日期字符串特征
"cnhan.com/hyzx/":    "20180902",
"cnhan.com/shantui/": "20180902",
"cnhan.com/pinfo/":   "2018.09.02",
"heze.cn/info/":      "2018.09.02",
"heze.cn/qiye/":      "2018.09.02",
"sonhoo.com/wukong/": "2018-09-02",

20180904
http://cn.sonhoo.com/wukong/c16?offset=600&limit=50 先去文章含有文章日期的列表页遍历出符合条件的文章url，
再去文章详情页http://cn.sonhoo.com/wukong/a213383采集客户资料

*/

// PotentialCustomerWebSiteUrl is the decode target used when reading stored
// page documents back from MongoDB; only the "url" field is mapped.
type PotentialCustomerWebSiteUrl struct {
   Url string `bson:"url"` // value of the document's "url" field
}

// Configuration | Colly http://go-colly.org/docs/introduction/configuration/
// letterBytes is the alphabet RandomString draws from (ASCII letters only).
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

// RandomString returns a random string of 10 to 19 ASCII letters, built with
// the package-level math/rand source.
func RandomString() string {
	size := rand.Intn(10) + 10
	out := make([]byte, size)
	for pos := 0; pos < size; pos++ {
		pick := rand.Intn(len(letterBytes))
		out[pos] = letterBytes[pick]
	}
	return string(out)
}

// eggSitePathListTargetDate builds, for every known site path, the date string
// that its listing pages use for TargetDate.
//
// TargetDate is expected in YYYYMMDD form. Each site path maps to a separator:
// an empty separator keeps the date as-is, any other separator is inserted
// between year, month and day (e.g. "." gives "2018.09.04"). The "ONLYtoday"
// entry is treated like any other separator.
// Every key is printed while it is processed and the final map is printed
// before it is returned.
func eggSitePathListTargetDate(TargetDate string) map[string]string {
	dateBySite := map[string]string{
		"cnhan.com/hyzx/":    "",
		"cnhan.com/shantui/": "ONLYtoday",
		"cnhan.com/pinfo/":   ".",
		"heze.cn/info/":      ".",
		"heze.cn/qiye/":      ".",
		"sonhoo.com/wukong/": "-",
	}
	// Overwriting existing keys while ranging over the map is safe in Go.
	for site, sep := range dateBySite {
		fmt.Println(site)
		formatted := TargetDate
		if sep != "" {
			year := TargetDate[0:4]
			month := TargetDate[4:6]
			day := TargetDate[6:8]
			formatted = year + sep + month + sep + day
		}
		dateBySite[site] = formatted
	}
	fmt.Println(dateBySite)
	return dateBySite
}

// Collection run configuration.
// TargetDate is the date (YYYYMMDD) whose pages are collected.
var TargetDate = "20180904"
// TodayDate is the current local date formatted as YYYYMMDD, evaluated once at start-up.
var TodayDate = time.Now().Format("20060102")
// mongoCollectioName is the MongoDB collection used both to read already
// stored URLs and to insert newly scraped pages.
var mongoCollectioName = "todayUrls0904TestData"
// SitePathListTargetDate maps each site path to TargetDate rendered in that
// site's date format; computed at package initialisation.
var SitePathListTargetDate = eggSitePathListTargetDate(TargetDate)

// getTargetDateSpideredUrl returns the "url" field of every document in the
// mongoCollectioName collection whose "spiderDate" equals TargetDate.
//
// It panics when the MongoDB connection cannot be established and terminates
// the process via log.Fatal when the query fails. Every index and URL read is
// printed to stdout.
func getTargetDateSpideredUrl() []string {
	session, err := mgo.Dial("mongodb://hbaseU:123@192.168.3.103:27017/hbase")
	if err != nil {
		panic(err)
	}
	defer session.Close()
	// Optional. Switch the session to a monotonic behavior.
	session.SetMode(mgo.Monotonic, true)

	var storedDocs []PotentialCustomerWebSiteUrl
	query := session.DB("hbase").C(mongoCollectioName).Find(mgoBson.M{"spiderDate": TargetDate})
	if err = query.All(&storedDocs); err != nil {
		log.Fatal(err)
	}

	var storedUrls []string
	for idx := range storedDocs {
		fmt.Println(idx)
		fmt.Println(storedDocs[idx].Url)
		storedUrls = append(storedUrls, storedDocs[idx].Url)
	}
	return storedUrls
}

// eleInArr reports whether ele occurs in arr, using a linear scan.
// On the first hit it prints "eleInArr" followed by the element.
func eleInArr(ele string, arr []string) bool {
	for idx := 0; idx < len(arr); idx++ {
		if arr[idx] != ele {
			continue
		}
		fmt.Println("eleInArr", ele)
		return true
	}
	return false
}

// getTargetDateUrls crawls the listing pages of every configured site and
// returns the detail-page URLs that belong to TargetDate and are not yet
// stored in MongoDB for that date.
//
// Sites are crawled one after another:
//   - cnhan.com/shantui/ and cnhan.com/hyzx/ only when TargetDate is today,
//   - cnhan.com/pinfo/, heze.cn/info/, heze.cn/qiye/ and sonhoo.com/wukong/
//     on every run.
//
// The returned slice is nil when nothing new was found. The function relies
// on getTargetDateSpideredUrl, which panics or exits on MongoDB failures.
func getTargetDateUrls() []string {
	var targetDateUrls []string
	spidered := getTargetDateSpideredUrl()

	// collect appends link to the result unless it is already stored for
	// TargetDate. Every site goes through this helper so the duplicate check
	// is applied uniformly.
	collect := func(text, link string) {
		if eleInArr(link, spidered) {
			return
		}
		targetDateUrls = append(targetDateUrls, link)
		fmt.Printf("Link found: %q -> %s\n", text, link)
	}
	// visitStart requests a crawl entry page and reports a failure instead of
	// silently dropping it.
	visitStart := func(c *colly.Collector, startURL string) {
		if err := c.Visit(startURL); err != nil {
			log.Println("Something went wrong:", err)
		}
	}
	// logRequest prints "Visiting ..." before each request.
	logRequest := func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	}

	if TargetDate == TodayDate {
		// cnhan.com/shantui/: the page only lists today's URLs, all on the
		// start page itself, selected by class.
		shantui := colly.NewCollector(
			colly.AllowedDomains("www.cnhan.com"),
		)
		shantui.OnHTML(".showSort a[href]", func(e *colly.HTMLElement) {
			collect(e.Text, e.Attr("href"))
		})
		visitStart(shantui, "http://www.cnhan.com/shantui/")

		// cnhan.com/hyzx/: follow the paginated index pages, e.g.
		//   http://www.cnhan.com/hyzx/
		//   http://www.cnhan.com/hyzx/index-all-2.html
		// Hard-coded assumption: at most 99 index pages per target date.
		// NOTE(review): "|" binds looser than the URL prefix, so the second
		// alternative is not anchored to /hyzx/ — confirm before tightening.
		hyzx := colly.NewCollector(
			colly.AllowedDomains("www.cnhan.com"),
			colly.URLFilters(
				regexp.MustCompile("^http://www.cnhan.com/hyzx/(.{0}$)|(index-all-[1-9][0-9]{0,1}[^0-9]{0,1}\\.html$)"),
			),
		)
		// Detail URLs embed the date (e.g. .../hyzx/20180827/7109076.html), so
		// the target date is matched against the href. Compiled once here
		// rather than once per link.
		hyzxDate := regexp.MustCompile(SitePathListTargetDate["cnhan.com/hyzx/"])
		hyzx.OnHTML("a[href]", func(e *colly.HTMLElement) {
			link := e.Attr("href")
			fmt.Printf("Link found: %q -> %s\n", e.Text, link)
			// Most links are rejected by the URL filters or were already
			// visited; those errors are expected and deliberately ignored.
			_ = hyzx.Visit(e.Request.AbsoluteURL(link))
			if hyzxDate.FindString(link) != "" {
				collect(e.Text, "http://www.cnhan.com/hyzx/"+link)
			}
		})
		hyzx.OnRequest(logRequest)
		visitStart(hyzx, "http://www.cnhan.com/hyzx/")
	}

	// cnhan.com/pinfo/: follow the paginated index pages, e.g.
	//   http://www.cnhan.com/pinfo/
	//   http://www.cnhan.com/pinfo/index-5.html
	// Hard-coded assumption: at most 99 index pages per target date.
	// NOTE(review): same alternation-precedence caveat as the hyzx filter.
	pinfo := colly.NewCollector(
		colly.AllowedDomains("www.cnhan.com"),
		colly.URLFilters(
			regexp.MustCompile("^http://www.cnhan.com/pinfo/(.{0}$)|(index-[1-9][0-9]{0,1}[^0-9]{0,1}\\.html$)"),
		),
	)
	pinfoDate := SitePathListTargetDate["cnhan.com/pinfo/"]
	pinfo.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", e.Text, link)
		// Filtered/duplicate visits are expected; the error is ignored on purpose.
		_ = pinfo.Visit(e.Request.AbsoluteURL(link))
		// The date is part of the link's text (".span2" child), e.g.
		// http://www.cnhan.com/pinfo/313257.html  "...2018.08.27".
		if strings.Contains(e.ChildText(".span2"), pinfoDate) {
			link = "http://www.cnhan.com" + link
			fmt.Printf("Link found: %q -> %s\n", e.Text, link)
			collect(e.Text, link)
		}
	})
	pinfo.OnRequest(logRequest)
	visitStart(pinfo, "http://www.cnhan.com/pinfo/")

	// heze.cn/info/: links are taken from the start page only, selected by
	// class. They now pass through the same already-stored check as every
	// other site instead of being appended unconditionally.
	hezeInfo := colly.NewCollector(
		colly.AllowedDomains("www.heze.cn"),
	)
	hezeInfo.OnHTML(".news_list_r a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", e.Text, link)
		collect(e.Text, link)
	})
	visitStart(hezeInfo, "http://www.heze.cn/info/")

	/*
	   Target URLs inside heze.cn:
	   http://www.heze.cn/info/
	   http://www.heze.cn/qiye/
	   Two possible crawl strategies:
	   1. Enter per parent URL, e.g.
	      http://www.heze.cn/qiye/18240670888/show-37-1367148.html
	      http://www.heze.cn/info/LEbigong/show-1-13931879.html
	      (the opposite trade-offs of 2).
	   2. Crawl the whole site.
	      Pro: simple filter rules and simple code.
	      Con: slow, and results are hard to classify (product type, publish time).
	*/

	// heze.cn/qiye/: the start page and its main sub-pages change content on
	// every refresh. Listing pages look like http://www.heze.cn/qiye/list-8.html.
	// NOTE(review): the filter below requires "list-N-N.html", which does not
	// match the single-number example above — confirm against the live site.
	hezeQiye := colly.NewCollector(
		colly.AllowedDomains("www.heze.cn"),
		colly.URLFilters(
			regexp.MustCompile("^http://www.heze.cn/qiye/(.{0}$)|(list-\\d+-\\d+\\.html$)"),
		),
	)
	// Detail pages, e.g. http://www.heze.cn/qiye/hongfei688/show-44-14825619.html.
	// Compiled once here rather than once per link.
	qiyeDetail := regexp.MustCompile("^http://www.heze.cn/qiye/[0-9a-zA-Z]+/show-\\d+-\\d+\\.html$")
	hezeQiye.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", e.Text, link)
		// Filtered/duplicate visits are expected; the error is ignored on purpose.
		_ = hezeQiye.Visit(e.Request.AbsoluteURL(link))
		if qiyeDetail.FindString(link) != "" {
			fmt.Printf("Link found: %q -> %s\n", e.Text, link)
			collect(e.Text, link)
		}
	})
	hezeQiye.OnRequest(logRequest)
	visitStart(hezeQiye, "http://www.heze.cn/qiye/")

	// cn.sonhoo.com/wukong/: target-date articles are mixed with articles of
	// other dates on the listing pages, so a selector alone cannot tell them
	// apart and the date text has to be parsed.
	//   category page: http://cn.sonhoo.com/wukong/c133
	//   article page:  http://cn.sonhoo.com/wukong/a191114
	sonhoo := colly.NewCollector(
		colly.AllowedDomains("cn.sonhoo.com"),
		colly.URLFilters(
			// A request is allowed when it matches any one of these.
			regexp.MustCompile("^http://cn.sonhoo.com/wukong/$"),
			//regexp.MustCompile("^http://cn.sonhoo.com/wukong/[ac]{1}\\d+$"),
			regexp.MustCompile("^http://cn.sonhoo.com/wukong/[c]{1}\\d+$"),
			// Article listing page: http://cn.sonhoo.com/wukong/c0?offset=150&limit=50
			regexp.MustCompile("^http://cn.sonhoo.com/wukong/c\\d+\\?offset=\\d+\\&limit=\\d+$"),
		),
		// Without a User-Agent the site returns no data; one is set per request below.
		// colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
	)
	// Limit parallelism and add a random delay for *sonhoo.* domains.
	if err := sonhoo.Limit(&colly.LimitRule{
		DomainGlob:  "*sonhoo.*",
		Parallelism: 5,
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.Println("Something went wrong:", err)
	}

	// Walk every listing page (e.g. .../wukong/c4?offset=100&limit=50) so that
	// all candidate target-date URLs are seen.
	sonhoo.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", e.Text, link)
		// Filtered/duplicate visits are expected; the error is ignored on purpose.
		_ = sonhoo.Visit(e.Request.AbsoluteURL(link))
	})

	// Collect article URLs whose list entry shows the target date.
	sonhooDate := SitePathListTargetDate["sonhoo.com/wukong/"]
	sonhoo.OnHTML("div[class=page-articles__articles]>ul>li", func(e *colly.HTMLElement) {
		link := e.ChildAttr("a", "href")
		pageDate := e.ChildText("span")
		fmt.Println(pageDate)
		fmt.Println(link)
		fmt.Println("tDate", sonhooDate)
		fmt.Println("pageDate", pageDate)
		if strings.Contains(pageDate, sonhooDate) {
			link = "http://cn.sonhoo.com" + link
			collect(e.Text, link)
		}
		// Filtered/duplicate visits are expected; the error is ignored on purpose.
		_ = sonhoo.Visit(e.Request.AbsoluteURL(link))
	})

	sonhoo.OnScraped(func(r *colly.Response) {
		fmt.Println("Finished", r.Request.URL)
	})
	sonhoo.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
		// A random User-Agent per request.
		r.Headers.Set("User-Agent", RandomString())
	})
	sonhoo.OnError(func(_ *colly.Response, err error) {
		log.Println("Something went wrong:", err)
	})
	sonhoo.OnResponse(func(r *colly.Response) {
		fmt.Println("Visited", r.Request.URL)
	})
	// Start pages:
	//   http://cn.sonhoo.com/wukong/    listing without dates
	//   http://cn.sonhoo.com/wukong/c0  listing with dates (50 pages on 20180904, no categories)
	visitStart(sonhoo, "http://cn.sonhoo.com/wukong/c0")
	// Wait until all collector jobs are finished.
	sonhoo.Wait()

	return targetDateUrls
}

// main collects the not-yet-stored detail URLs for TargetDate, fetches each
// page and stores its URL and full HTML in MongoDB together with the spider
// date, so later runs on the same date can skip URLs that are already stored.
func main() {
	targetDateUrls := getTargetDateUrls()
	fmt.Println(targetDateUrls)
	fmt.Println(len(targetDateUrls))
	if len(targetDateUrls) == 0 {
		// Nothing to fetch, so no database connection is needed.
		return
	}

	// Connect once and reuse the client for every page, and check the error
	// before the client is used.
	ctx := context.Background()
	client, err := mongo.Connect(ctx, "mongodb://hbaseU:123@192.168.3.103:27017/hbase", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer func() {
		if err := client.Disconnect(ctx); err != nil {
			log.Println(err)
		}
	}()
	coll := client.Database("hbase").Collection(mongoCollectioName)

	for _, targetDateUrl := range targetDateUrls {
		c := colly.NewCollector()
		c.OnScraped(func(r *colly.Response) {
			// Store the plain request URL. A trailing newline would make the
			// stored value differ from the crawled links and defeat the
			// already-stored check on later runs.
			reqUrl := r.Request.URL.String()
			wholePageHtml := string(r.Body)
			result, err := coll.InsertOne(
				ctx,
				bson.NewDocument(
					bson.EC.String("spiderDate", TargetDate),
					bson.EC.String("url", reqUrl),
					bson.EC.String("html", wholePageHtml),
				))
			if err != nil {
				log.Println(err)
				return
			}
			fmt.Println(result)
		})
		if err := c.Visit(targetDateUrl); err != nil {
			log.Println("Something went wrong:", err)
		}
	}
}

1、找出url汇总页，过滤出满足条件的详情页url；2、去详情页采集信息的更多相关文章

Java中过滤出字母、数字和中文的正则表达式
1.Java中过滤出字母.数字和中文的正则表达式 (1)过滤出字母的正则表达式 [^(A-Za-z)] (2)过滤出数字的正则表达式 [^(0-9)] (3)过滤出中文的正则表达式 [^(\\u4e0 ...
Java正则表达式过滤出字母、数字和中文
原文:http://blog.csdn.net/k21325/article/details/54090066 1.Java中过滤出字母.数字和中文的正则表达式 (1)过滤出字母的正则表达式 [^(A ...
lintcode：Find the Connected Component in the Undirected Graph 找出无向图汇总的相连要素
题目: 找出无向图汇总的相连要素请找出无向图中相连要素的个数. 图中的每个节点包含其邻居的 1 个标签和 1 个列表.(一个无向图的相连节点(或节点)是一个子图,其中任意两个顶点通过路径相连,且不与 ...
asp .NET弹出窗口汇总（精华，麒麟创想）
asp .NET弹出窗口汇总(精华,麒麟创想) 注://关闭,父窗口弹出对话框,子窗口直接关闭 this.Response.Write("<script language=javas ...
java的List中使用filter过滤出符合特定条件的元素List
在实际开发中,经常需要把一个列表中的元素,按照特定条件过滤出来,放到一个新的列表中.本文给出了几个例子,来描述解决这个问题的方法. 我们假设有一个书的List,需要找出其中id分别是3.6.8.9的书 ...
python 过滤出某后缀名文件
以从某文件夹过滤出py文件为例: 法1: import glob import os os.chdir(“./”) for file in glob.glob(“*.py”): print file ...
通过Linux命令过滤出binlog中完整的SQL语句
DB:5.6.16CentOS:CentOS release 6.3 (Final) 当insert语句通过空格跨行输入的时候,如何提取完整的insert语句! 创建一个空表:mysql> cr ...
写出java8实现对List<User>中的username字段过滤出不等于张三的数据
写出java8实现对List<User>中的username字段过滤出不等于张三的数据... 对...这个是一道面试题.当时没有看过java8的新特性...所以有点懵. 看完之后感觉真. ...
AJPFX总结关于Java中过滤出字母、数字和中文的正则表达式
1.Java中过滤出字母.数字和中文的正则表达式 (1)过滤出字母的正则表达式 [^(A-Za-z)] (2) 过滤出数字的正则表达式 [^(0-9)] (3) 过滤出中文的正则 ...

随机推荐

python第三方库离线安装-使用pip
参考:http://www.cnblogs.com/michael-xiang/p/5690746.html 操作系统:CentOS 6.9 python:2.7.14 (默认的2.6.6需要升级到2 ...
facebook architecture 2 【转】
At the scale that Facebook operates, a lot of traditional approaches to serving web content breaks d ...
Fennec VS. Snuke --AtCoder
题目描述 Fennec and Snuke are playing a board game.On the board, there are N cells numbered 1 through N, ...
jvm 简单描述
java零基础入门-面向对象篇(一) 基础类型和引用类型友情提示:本章开始可能会有部分较深入的内容,不说又不行,说了又很难解释清楚,因为里面的技术细节实在太多太复杂,所以我会屏蔽部分技术细节,只展示 ...
Java开发笔记（一百零三）线程间的通信方式
前面介绍了多线程并发之时的资源抢占情况,以及利用同步.加锁.信号量等机制解决资源冲突问题,不过这些机制只适合同一资源的共享分配,并未涉及到某件事由的前因后果.日常生活中,经常存在两个前后关联的事务,像 ...
WEB API 返回类型设置为JSON 【转】
http://blog.sina.com.cn/s/blog_60ba16ed0102uzc7.html web api写api接口时默认返回的是把你的对象序列化后以XML形式返回,那么怎样才能让其返 ...
Centos7/RedHat7 下 python3使用cx-freeze打包matplotlib程序遇到的问题和解决办法
折腾了一天遇到了几个头疼的问题,还好回去前解决掉了第一个:执行cxfreeze打包好的程序遇到 tkinter 和 _tkinter的缺失问题首先终端:python tkinter python ...
【Salvation】——人物角色动画实现
写在前面:这个角色动画主要使用JavaScript编写脚本,在Unity3D游戏引擎的环境中实现. 一.显示角色并实现镜像效果 1.显示贴图: create→cube→修改名称为player,位置归0 ...
2017.2.20 activiti实战--第一章--认识Activiti
学习资料:<Activiti实战> 第一章认识Activiti 内容概览:讲解activiti的特点.接口概览.架构等基本信息. 1.3 Activiti的特点 1.使用mybatis ...
一个端口划到多个VLAN
不想启路由的情况下,希望将一个端口划到多个VLAN中去,其目的有如下几点: 1.隔离不想让相互访问的端口.(如两个部门) 2.让都需要访问的端口划到所有VLAN.(如共享服务器) 3.不启路由协议.( ...

1、找出url汇总页，过滤出满足条件的详情页url；2、去详情页采集信息

1、找出url汇总页，过滤出满足条件的详情页url；2、去详情页采集信息的更多相关文章

随机推荐

热门专题