using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using GearUp.Crawler.Entities;
using HtmlAgilityPack;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using System.Text.RegularExpressions;
using System.Collections.Concurrent;
using System.Threading; namespace GearUp.Crawler
{
/// <summary>
/// Crawls pages starting from a single URL using a TPL Dataflow pipeline:
/// a downloader feeds a broadcaster, which hands every downloaded page both to
/// a link parser (whose output is fed back into the downloader) and to a
/// writer that parses the page and saves the resulting item to the repository.
/// </summary>
public class Crawler
{
    // URLs that have already been claimed for download. Static, so the set is
    // shared by every Crawler instance in the process.
    private static readonly ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>();

    // Serializes console output: the colour change, the write and the colour
    // reset must not interleave between the parallel pipeline workers.
    private static readonly object consoleLock = new object();

    // Maximum time, in seconds, allowed for a single page download.
    private const int DownloadTimeout = 10;

    private readonly ILoreBookItemRepository repository;
    private readonly ILorebookItemParser parser;
    private readonly LinkManager linkManager;

    // Domain of the crawl's start URL; only links on this domain are followed.
    private string linkDomain;

    /// <summary>
    /// Creates a crawler that parses pages with <paramref name="parser"/> and
    /// stores the results through <paramref name="repository"/>.
    /// </summary>
    /// <param name="repository">Destination for parsed items.</param>
    /// <param name="parser">Converts the lorebook HTML node into an item.</param>
    /// <param name="linkManager">Used to fully qualify links found on a page.</param>
    public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
    {
        this.repository = repository;
        this.parser = parser;
        this.linkManager = linkManager;
    }

    /// <summary>
    /// Fire-and-forget entry point kept for existing callers. Delegates to
    /// <see cref="StartCrawlAsync"/> and reports any failure on the console,
    /// because an exception escaping an <c>async void</c> method cannot be
    /// observed by the caller and would otherwise terminate the process.
    /// </summary>
    /// <param name="targetUrl">URL at which the crawl starts.</param>
    public async void StartCrawl(string targetUrl)
    {
        try
        {
            await StartCrawlAsync(targetUrl);
        }
        catch (Exception ex)
        {
            WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
        }
    }

    /// <summary>
    /// Builds the dataflow pipeline, seeds it with <paramref name="targetUrl"/>,
    /// blocks until the user presses Escape, then shuts the pipeline down and
    /// waits for all blocks to finish.
    /// </summary>
    /// <param name="targetUrl">URL at which the crawl starts; its domain limits which links are followed.</param>
    /// <returns>A task that completes once every pipeline block has stopped.</returns>
    public async Task StartCrawlAsync(string targetUrl)
    {
        using (var cts = new CancellationTokenSource())
        {
            var ct = cts.Token;
            linkDomain = LinkManager.LinkDomain(targetUrl);

            // The downloader takes the cancellation token as well: it sits in a
            // cycle (linkParser -> downloader), so completion cannot simply be
            // propagated to it and it would otherwise never finish.
            var downloaderOptions = new ExecutionDataflowBlockOptions
            {
                MaxMessagesPerTask = 3,
                MaxDegreeOfParallelism = 4,
                BoundedCapacity = 10,
                CancellationToken = ct
            };
            var downloader = new TransformBlock<string, PageAndUrl>(async (url) => await DownloadUrl(url), downloaderOptions);

            var pipelineOptions = new ExecutionDataflowBlockOptions
            {
                MaxMessagesPerTask = 2,
                CancellationToken = ct
            };
            var linkParser = new TransformManyBlock<PageAndUrl, string>(page => ExtactLinksFromPage(page), pipelineOptions);

            var writer = new ActionBlock<PageAndUrl>(async page => await SaveEntry(page), new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

            // The same page instance (p => p, no cloning) is offered to both targets.
            var contentBroadcaster = new BroadcastBlock<PageAndUrl>(p => p, new ExecutionDataflowBlockOptions() { CancellationToken = ct });

            // Flow setup
            downloader.LinkTo(contentBroadcaster);
            contentBroadcaster.LinkTo(linkParser);
            contentBroadcaster.LinkTo(writer);
            linkParser.LinkTo(downloader);

            // Kick off the TPL dataflow here
            downloader.Post(targetUrl);
            WriteToConsole("Crawling...", ConsoleColor.Green);
            PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);

            // Stop producing new pages, then let the writer drain the pages it
            // has already accepted instead of discarding them.
            cts.Cancel();
            writer.Complete();
            WriteToConsole("Stopping...", ConsoleColor.Green);

            try
            {
                await Task.WhenAll(downloader.Completion, contentBroadcaster.Completion, linkParser.Completion, writer.Completion);
            }
            catch (OperationCanceledException)
            {
                // Expected: the canceled blocks end in the Canceled state.
                // A fault in any block is a different exception type and still propagates.
            }
        }
    }

    /// <summary>
    /// Collects the links of a downloaded page that belong to the crawl's domain.
    /// </summary>
    /// <param name="page">Downloaded page, or <c>null</c> when the download was skipped or failed.</param>
    /// <returns>
    /// The fully qualified links whose domain equals the start URL's domain;
    /// an empty sequence when <paramref name="page"/> is <c>null</c>.
    /// </returns>
    public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
    {
        if (page == null) return Enumerable.Empty<string>();

        var discoveredLinks = new List<string>();
        var document = new LorebookDocument(page.Html);
        foreach (var link in document.LinksInArticleBodyDiv())
        {
            var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
            if (linkDomain.Equals(LinkManager.LinkDomain(fullUrl)))
                discoveredLinks.Add(fullUrl);
        }
        WriteToConsole(" {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
        return discoveredLinks;
    }

    /// <summary>
    /// Parses the lorebook entry of <paramref name="document"/> into an item.
    /// </summary>
    /// <param name="document">Document wrapping the page HTML.</param>
    /// <param name="url">URL the document was downloaded from; passed on to the parser.</param>
    /// <returns>Whatever the configured parser returns for the entry node.</returns>
    public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
    {
        WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
        var itemDetails = document.OfficialLorebookEntry();
        var item = parser.ParseHtmlNode(itemDetails, url);
        return item;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> once, giving up after
    /// <see cref="DownloadTimeout"/> seconds.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>
    /// The page content, or <c>null</c> when the URL was already claimed, the
    /// download timed out, or an error occurred (errors are written to the console).
    /// </returns>
    public async Task<PageAndUrl> DownloadUrl(string url)
    {
        try
        {
            // TryAdd is a single atomic check-and-claim, so parallel workers
            // can never both decide to download the same URL.
            if (!urls.TryAdd(url, true)) return null;

            using (var client = new WebClient())
            using (var timeoutCts = new CancellationTokenSource())
            {
                WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);
                var download = client.DownloadStringTaskAsync(url);
                var timeout = Task.Delay(TimeSpan.FromSeconds(DownloadTimeout), timeoutCts.Token);

                var finishedFirst = await Task.WhenAny(download, timeout);
                if (finishedFirst == timeout)
                {
                    client.CancelAsync();
                    WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
                    return null;
                }

                // The download won the race; release the pending timer right away.
                timeoutCts.Cancel();

                // The task is already complete, so this does not wait. Unlike
                // .Result it rethrows the original exception (e.g. WebException)
                // rather than an AggregateException wrapper.
                string result = await download;
                WriteToConsole("Downloaded: {0}", ConsoleColor.White, url);
                return new PageAndUrl() { Url = url, Html = result };
            }
        }
        catch (WebException ex)
        {
            WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
        }
        catch (AggregateException ex)
        {
            // Defensive: report every inner failure individually.
            foreach (var exc in ex.Flatten().InnerExceptions)
            {
                WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, exc.Message);
            }
        }
        catch (Exception ex)
        {
            WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
        }

        return null;
    }

    /// <summary>
    /// Parses a downloaded page and saves the resulting item, if any.
    /// </summary>
    /// <param name="page">Downloaded page; <c>null</c> is ignored.</param>
    /// <returns>A task that completes when the item has been saved or skipped.</returns>
    public async Task SaveEntry(PageAndUrl page)
    {
        if (page == null) return;

        var document = new LorebookDocument(page.Html);
        var item = ExtractLoreBookItem(document, page.Url);
        if (item != null) await repository.Save(page.Url, item);
    }

    /// <summary>
    /// Writes a formatted line to the console in the given colour and then
    /// restores the default colours.
    /// </summary>
    /// <param name="format">Composite format string.</param>
    /// <param name="color">Foreground colour for this line.</param>
    /// <param name="texts">Arguments for <paramref name="format"/>.</param>
    private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
    {
        // Called concurrently by the pipeline workers; without the lock one
        // worker's colour change could apply to another worker's line.
        lock (consoleLock)
        {
            try
            {
                Console.ForegroundColor = color;
                Console.WriteLine(format, texts);
            }
            finally
            {
                Console.ResetColor();
            }
        }
    }

    /// <summary>
    /// Shows <paramref name="message"/> and blocks until the user responds.
    /// </summary>
    /// <param name="message">Prompt text.</param>
    /// <param name="color">Colour of the prompt text.</param>
    /// <param name="key">
    /// Key to wait for; when <c>null</c>, waits for a full line of input instead.
    /// </param>
    private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
    {
        WriteToConsole(message, color);
        if (key == null)
        {
            Console.ReadLine();
        }
        else
        {
            ConsoleKeyInfo entry;
            do
            {
                // 'true' keeps the pressed key from being echoed.
                entry = Console.ReadKey(true);
            } while (key != entry.Key);
        }
    }
}
}

tdf sample的更多相关文章

  1. Linux下UPnP sample分析

        一.UPnP简介   UPnP(Universal Plug and Play)技术是一种屏蔽各种数字设备的硬件和操作系统的通信协议.它是一种数字网络中间件技术,建立在TCP/IP.HTTP协 ...

  2. cocos2d-x for android配置 & 运行 Sample on Linux OS

    1.从http://www.cocos2d-x.org/download下载稳定版 比如cocos2d-x-2.2 2.解压cocos2d-x-2.2.zip,比如本文将其解压到 /opt 目录下 3 ...

  3. android studio2.2 的Find Sample Code点击没有反应

    1 . 出现的问题描述:           右键点击Find Sample Code后半天没有反应,然后提示 Samples are currently unavailable for :{**** ...

  4. jmeter(四)Sample之http请求

    启动jmeter,建立一个测试计划 这里再次说说怎么安装和启动jmeter吧,昨天下午又被人问到怎样安装和使用,我也是醉了:在我看来,百度能解决百分之八十的问题,特别是基础的问题... 安装:去官网下 ...

  5. jcaptcha sample 制作验证码

    Skip to end of metadata Created by marc antoine garrigue, last modified by Jeremy Waters on Feb 23, ...

  6. Python 对不均衡数据进行Over sample(重抽样)

    需要重采样的数据文件(Libsvm format),如heart_scale +1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.4 ...

  7. Basic linux command-with detailed sample

    Here I will list some parameters which people use very often; I will attach the output of the command ...

  8. 例子:RSS Reader Sample

    本例演示了Rss xml信息的获取,以及如何使用SyndicationFeed来进行符合Rss规范的xml进行解析. SyndicationFeed 解析完成后 可以得到SyndicationItem ...

  9. 例子:Background Audio Streamer Sample

    The Background Audio Streamer sample demonstrates how to create an app that uses a MediaStreamSource ...

随机推荐

  1. ui-grid使用详解

    HTML <pre name="code" class="html"><!--ui-grid css--> <link rel=& ...

  2. 最少拦截系统(线性dp)

    某国为了防御敌国的导弹袭击,发展出一种导弹拦截系统.但是这种导弹拦截系统有一个缺陷:虽然它的第一发炮弹能够到达任意的高度,但是以后每一发炮弹都不能超过前一发的高度.某天,雷达捕捉到敌国的导弹来袭.由于 ...

  3. Chapter 6

    6.1 顶点与输入布局 Direct3D 的顶点可以包含除空间坐标外的其他数据.如: struct Vertex1 { XMFLOAT3 Pos; XMFLOAT4 Color; }; struct ...

  4. PIE SDK微分锐化

    1.算法功能简介 微分锐化通过微分使图像的边缘或轮廓突出.清晰.导数算子具有突出灰度变化的作用,对图像运用导数算子,灰度变化较大的点处算得的值较高,因此我们将图像的导数算子运算值作为相应的边界强度,所 ...

  5. Qt客户端阿里云服上传文件

    整体原理: 阿里云提供了c程序上传文件到阿里云服务器的sdk工具包,将这个工具包继承在自己的客户端,调用接口即可实现上传文件. 前期准备: 1.阿里云c程序客户端的sdk,下载地址:https://h ...

  6. The user specified as a definer ('root'@'%') does not exist解决方案

    今天操作以root身份操作MySQL数据库的时候报出了这个异常: Error updating database. Cause: java.sql.SQLException: The user spe ...

  7. (转)shell命令:echo命令详解

    shell命令:echo命令详解 原文:https://www.cnblogs.com/xyz0601/archive/2015/04/23/4450736.html 功能说明:显示文字. 语 法:e ...

  8. hadoop-2.6.0.tar.gz + spark-1.6.1-bin-hadoop2.6.tgz的集群搭建(单节点)(CentOS系统)

    福利 => 每天都推送 欢迎大家,关注微信扫码并加入我的4个微信公众号:   大数据躺过的坑      Java从入门到架构师      人工智能躺过的坑         Java全栈大联盟   ...

  9. elasticsearch 2.4.0执行update的时候发现的一个问题

    请关注inline参数的变化 正确: POST /test/type1/1/_update{ "script" : { "inline": "ctx. ...

  10. nginx配置多域名

    http{ # 第一个虚拟主机 server { listen 80; server_name aaa.domain.com; #access_log logs/host.access.log mai ...