tdf sample
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using GearUp.Crawler.Entities;
using HtmlAgilityPack;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using System.Text.RegularExpressions;
using System.Collections.Concurrent;
using System.Threading; namespace GearUp.Crawler
{
public class Crawler
{
private ILoreBookItemRepository repository;
private ILorebookItemParser parser;
private LinkManager linkManager; private string linkDomain; private static ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>(); private const int DownloadTimeout = 10; public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
{
this.repository = repository;
this.parser = parser;
this.linkManager = linkManager;
} public async void StartCrawl(string targetUrl)
{
var cts = new CancellationTokenSource();
var ct = cts.Token; linkDomain = LinkManager.LinkDomain(targetUrl); var downloaderOptions = new ExecutionDataflowBlockOptions
{
MaxMessagesPerTask = 3,
MaxDegreeOfParallelism = 4,
BoundedCapacity = 10
}; var downloader = new TransformBlock<string, PageAndUrl>(async (url) => await DownloadUrl(url), downloaderOptions); var pipelineOptions = new ExecutionDataflowBlockOptions
{
MaxMessagesPerTask = 2,
CancellationToken = ct
}; var linkParser = new TransformManyBlock<PageAndUrl, string>(page => ExtactLinksFromPage(page), pipelineOptions); var writer = new ActionBlock<PageAndUrl>(async page => await SaveEntry(page), new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 }); var contentBroadcaster = new BroadcastBlock<PageAndUrl>(p => p, new ExecutionDataflowBlockOptions() { CancellationToken = ct }); // Flow setup
downloader.LinkTo(contentBroadcaster);
contentBroadcaster.LinkTo(linkParser);
contentBroadcaster.LinkTo(writer);
linkParser.LinkTo(downloader); //Kick off the TPL dataflow here
downloader.Post(targetUrl);
WriteToConsole("Crawling...", ConsoleColor.Green);
PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);
cts.Cancel();
WriteToConsole("Stopping...", ConsoleColor.Green);
await Task.WhenAll(downloader.Completion, contentBroadcaster.Completion, linkParser.Completion, writer.Completion); } public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
{
if (page == null) return Enumerable.Empty<string>(); var discoveredLinks = new List<string>();
var document = new LorebookDocument(page.Html);
foreach (var link in document.LinksInArticleBodyDiv())
{
var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
if (linkDomain.Equals(LinkManager.LinkDomain(fullUrl)))
discoveredLinks.Add(fullUrl);
}
WriteToConsole(" {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
return discoveredLinks;
} public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
{
WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
var itemDetails = document.OfficialLorebookEntry();
var item = parser.ParseHtmlNode(itemDetails, url);
return item;
} public async Task<PageAndUrl> DownloadUrl(string url)
{
try
{
if (urls.ContainsKey(url)) return null;
urls.TryAdd(url, true); var client = new WebClient();
WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);
var download = client.DownloadStringTaskAsync(url);
var cancel = Task.Delay(DownloadTimeout * 1000);
var any = await Task.WhenAny(download, cancel);
if (any == cancel)
{
client.CancelAsync();
WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
return null;
}
string result = download.Result; WriteToConsole("Downloaded: {0}", ConsoleColor.White, url); return new PageAndUrl() { Url = url, Html = result };
} catch (WebException ex)
{
WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
}
catch (AggregateException ex)
{
foreach (var exc in ex.Flatten().InnerExceptions)
{
WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, exc.Message);
}
}
catch (Exception ex)
{
WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
} return null;
} public async Task SaveEntry(PageAndUrl page)
{
if (page == null) return;
var document = new LorebookDocument(page.Html);
var item = ExtractLoreBookItem(document, page.Url);
if (item != null) await repository.Save(page.Url, item);
} private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
{
Console.ForegroundColor = color;
Console.WriteLine(format, texts);
Console.ResetColor();
} private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
{
WriteToConsole(message, color);
if (key == null)
Console.ReadLine();
else
{
ConsoleKeyInfo entry;
do
{
entry = Console.ReadKey(true);
} while (key != entry.Key);
}
} }
}
tdf sample的更多相关文章
- Linux下UPnP sample分析
一.UPnP简介 UPnP(Universal Plug and Play)技术是一种屏蔽各种数字设备的硬件和操作系统的通信协议.它是一种数字网络中间件技术,建立在TCP/IP.HTTP协 ...
- cocos2d-x for android配置 & 运行 Sample on Linux OS
1.从http://www.cocos2d-x.org/download下载稳定版 比如cocos2d-x-2.2 2.解压cocos2d-x-2.2.zip,比如本文将其解压到 /opt 目录下 3 ...
- android studio2.2 的Find Sample Code点击没有反应
1 . 出现的问题描述: 右键点击Find Sample Code后半天没有反应,然后提示 Samples are currently unavailable for :{**** ...
- jmeter(四)Sample之http请求
启动jmeter,建立一个测试计划 这里再次说说怎么安装和启动jmeter吧,昨天下午又被人问到怎样安装和使用,我也是醉了:在我看来,百度能解决百分之八十的问题,特别是基础的问题... 安装:去官网下 ...
- jcaptcha sample 制作验证码
Skip to end of metadata Created by marc antoine garrigue, last modified by Jeremy Waters on Feb 23, ...
- Python 对不均衡数据进行Over sample(重抽样)
需要重采样的数据文件(Libsvm format),如heart_scale +1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.4 ...
- Basic linux command-with detailed sample
Here I will list some parameters which people use very ofen, I will attach the output of the command ...
- 例子:RSS Reader Sample
本例演示了Rss xml信息的获取,以及如何使用SyndicationFeed来进行符合Rss规范的xml进行解析. SyndicationFeed 解析完成后 可以得到SyndicationItem ...
- 例子:Background Audio Streamer Sample
The Background Audio Streamer sample demonstrates how to create an app that uses a MediaStreamSource ...
随机推荐
- HDU 6189 Law of Commutation(规律)
题意: 给定n,a,求区间 [ 1 , 1<<n ] 的数b 满足 的个数 分析:打出暴力程序可以发现当a为奇数的时候结果为一: 当a为偶时 , a^b=2^(k+b)mod 2^n ; ...
- BZOJ - 2457 思維+貪心
//為什麼我的Chrome OS更新後變成強制繁體了?? 題目要求使用最少的雙端隊列來維護一個單調非降序列 先來看下規律 首先,val肯定是單調非降的,在相等val範圍內的id可以xjb亂放不影響 其 ...
- java的Spring学习2- junit和mock
<!-- 引用Mock --> <dependency> <groupId>org.mockito</groupId> <artifactId&g ...
- javascript的事件触发和接收源码
define(function(require,exports,module){ var Events=function(){ var array = []; var push = array.pus ...
- python爬虫之趟雷
python爬虫之趟雷整理 雷一:URLError 问题具体描述:urllib.error.URLError: <urlopen error [Errno 11004] getaddrinfo ...
- PIE SDK常用滤波
1. 算法功能简介 空间域滤波实在图像空间( x. y)对输入图像应用滤波函数(核.模板)来改进输出图像的处理方法,主要包括平滑和锐化处理,强调像素与其周围相邻像素的关系,常用的方法是卷积运算. 空间 ...
- mysql 配置文件详解
mysql配置文件参数详解 (一) [client]port = 3306socket = /tmp/mysql.sock [mysqld]port = 3306socket = /tmp/mysql ...
- GreenPlum 大数据平台--备份-邮件配置-gpcrondump & gpdbrestore(五)
01,备份 生成备份数据库 [gpadmin@greenplum01 ~]$ gpcrondump -l /gpbackup/back2/gpcorndump.log -x postgres -v [ ...
- 【3dsMax安装失败,如何卸载、安装3dMax 2011?】
AUTODESK系列软件着实令人头疼,安装失败之后不能完全卸载!!!(比如maya,cad,3dsmax等).有时手动删除注册表重装之后还是会出现各种问题,每个版本的C++Runtime和.NET f ...
- (转)Linux磁盘空间监控告警 && Linux磁盘管理
Linux磁盘空间监控告警 http://blog.csdn.net/github_39069288/article/details/73478784-----------Linux磁盘管理 原文:h ...