using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using GearUp.Crawler.Entities;
using HtmlAgilityPack;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using System.Text.RegularExpressions;
using System.Collections.Concurrent;
using System.Threading; namespace GearUp.Crawler
{
public class Crawler
{
private ILoreBookItemRepository repository;
private ILorebookItemParser parser;
private LinkManager linkManager; private string linkDomain; private static ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>(); private const int DownloadTimeout = 10; public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
{
this.repository = repository;
this.parser = parser;
this.linkManager = linkManager;
} public async void StartCrawl(string targetUrl)
{
var cts = new CancellationTokenSource();
var ct = cts.Token; linkDomain = LinkManager.LinkDomain(targetUrl); var downloaderOptions = new ExecutionDataflowBlockOptions
{
MaxMessagesPerTask = 3,
MaxDegreeOfParallelism = 4,
BoundedCapacity = 10
}; var downloader = new TransformBlock<string, PageAndUrl>(async (url) => await DownloadUrl(url), downloaderOptions); var pipelineOptions = new ExecutionDataflowBlockOptions
{
MaxMessagesPerTask = 2,
CancellationToken = ct
}; var linkParser = new TransformManyBlock<PageAndUrl, string>(page => ExtactLinksFromPage(page), pipelineOptions); var writer = new ActionBlock<PageAndUrl>(async page => await SaveEntry(page), new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 }); var contentBroadcaster = new BroadcastBlock<PageAndUrl>(p => p, new ExecutionDataflowBlockOptions() { CancellationToken = ct }); // Flow setup
downloader.LinkTo(contentBroadcaster);
contentBroadcaster.LinkTo(linkParser);
contentBroadcaster.LinkTo(writer);
linkParser.LinkTo(downloader); //Kick off the TPL dataflow here
downloader.Post(targetUrl);
WriteToConsole("Crawling...", ConsoleColor.Green);
PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);
cts.Cancel();
WriteToConsole("Stopping...", ConsoleColor.Green);
await Task.WhenAll(downloader.Completion, contentBroadcaster.Completion, linkParser.Completion, writer.Completion); } public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
{
if (page == null) return Enumerable.Empty<string>(); var discoveredLinks = new List<string>();
var document = new LorebookDocument(page.Html);
foreach (var link in document.LinksInArticleBodyDiv())
{
var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
if (linkDomain.Equals(LinkManager.LinkDomain(fullUrl)))
discoveredLinks.Add(fullUrl);
}
WriteToConsole(" {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
return discoveredLinks;
} public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
{
WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
var itemDetails = document.OfficialLorebookEntry();
var item = parser.ParseHtmlNode(itemDetails, url);
return item;
} public async Task<PageAndUrl> DownloadUrl(string url)
{
try
{
if (urls.ContainsKey(url)) return null;
urls.TryAdd(url, true); var client = new WebClient();
WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);
var download = client.DownloadStringTaskAsync(url);
var cancel = Task.Delay(DownloadTimeout * 1000);
var any = await Task.WhenAny(download, cancel);
if (any == cancel)
{
client.CancelAsync();
WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
return null;
}
string result = download.Result; WriteToConsole("Downloaded: {0}", ConsoleColor.White, url); return new PageAndUrl() { Url = url, Html = result };
} catch (WebException ex)
{
WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
}
catch (AggregateException ex)
{
foreach (var exc in ex.Flatten().InnerExceptions)
{
WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, exc.Message);
}
}
catch (Exception ex)
{
WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
} return null;
} public async Task SaveEntry(PageAndUrl page)
{
if (page == null) return;
var document = new LorebookDocument(page.Html);
var item = ExtractLoreBookItem(document, page.Url);
if (item != null) await repository.Save(page.Url, item);
} private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
{
Console.ForegroundColor = color;
Console.WriteLine(format, texts);
Console.ResetColor();
} private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
{
WriteToConsole(message, color);
if (key == null)
Console.ReadLine();
else
{
ConsoleKeyInfo entry;
do
{
entry = Console.ReadKey(true);
} while (key != entry.Key);
}
} }
}

tdf sample的更多相关文章

  1. Linux下UPnP sample分析

        一.UPnP简介   UPnP(Universal Plug and Play)技术是一种屏蔽各种数字设备的硬件和操作系统的通信协议.它是一种数字网络中间件技术,建立在TCP/IP.HTTP协 ...

  2. cocos2d-x for android配置 & 运行 Sample on Linux OS

    1.从http://www.cocos2d-x.org/download下载稳定版 比如cocos2d-x-2.2 2.解压cocos2d-x-2.2.zip,比如本文将其解压到 /opt 目录下 3 ...

  3. android studio2.2 的Find Sample Code点击没有反应

    1 . 出现的问题描述:           右键点击Find Sample Code后半天没有反应,然后提示 Samples are currently unavailable for :{**** ...

  4. jmeter(四)Sample之http请求

    启动jmeter,建立一个测试计划 这里再次说说怎么安装和启动jmeter吧,昨天下午又被人问到怎样安装和使用,我也是醉了:在我看来,百度能解决百分之八十的问题,特别是基础的问题... 安装:去官网下 ...

  5. jcaptcha sample 制作验证码

    Skip to end of metadata Created by marc antoine garrigue, last modified by Jeremy Waters on Feb 23, ...

  6. Python 对不均衡数据进行Over sample(重抽样)

    需要重采样的数据文件(Libsvm format),如heart_scale +1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.4 ...

  7. Basic linux command-with detailed sample

    Here I will list some parameters which people use very ofen, I will attach the output of the command ...

  8. 例子:RSS Reader Sample

    本例演示了Rss xml信息的获取,以及如何使用SyndicationFeed来进行符合Rss规范的xml进行解析. SyndicationFeed 解析完成后 可以得到SyndicationItem ...

  9. 例子:Background Audio Streamer Sample

    The Background Audio Streamer sample demonstrates how to create an app that uses a MediaStreamSource ...

随机推荐

  1. window7下karma 报 The header content contains invalid characters BUG

    打开你的依赖node_modules\karma\node_modules\connect\lib\patch.js 将里面的setHeader方法改成下面这样,干掉序列化日期时出现的中文 res.s ...

  2. C#.net 设置Access-Control-Allow-Origin来实现跨域

    <system.webServer> <httpProtocol> <customHeaders> <add name="Access-Contro ...

  3. linux 文件截取

    相关函数:open, ftruncate 表头文件:#include <unistd.h> 定义函数:int truncate(const char *path, off_t length ...

  4. Element UI 中国省市区级联数据

    https://www.npmjs.com/package/element-china-area-data

  5. 处理定时事件(一)---模拟Redis实现(C++)

    https://blog.csdn.net/xiyoulinux_kangyijie/article/details/78278992

  6. (转)Saltstack系列

    Saltstack系列1:安装配置 Saltstack系列2:Saltstack远程执行命令 Saltstack系列3:Saltstack常用模块及API Saltstack系列4:Saltstack ...

  7. Docker的镜像迁移

    [root@localhost ~]# mkdir /opt/soft/ [root@localhost ~]# docker save c3987965c15d > /opt/soft/pos ...

  8. setContentView和inflate区别

    一般用LayoutInflater做一件事:inflate inflate这个方法总共有四种形式(见下面),目的都是把xml表述的layout转化为View对象.其中有一个比较常用,View infl ...

  9. STM32F407使用MFRC522射频卡调试及程序移植成功

    版权声明:转载请注明出处,谢谢 https://blog.csdn.net/Kevin_8_Lee/article/details/88865556 或  https://www.cnblogs.co ...

  10. repoquery详解——linux查看包依赖关系的神器

    repoquery是yum扩展工具包yum-utils中的一个工具,所有如果你没有repoquery命令的话,可以先 sudo yum install yum-utils 安装yum-utils包.是 ...