using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks; namespace AnfleCrawler.DataAnalyzer
{
internal class Soufun_News : AnalyzerBase
{
private enum Kind
{
[Description("市场")]
Market = ,
[Description("政策")]
Policy = ,
[Description("公司")]
Company = ,
} private static readonly string[] FilterTags = new string[] { "script", "iframe" }; public override void Init(PageCrawler crawler)
{
string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html", string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>()));
crawler.PushUrl(new StringPatternGenerator(exp), );
base.Init(crawler);
} protected override void AnalyzeInternal(PageLandEntity current)
{
var lander = Crawler.Lander;
dynamic repository = Repository;
var pHandler = CreateContentHandler(current);
switch (current.Depth)
{
case :
{
var dom = lander.GetDocument(pHandler);
foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
{
var linkNode = QueryNode(node, "a.link_01");
string url = GetHref(linkNode, current.Url).OriginalString;
int i = url.LastIndexOf(".");
Crawler.PushUrl(new Uri(url.Insert(i, "_all")), );
}
}
break;
case :
{
var dom = lander.GetDocument(pHandler);
var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
string kind = QueryNodes(hackNode, "a").Last().InnerText;
string title = QueryNode(dom.DocumentNode, "h1").InnerText;
var contentNode = QueryNode(dom.DocumentNode, "#news_body");
foreach (string tag in FilterTags)
{
foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
{
node.Remove();
}
}
var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take().ToArray();
string source = null;
DateTime publishDate;
DateTime.TryParse(set[].InnerText, out publishDate);
if (set.Length == )
{
source = set[].InnerText;
}
repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
Crawler.OutWrite("保存新闻 {0}", title);
}
break;
}
}
}
}
        public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
{
Guid rowID = CryptoManaged.MD5Hash(pageUrl.OriginalString);
using (var db = Create())
{
var q = from t in db.News
where t.RowID == rowID
select t;
var news = q.SingleOrDefault();
if (news == null)
{
db.News.Add(news = new News()
{
RowID = rowID,
SiteID = pageUrl.Authority,
});
}
news.Kind = kind;
news.Source = source;
news.Title = title;
news.Content = content;
news.PublishDate = publishDate;
db._SaveChanges();
}
}

Soufun_News的更多相关文章

随机推荐

  1. Intel+Ardruino 101 翻转时点灯

    /*  ===============================================  Example sketch for CurieIMU library for Intel(R ...

  2. TCP/IP 协议介绍

    转自http://blog.jobbole.com/104886/ 一.TCP/IP 协议介绍 在介绍 HTTP 协议之前,先简单说一下TCP/IP协议的相关内容.TCP/IP协议是分层的,从底层至应 ...

  3. String s ; 和 String s = null ; 和 String s = "" ; 的却别

    String s ;该语句表示只是声明了一个引用变量,但是并没有初始化引用,所以对变量s的任何操作(除了初始化赋值外) 都将引发异常. String s=null; 表示未申请任何内存资源,即此语句表 ...

  4. [转]为何需要调用“super viewDidLoad

    转载地址:http://liwpk.blog.163.com/blog/static/36326170201165104413314/   - (void)didReceiveMemoryWarnin ...

  5. WPFのInkCanvas作为蒙版透明笔迹不透明

    本人最近利用inkcavas做一个蒙版的功能,结果发现笔迹稀释了,经过一番查找发现:应该讲inkcavas的背景设置为白色,然后透明,而不是将整个控件透明,具体代码: <InkCanvas Na ...

  6. python 注意事项

    常见错误 #4:  不理解Python的作用域 Python是基于 LEGB 来进行作用于解析的, LEGB 是 Local, Enclosing, Global, Built-in 的缩写.看起来“ ...

  7. jQuery 添加元素

    jQuery 添加元素 1.append 在被选元素的结尾插入内容 $(document).ready(function(){ $("button").click(function ...

  8. iOS OAuth2.0认证和SSO授权

    OAuth2.0和SSO授权   一.OAuth2.0授权协议 一种安全的登陆协议,用户提交的账户密码不提交到本APP,而是提交到授权服务器,待服务器确认后,返回本APP一个访问令牌,本APP即可用该 ...

  9. Postman Postman测试接口之POST提交本地文件数据

    举例: 文件同步接口 接口地址:http://183.xxx.xxx.xxx:23333/ditui/fileupload HTTP请求方式:POST 针对上述这种POST本地文件的接口,接口数据咋提 ...

  10. NSUrl的打印

    1转自 http://my.oschina.net/wangdk/blog/165554: NSURL *url = [NSURL URLWithString:@"http://www.ba ...