Soufun_News
using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks; namespace AnfleCrawler.DataAnalyzer
{
/// <summary>
/// Analyzer for news pages under http://news.sh.soufun.com: it walks the paged listing
/// URLs, follows each article link, and hands the extracted article to the repository.
/// </summary>
// NOTE(review): this copy of the file has lost its numeric literals (enum values, switch
// case labels, the second argument of PushUrl, Take(), array indices, the Length comparison).
// They must be restored from the original project before this class can compile.
internal class Soufun_News : AnalyzerBase
{
/// <summary>
/// News categories; the integer values of the members are substituted into the listing URL in Init.
/// </summary>
private enum Kind
{
// "市场" = market. NOTE(review): enum value missing in this copy.
[Description("市场")]
Market = ,
// "政策" = policy. NOTE(review): enum value missing in this copy.
[Description("政策")]
Policy = ,
// "公司" = company. NOTE(review): enum value missing in this copy.
[Description("公司")]
Company = ,
// The next line closes the Kind enum, declares FilterTags (the tag names whose nodes are
// removed from the article body in AnalyzeInternal), and opens Init.
// Init builds the listing URL pattern from the comma-joined integer values of Kind, pushes it
// to the crawler through a StringPatternGenerator, then calls the base implementation.
// Presumably the "[a,b,c]" and "[1-50]" parts are expanded by StringPatternGenerator into
// one URL per category and page number — TODO confirm against that class.
} private static readonly string[] FilterTags = new string[] { "script", "iframe" }; public override void Init(PageCrawler crawler)
{
string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html", string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>()));
// NOTE(review): the second argument of PushUrl is missing in this copy (presumably a depth — confirm).
crawler.PushUrl(new StringPatternGenerator(exp), );
base.Init(crawler);
// The next line closes Init and opens AnalyzeInternal, which dispatches on current.Depth:
// one branch handles a listing page (queues article URLs), the other handles an article page
// (extracts the fields and saves them through the repository).
} protected override void AnalyzeInternal(PageLandEntity current)
{
var lander = Crawler.Lander;
// dynamic: SaveNews below is resolved at run time, not checked by the compiler.
dynamic repository = Repository;
var pHandler = CreateContentHandler(current);
switch (current.Depth)
{
// Listing page branch. NOTE(review): case label value missing in this copy.
case :
{
var dom = lander.GetDocument(pHandler);
foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
{
var linkNode = QueryNode(node, "a.link_01");
string url = GetHref(linkNode, current.Url).OriginalString;
// Insert "_all" in front of the last '.' of the link URL and queue the result.
// NOTE(review): if the URL contains no '.', LastIndexOf returns -1 and Insert throws.
int i = url.LastIndexOf(".");
// NOTE(review): the second argument of PushUrl is missing in this copy.
Crawler.PushUrl(new Uri(url.Insert(i, "_all")), );
}
}
break;
// Article page branch. NOTE(review): case label value missing in this copy.
case :
{
var dom = lander.GetDocument(pHandler);
// The category text is read from the last <a> inside #newxq_B01_26.
var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
string kind = QueryNodes(hackNode, "a").Last().InnerText;
string title = QueryNode(dom.DocumentNode, "h1").InnerText;
var contentNode = QueryNode(dom.DocumentNode, "#news_body");
// Remove script/iframe nodes from the article body before its HTML is stored.
foreach (string tag in FilterTags)
{
// ToArray() snapshots the matches so that removing nodes does not disturb the enumeration.
foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
{
node.Remove();
}
}
// Spans under #newxq_B01_27 supply the publish date and, when present, the source.
// NOTE(review): the Take() count, both array indices and the Length comparison value
// are missing in this copy.
var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take().ToArray();
string source = null;
DateTime publishDate;
// The TryParse result is ignored; when parsing fails publishDate is DateTime.MinValue.
DateTime.TryParse(set[].InnerText, out publishDate);
if (set.Length == )
{
source = set[].InnerText;
}
repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
// Log message text "保存新闻 {0}" means "Saved news {0}".
Crawler.OutWrite("保存新闻 {0}", title);
}
break;
}
}
}
}
/// <summary>
/// Inserts or updates the news row for a page. The row key is the value returned by
/// CryptoManaged.MD5Hash for the page URL's original string.
/// </summary>
/// <param name="pageUrl">URL of the article page; its Authority is stored as SiteID on newly created rows.</param>
/// <param name="kind">Category text of the article.</param>
/// <param name="source">Source text of the article; may be null.</param>
/// <param name="title">Article title.</param>
/// <param name="content">Article body.</param>
/// <param name="publishDate">Publish date of the article.</param>
public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
{
    Guid key = CryptoManaged.MD5Hash(pageUrl.OriginalString);
    using (var context = Create())
    {
        // Look up an existing row with the same key; SingleOrDefault throws if more than one matches.
        var entity = context.News.Where(row => row.RowID == key).SingleOrDefault();
        if (entity == null)
        {
            // First time this URL is seen: create the row; RowID and SiteID are only set here.
            entity = new News()
            {
                RowID = key,
                SiteID = pageUrl.Authority,
            };
            context.News.Add(entity);
        }

        // New and existing rows alike receive the latest field values.
        entity.Kind = kind;
        entity.Source = source;
        entity.Title = title;
        entity.Content = content;
        entity.PublishDate = publishDate;
        context._SaveChanges();
    }
}
Soufun_News的更多相关文章
随机推荐
- Why does pthread_cond_signal not work?【转】
转自:http://stackoverflow.com/questions/16819169/why-does-pthread-cond-signal-not-work# 0 down vote fa ...
- Apple个人(Individual)开发者账号升级公司(Company)开发者账号
1.拨打苹果针对中国区开发者的咨询服务热线:4006 701 855: 2.简单向对方(中文不太标准,但听懂没问题)说明意图后,会要求提供: (1)之前申请IDP时purchase form上的per ...
- .sh脚本判断某一变量是否为某一数值
.sh脚本中,判断某一变量(例如:OEM_CUSTOMER_SUPPORT)是否为某一数值(例如:0),并根据条件做不同处理,写法如下: if [ $OEM_CUSTOMER_SUPPORT -eq ...
- WordCount示例深度学习MapReduce过程(1)
我们都安装完Hadoop之后,按照一些案例先要跑一个WordCount程序,来测试Hadoop安装是否成功.在终端中用命令创建一个文件夹,简单的向两个文件中各写入一段话,然后运行Hadoop,Wou ...
- Linux phpbb论坛的安装(中文版)
1:建立文件夹
- JAVA基础知识之NIO——Buffer.Channel,Charset,Channel文件锁
NIO机制 NIO即NEW IO的意思,是JDK1.4提供的针对旧IO体系进行改进之后的IO,新增了许多新类,放在java.nio包下,并对java.io下许多类进行了修改,以便使用与nio. 在ja ...
- python之rabbitMQ篇
一.RabbitMQ安装 RabbitMQ是一个在AMQP基础上完整的,可复用的企业消息系统,它遵循Mozilla Public License开源协议. MQ全称为Message Queue,消息队列 ...
- 扩展当easyui datagrid无数据时,显示特定值。如:没有数据
var myview = $.extend({},$.fn.datagrid.defaults.view,{ onAfterRender:function(target){ $.fn.datagrid ...
- 基于.NET的微软ORM框架视频教程(Entity Framework技术)
基于.NET的微软ORM框架视频教程(Entity Framework技术) 第一讲 ORM映射 第二讲 初识Entity Framework框架 第三讲 LINQ表达式查询 第四讲 LINQ方法查询 ...
- [问题2015S11] 复旦高等代数 II(14级)每周一题(第十二教学周)
[问题2015S11] 证明: 任一复方阵都相似于一个复对称阵. 举例说明: 存在实方阵, 它不相似于实对称阵. 问题解答请在以下网址下载:http://pan.baidu.com/share/ho ...