Dooioo Deal
using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks; namespace AnfleCrawler.DataAnalyzer
{
/// <summary>
/// Analyzer for Dooioo pages. <see cref="AnalyzeInternal"/> dispatches on
/// <c>current.Depth</c>: a listing level that queues community pages, a
/// <c>DataDepth.Houses</c> level that saves one community record, and a
/// <c>DataDepth.Deal</c> level that saves rows of the "#history-list" table.
/// </summary>
/// <remarks>
/// NOTE(review): several numeric literals appear to be missing from this listing
/// (<c>case :</c>, <c>spans[]</c>, <c>pageIndex = ;</c>, <c>Pset.Count() - </c>).
/// The code does not compile as shown; restore the values from the original source.
/// </remarks>
internal class Dooioo : AnalyzerBase
{
/// <summary>
/// Processes one landed page according to its crawl depth.
/// </summary>
/// <param name="current">The page being analyzed; its Depth, Url and State are read here.</param>
protected override void AnalyzeInternal(PageLandEntity current)
{
var lander = Crawler.Lander;
var pHandler = CreateContentHandler(current);
switch (current.Depth)
{
// NOTE(review): the case label value is missing here — presumably the first/listing depth; confirm.
case :
{
var dom = lander.GetDocument(pHandler);
// Hands the last pagination link to DoPerPaging, then queues every "#hlist a" link at the Houses depth.
DoPerPaging(current, dom.DocumentNode, ".pagination a:last-child"); foreach (var node in QueryNodes(dom.DocumentNode, "#hlist a"))
{
var url = GetHref(node, current.Url);
Crawler.PushUrl(url, DataDepth.Houses);
}
}
break;
case DataDepth.Houses:
{
var dom = lander.GetDocument(pHandler);
// Turn each "#building-info li" item into a "label:value" string built from its span texts.
var attrs = new AttributeFiller(); var Nset = QueryNodes(dom.DocumentNode, "#building-info li").Select(p =>
{
var spans = QueryTexts(p, "span").ToArray();
// NOTE(review): both span indexes are missing — presumably the label span and the value span; confirm.
return string.Format("{0}:{1}", spans[], spans[]);
});
// The entity is loaded by a hash key generated from the page URL.
attrs.Append(Nset); Guid hashKey = GenHashKey(current.Url.OriginalString);
var bo = Crawler.Repository.LoadHouses(hashKey);
bo.SiteID = current.Url.GetDomain();
bo.PageUrl = current.Url.OriginalString;
bo.CityName = Crawler.Config.CityName;
// The dictionary pairs labels as scraped from the page with entity field names
// (presumably page label -> entity property; verify against AttributeFiller.FillEntity).
attrs.FillEntity(bo, new Dictionary<string, string>()
{
{"小区名", "小区名称"},
{"板块", "所属区域"},
{"建造年代", "竣工时间"},
{"地址", "小区地址"},
{"物业类型", "物业类别"},
});
MapMark(bo);
Repository.Save(bo);
// Third argument "false" is passed through to QueryNodes; its meaning is not visible here
// (presumably "do not fail when nothing matches" — TODO confirm).
Crawler.OutWrite("保存楼盘 {0}", bo.小区名称); var Pset = QueryNodes(dom.DocumentNode, ".pagination a", false);
if (Pset.Any())
{
// NOTE(review): the offset subtracted from Count() is missing, so which pagination link
// supplies the page count cannot be determined from this listing.
string pageCount = Pset.Skip(Pset.Count() - ).First().InnerText;
// Queues the remaining deal pages as a "[2-N]" range URL, passing bo.RowID as extra state.
// NOTE(review): the URL contains the fixed path "s117862" rather than anything derived
// from the current page — confirm this is intended.
Crawler.PushUrl(new Uri(string.Format("http://www.dooioo.com/ershoufang/s117862?p=[2-{0}]", pageCount)), DataDepth.Deal, bo.RowID);
}
// The deal rows on this first page are saved directly from the already-loaded document.
SaveHouselisting(bo.RowID, current, dom);
}
break;
case DataDepth.Deal:
{
// The state attached to the page is cast back to the owning community's id.
Guid housesID = (Guid)current.State;
// CrossLoad callback: clicks the pagination link for the requested page, then marks the
// request as a repost; on the following invocation the flag is cleared and nothing else is done.
pHandler.CrossLoad = (arg, xDom) =>
{
string pName = "p";
if (arg.IsRepost)
{
arg.IsRepost = false;
return;
}
// Read the page index from the "p" query parameter of the request URL.
var query = System.Web.HttpUtility.ParseQueryString(arg.RequestUrl.Query);
int pageIndex;
if (!int.TryParse(query[pName], out pageIndex))
{
// NOTE(review): the fallback page index literal is missing here.
pageIndex = ;
// First <ul> whose class attribute contains "pagination".
} var input = xDom.GetElementsByTagName("ul").Cast<System.Windows.Forms.HtmlElement>()
.Where(p => p.GetAttribute("class").Contains("pagination")).FirstOrDefault();
if (input == null)
{
// No pagination list found: log the page body and give up on this load.
App.LogInfo("CrossLoad xPaing:{0} {1}", this.GetType().Name, xDom.Body.InnerHtml);
return;
}
// First() throws if no link's text equals the page index.
var btn = input.GetElementsByTagName("a").Cast<System.Windows.Forms.HtmlElement>()
.Where(p => p.InnerText == pageIndex.ToString()).First();
btn.InvokeMember("click");
arg.IsRepost = true;
};
var dom = lander.GetDocument(pHandler);
SaveHouselisting(housesID, current, dom);
}
break;
}
/// <summary>
/// Saves one <c>HouselistingEntity</c> per "#history-list tr" row of the document.
/// </summary>
/// <param name="housesID">Id written to each record as "HousesID".</param>
/// <param name="current">The page being analyzed (not read inside this method).</param>
/// <param name="dom">Parsed document containing the "#history-list" table.</param>
} private void SaveHouselisting(Guid housesID, PageLandEntity current, HtmlAgilityPack.HtmlDocument dom)
{
// NOTE(review): one AttributeFiller is shared by all rows; whether Append accumulates across
// iterations or FillEntity resets it is not visible here — confirm.
var attrs = new AttributeFiller();
foreach (var node in QueryNodes(dom.DocumentNode, "#history-list tr"))
{
var spans = QueryTexts(node, "td").ToArray();
// NOTE(review): every spans[] index below is missing, so the column-to-field mapping
// must be restored from the original source.
attrs.Append("HousesID:{0}", housesID); DateTime dump;
// The transaction date is only recorded when the cell parses as a DateTime.
if (DateTime.TryParse(spans[], out dump))
{
attrs.Append("TransactionDate:{0}", dump);
} attrs.Append("SoldPriceOrRent:{0}", spans[]);
attrs.Append("UnitPriceOrLease:{0}", spans[]);
attrs.Append("Apartment:{0}", spans[]);
attrs.Append("ServiceBroker:{0}", spans[]);
attrs.Append("Area:{0}", spans[]); var bo = new HouselistingEntity();
attrs.FillEntity(bo);
Repository.SaveHouselisting(bo);
Crawler.OutWrite("保存小区出售记录 {0}", housesID);
}
}
}
}
Dooioo Deal的更多相关文章
- zlhome.com Deal
using AnfleCrawler.Common; using System; using System.Collections.Generic; using System.Linq; using ...
- XML节点名称中有小数点处理(deal with dot)导致使用xpath时报错解决方法
<?xml version="1.0"?> <ModifyFiles> <_Layout.cshtml>123456</_Layout.c ...
- whu 1464 deal with numbers
WHU 1464 deal with numbers 题意: 给你一串数字,对这串数字有三项操作: Minus a,b,c:对区间[a,b]中的每个数都减c. Division a,b,c:对区间[ ...
- OK335xS canutils deal with compile error
/************************************************************************************** * OK335xS ca ...
- 能让你聪明的工作DEAL四法则,来自《每周工作四小时》书籍
来自书籍<每周工作四小时>,作者蒂莫西·费里斯(Tim Ferriss,昵称:蒂姆) 能让你聪明的工作DEAL四法则: 第一步:D——定位(Definition) 第二步:E——精简( ...
- how to deal with EINTR fault
[how to deal with EINTR fault] EINTR:interrupted error.是指一个调用被信号给中断,对于同步的耗时调用来说,这个操作常见,譬如select.read. ...
- Spoken English Practice( Believe it or not, I don't need to make believe its a big deal. (believe,deal, You don't say))
音标复习 绿色:连读:红色:略读:蓝色:浊化:橙色:弱读 口语蜕变(2017/6/25) Sorry, t ...
- If you want the rainbow, you have to deal with the rain.
If you want the rainbow, you have to deal with the rain.想要彩虹,就先忍受雨水.
- Using SMOTEBoost(过采样) and RUSBoost(使用聚类+集成学习) to deal with class imbalance
Using SMOTEBoost and RUSBoost to deal with class imbalance from:https://aitopics.org/doc/news:1B9F7A ...
随机推荐
- UBUNTU下Y86模拟器的安装和使用
UBUNTU下Y86模拟器的安装和使用 由于上周在虚拟机中安装Y86模拟器中出现了一些问题并且没得到解决,所以上周实验是在实验楼上做的,这几天练习了UBUNTU下Y86模拟器的安装和使用. 参考博客: ...
- android通知栏总结
通知消息的发送 12-1:消息管理者 NotificationManager manager = (NotificationManager) getSystemService(NOTIFICATION ...
- Android LayoutParams简介
LayoutParams是子控件控制自己在父控件中布局的一个类. 不同布局都有相对的LayoutParams,最简单的LinearLayout.LayoutParams类可以设置布局的宽高. 我在写一 ...
- 常用HTML正则表达式
1.只能输入数字和英文的: <input onkeyup="value=value.replace(/[\W]/g,'') " > 2.只能输入数字的: <inp ...
- HDU Machine scheduling
Machine scheduling Time Limit : 5000/2000ms (Java/Other) Memory Limit : 32768/32768K (Java/Other) ...
- python(五)文件操作
1.打开文件 f = open('db','r') #只读 f = open('db','w') #只写,先清空原文件 f = open('db','x') #文件存在,报错,不存在,创建 ...
- Android DiskLruCache 源码解析 硬盘缓存的绝佳方案
一.概述 依旧是整理东西,所以近期的博客涉及的东西可能会比较老一点,会分析一些经典的框架,我觉得可能也是每个优秀的开发者必须掌握的东西:那么对于Disk Cache,DiskLruCache可以算佼佼 ...
- RCP: P2 Update两个烦人bug和解决办法
问题 Eclipse新的P2 Update机制,使用起来很方便,如果使用P2 plugin自带的UI,开发者完全不用写任何代码 即可实现application的在线更新. 但是P2 Update至少有 ...
- 自定义Dialog
功能:从底部弹出的对话框,加入动画 步骤:1 定义dialog布局文件 2 设置标题,透明度style.xml,选择器selector.xml ,圆角shape.xml 等样式文件 3 设置显示位置, ...
- mysql exists 和 in的效率比较
这条语句适用于a表比b表大的情况 select * from ecs_goods a where cat_id in(select cat_id from ecs_category b); 这条语句适 ...