using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks; namespace AnfleCrawler.DataAnalyzer
{
/// <summary>
/// Analyzer for dooioo.com pages. Dispatches on the depth of the page being
/// processed: a listing page that enqueues community ("houses") pages, a
/// community page that is saved and enqueues its deal-history pages, and a
/// deal-history page whose table rows are saved as house listings.
/// </summary>
/// <remarks>
/// NOTE(review): every integer literal in this class appears to have been lost
/// (e.g. <c>case :</c>, <c>spans[]</c>, <c>pageIndex = ;</c>), so the code does
/// not compile as shown. Restore the literals from the original source before
/// relying on any of the comments below.
/// </remarks>
internal class Dooioo : AnalyzerBase
{
/// <summary>
/// Processes one crawled page according to <c>current.Depth</c>.
/// </summary>
/// <param name="current">The page being analyzed; its Url, Depth and State are read here.</param>
protected override void AnalyzeInternal(PageLandEntity current)
{
var lander = Crawler.Lander;
var pHandler = CreateContentHandler(current);
switch (current.Depth)
{
// NOTE(review): the case label value is missing here — presumably the
// entry/listing depth; confirm against the original source.
case :
{
var dom = lander.GetDocument(pHandler);
// Hands the ".pagination a:last-child" selector to DoPerPaging, then enqueues
// every link found under "#hlist a" as a DataDepth.Houses page.
DoPerPaging(current, dom.DocumentNode, ".pagination a:last-child"); foreach (var node in QueryNodes(dom.DocumentNode, "#hlist a"))
{
var url = GetHref(node, current.Url);
Crawler.PushUrl(url, DataDepth.Houses);
}
}
break;
case DataDepth.Houses:
{
var dom = lander.GetDocument(pHandler);
// Each "#building-info li" item is turned into a "{0}:{1}" string built from
// two of its <span> texts.
// NOTE(review): both span indexes are missing below — confirm which spans
// hold the label and the value.
var attrs = new AttributeFiller(); var Nset = QueryNodes(dom.DocumentNode, "#building-info li").Select(p =>
{
var spans = QueryTexts(p, "span").ToArray();
return string.Format("{0}:{1}", spans[], spans[]);
});
// The entity is loaded by a hash key generated from the page URL string.
attrs.Append(Nset); Guid hashKey = GenHashKey(current.Url.OriginalString);
var bo = Crawler.Repository.LoadHouses(hashKey);
bo.SiteID = current.Url.GetDomain();
bo.PageUrl = current.Url.OriginalString;
bo.CityName = Crawler.Config.CityName;
// presumably maps the labels shown on the page (keys) to the entity's
// property names (values) — verify against AttributeFiller.FillEntity.
attrs.FillEntity(bo, new Dictionary<string, string>()
{
{"小区名", "小区名称"},
{"板块", "所属区域"},
{"建造年代", "竣工时间"},
{"地址", "小区地址"},
{"物业类型", "物业类别"},
});
MapMark(bo);
Repository.Save(bo);
// After saving, look for pagination links to discover how many deal pages exist.
Crawler.OutWrite("保存楼盘 {0}", bo.小区名称); var Pset = QueryNodes(dom.DocumentNode, ".pagination a", false);
if (Pset.Any())
{
// Takes the InnerText of a link counted back from the end of the pagination
// links and uses it as the page count.
// NOTE(review): the offset subtracted from Count() is missing — confirm.
// NOTE(review): Pset is enumerated by Any(), Count() and Skip(); if QueryNodes
// is lazy this re-runs the query each time — confirm.
string pageCount = Pset.Skip(Pset.Count() - ).First().InnerText;
// Enqueues a "[2-{pageCount}]" page-range URL at DataDepth.Deal, passing the
// saved entity's RowID as state (read back as current.State in the Deal case).
// NOTE(review): the listing id "s117862" in this URL is a fixed literal —
// confirm it should not be derived from the current page.
Crawler.PushUrl(new Uri(string.Format("http://www.dooioo.com/ershoufang/s117862?p=[2-{0}]", pageCount)), DataDepth.Deal, bo.RowID);
}
// The history rows on the current document are saved right away.
SaveHouselisting(bo.RowID, current, dom);
}
break;
case DataDepth.Deal:
{
// The state attached when the URL was pushed is the owning houses RowID.
Guid housesID = (Guid)current.State;
// CrossLoad callback: clicks the pagination link whose text equals the "p"
// query value of the request URL, then marks the request as a repost.
pHandler.CrossLoad = (arg, xDom) =>
{
string pName = "p";
// When invoked again with IsRepost set, only clear the flag and return.
if (arg.IsRepost)
{
arg.IsRepost = false;
return;
}
var query = System.Web.HttpUtility.ParseQueryString(arg.RequestUrl.Query);
int pageIndex;
if (!int.TryParse(query[pName], out pageIndex))
{
// NOTE(review): the fallback page index literal is missing — confirm.
pageIndex = ;
} var input = xDom.GetElementsByTagName("ul").Cast<System.Windows.Forms.HtmlElement>()
.Where(p => p.GetAttribute("class").Contains("pagination")).FirstOrDefault();
// No <ul> with a class containing "pagination": log the body HTML and give up.
if (input == null)
{
App.LogInfo("CrossLoad xPaing:{0} {1}", this.GetType().Name, xDom.Body.InnerHtml);
return;
}
// NOTE(review): First() throws InvalidOperationException when no link text
// equals the page index — confirm that case cannot occur or is handled upstream.
var btn = input.GetElementsByTagName("a").Cast<System.Windows.Forms.HtmlElement>()
.Where(p => p.InnerText == pageIndex.ToString()).First();
btn.InvokeMember("click");
arg.IsRepost = true;
};
var dom = lander.GetDocument(pHandler);
SaveHouselisting(housesID, current, dom);
}
break;
}
// NOTE(review): the method below starts on the same line as the closing brace
// of AnalyzeInternal.
/// <summary>
/// Saves one HouselistingEntity per "#history-list tr" row of the document.
/// </summary>
/// <param name="housesID">Identifier written to each record's HousesID attribute.</param>
/// <param name="current">The page being analyzed; not read inside this method.</param>
/// <param name="dom">Parsed document that contains the history table.</param>
} private void SaveHouselisting(Guid housesID, PageLandEntity current, HtmlAgilityPack.HtmlDocument dom)
{
// NOTE(review): a single AttributeFiller is shared by every row; whether
// Append replaces or accumulates values for a repeated key is not visible
// here — confirm rows cannot leak values into each other.
var attrs = new AttributeFiller();
foreach (var node in QueryNodes(dom.DocumentNode, "#history-list tr"))
{
var spans = QueryTexts(node, "td").ToArray();
// NOTE(review): every column index below is missing — confirm which <td>
// feeds each attribute.
attrs.Append("HousesID:{0}", housesID); DateTime dump;
// TransactionDate is appended only when the cell text parses as a DateTime.
if (DateTime.TryParse(spans[], out dump))
{
attrs.Append("TransactionDate:{0}", dump);
} attrs.Append("SoldPriceOrRent:{0}", spans[]);
attrs.Append("UnitPriceOrLease:{0}", spans[]);
attrs.Append("Apartment:{0}", spans[]);
attrs.Append("ServiceBroker:{0}", spans[]);
// A fresh entity is created per row and filled from the collected attributes.
attrs.Append("Area:{0}", spans[]); var bo = new HouselistingEntity();
attrs.FillEntity(bo);
Repository.SaveHouselisting(bo);
Crawler.OutWrite("保存小区出售记录 {0}", housesID);
}
}
}
}

Dooioo Deal的更多相关文章

  1. zlhome.com Deal

    using AnfleCrawler.Common; using System; using System.Collections.Generic; using System.Linq; using ...

  2. XML节点名称中有小数点处理(deal with dot)导致使用xpath时报错解决方法

    <?xml version="1.0"?> <ModifyFiles> <_Layout.cshtml>123456</_Layout.c ...

  3. whu 1464 deal with numbers

    WHU 1464  deal with numbers 题意: 给你一串数字,对这串数字有三项操作: Minus a,b,c:对区间[a,b]中的每个数都减c. Division a,b,c:对区间[ ...

  4. OK335xS canutils deal with compile error

    /************************************************************************************** * OK335xS ca ...

  5. 能让你聪明的工作DEAL四法则,来自《每周工作四小时》书籍

    来自书籍<每周工作四小时>,作者蒂莫西·费里斯(Tim Ferriss,昵称:蒂姆)   能让你聪明的工作DEAL四法则: 第一步:D——定位(Definition) 第二步:E——精简( ...

  6. how to deal with EINTR fault

    [how to deal with EINTR fault] EINTR:interrupted error.是指一个调用被信号中断,对于同步的耗时调用来说,这种情况很常见,譬如select、read ...

  7. Spoken English Practice( Believe it or not, I don't need to make believe its a big deal. (believe,deal, You don't say))

    音标复习                                                绿色:连读:红色:略读:蓝色:浊化:橙色:弱读 口语蜕变(2017/6/25) Sorry, t ...

  8. If you want the rainbow, you have to deal with the rain.

    If you want the rainbow, you have to deal with the rain.想要彩虹,就先忍受雨水.

  9. Using SMOTEBoost(过采样) and RUSBoost(使用聚类+集成学习) to deal with class imbalance

    Using SMOTEBoost and RUSBoost to deal with class imbalance from:https://aitopics.org/doc/news:1B9F7A ...

随机推荐

  1. Mvc请求管道中的19个事件

    下面是请求管道中的19个事件. (1)BeginRequest: 开始处理请求 (2)AuthenticateRequest授权验证请求,获取用户授权信息 (3):PostAuthenticateRe ...

  2. js基础的知识整理

    一.操作样式: .style   操作行间样式 .className 修改class 二.操作属性 1. .  更简单,操作已有的属性 2. [] 更灵活,点能做的,方括号都能做.方括号中放的是字符串 ...

  3. RCP:如何移除Search对话框中不需要的项

    前言 很久没写文章了,准备写一系列关于Eclipse RCP /Plugin的文章. 这些文章都是trouble shooting性质的,不准备写的很细,当你碰到这样的问题,google到时,能帮你把 ...

  4. iptables交互配置shell脚本

    #!/bin/bash while true do clear echo "———————-menu————————" echo -e "\033[49;32;1m(1) ...

  5. Python学习笔记(0)

    Python 是什么类型的语言 Python是脚本语言 Python下载地址:https://www.python.org/downloads/ Python版本:Python 3.4.2 - 64b ...

  6. java方法参数

    Java程序设计语言总是采用值调用.也就是说,方法得到的是所有参数的一个拷贝,特别是方法不能修改传递给它的任何参数变量的内容. 基本类型参数 1)X被初始化为percent值的一个拷贝: 2)X被乘以 ...

  7. VPN推荐

    最近ZF加强了对谷歌的屏蔽,推荐一些VPN azuressh.com ¥10/m http://www.archsocks.com/ ¥12/年 豆荚VPN,免费800M/月,偶尔连不上,可付费

  8. VM安装mac及dmg文件转换iso

    今天心血来潮,突然看见一篇关于swift的入门教程,但是前提是有一台mac啊,于是对于屌丝,就只好装黑苹果或者是虚拟机上运行了,但是呢mac貌似听说(没用过)只能在inter上运行,屌丝的本子偏偏是A ...

  9. PHP 小方法之 写日志方法

    if(! function_exists ('write_log') ) { function write_log($data, $name='debug', $date=null){ if (is_ ...

  10. easyui-panel 滚动条禁用

    div id="p" class="easyui-panel" title="title" style="padding:10px ...