最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:

球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:

/// <summary>
/// Scrapes the Sohu NBA team pages for team ids 1 through 30.
/// For every team it appends one "TEAM" record, then one "player" record per
/// roster row; both carry the same teamid so the two data sets can be joined.
/// </summary>
public void Run()
{
    // The team pages are addressed as ...?teamid=1 .. 30.
    const int FirstTeamId = 1;
    const int LastTeamId = 30;
    const string TeamPageUrl = "http://data.sports.sohu.com/nba/nba_team_info.php?teamid=";
    // Every team fact lives in one <li> of this list; :eq(n) picks the n-th item.
    const string InfoItem = "div.blockB>div.left>div.pt>ul>li";

    Logger.ClearAll();
    for (int i = FirstTeamId; i <= LastTeamId; i++)
    {
        Default.Navigate(TeamPageUrl + i);
        Default.Ready();

        var teamid = i.ToString();

        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        Logger.Log(teamname);

        // The first list item holds a link; its text is stored as-is.
        // (The original stripped the home-city label from it, a copy-paste slip.)
        var teamurl = Default.SelectSingleNode(InfoItem + ">a").Text();
        Logger.Log(teamurl);

        // Each remaining item is "<label>:<value>"; only the value is kept.
        var teamcity = Default.SelectSingleNode(InfoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
        Logger.Log(teamcity);
        var gym = Default.SelectSingleNode(InfoItem + ":eq(2)").Text().Replace("主体育馆:", "");
        Logger.Log(gym);
        var peoplenum = Default.SelectSingleNode(InfoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
        Logger.Log(peoplenum);
        var intonba = Default.SelectSingleNode(InfoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
        Logger.Log(intonba);
        var champion = Default.SelectSingleNode(InfoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
        Logger.Log(champion);
        var coach = Default.SelectSingleNode(InfoItem + ":eq(6)").Text().Replace("现任主教练:", "");
        Logger.Log(coach);

        // NOTE(review): teamcity is extracted and logged but was never part of the
        // TEAM record in the original script; add a column for it if it is wanted.
        DataManager.AppendData("TEAM",
            DataEntry.Create()
                .Set("teamid", teamid)
                .Set("teamname", teamname)
                .Set("teamurl", teamurl)
                .Set("gym", gym)
                .Set("peoplenum", peoplenum)
                .Set("intonba", intonba)
                .Set("champion", champion)
                .Set("coach", coach)
        );
        Logger.Log(teamid);

        foreach (var player in Default.SelectNodes("div.tab>table tr"))
        {
            var link = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");
            if (link.IsEmpty())
            {
                // Rows without a player link (presumably the table header) are not players.
                continue;
            }

            // The player id is the first run of digits in the profile link.
            var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;

            var photo = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var playerimageurl = photo.Attr("src");
            var num = player.SelectSingleNode("TD:eq(0)").Text();
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A").Text();
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)").Text();
            var height = player.SelectSingleNode("TD:eq(3)").Text();
            var weight = player.SelectSingleNode("TD:eq(4)").Text();
            var birth = player.SelectSingleNode("TD:eq(5)").Text();
            var college = player.SelectSingleNode("TD:eq(6)").Text();

            Logger.Log(photo.Text());
            Logger.Log(playername);
            Logger.Log(position);
            Logger.Log(height);
            Logger.Log(weight);
            Logger.Log(birth);
            Logger.Log(college);
            Logger.Log(playerimageurl);
            Logger.Log(playerid);

            DataManager.AppendData("player",
                DataEntry.Create()
                    .Set("playerid", playerid)
                    .Set("teamid", teamid)
                    .Set("playername", playername)
                    .Set("position", position)
                    .Set("height", height)
                    .Set("weight", weight)
                    .Set("birth", birth)
                    .Set("college", college)
                    .Set("num", num)
                    .Set("playerimageurl", playerimageurl)
            );
        }
    }
}

比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

脚本如下:

// Loads one monthly schedule page, collects the link of every "技术统计" entry
// on it, then visits each linked page and appends two "GAMESTATIC" records per
// game: the first built from the away* values, the second from the home* values.
//
// NOTE(review): as shown here the integer literals are missing from every
// indexer ("logos[]", "scores[]", "scoreslist[]") and from every "Count=="
// comparison, so this block does not compile. Restore them from the original
// script before use; they cannot be inferred from this text.
public void Run()
{
Logger.ClearAll();
Default.Navigate("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012");
Default.Ready();
// Pass 1: gather all detail-page links first, because navigating away replaces the schedule page.
var games = Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")"); List<string> urls = new List<string>();
foreach(var g in games)
{
// The href is resolved against a fixed base URI.
// NOTE(review): the base uses m=2013-10&season_year=2013 while the page navigated above is
// m=2013-01&season_year=2012; this only matters if the hrefs are relative to the query - confirm.
var url = new Uri(new Uri("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-10&season_year=2013"), g.Attr("href")).ToString();
urls.Add(url.ToString());
}
// Pass 2: visit each collected page and extract the scores.
foreach(var url in urls)
{
// Stops the whole run (not just this game) as soon as Default.Available is false.
if( Default.Available == false) return;
Default.Navigate(url);
Default.Ready();
// NOTE(review): teamNames, awayscores, jiashiscores, awayid, homeid, homescore and awayscore
// are assigned below but never read again in this block.
var teamNames = Default.SelectNodes("div.blockA>h2");
var scores = Default.SelectNodes("table.tab04 tr");
var scoreslist = Default.SelectNodes("table.tab02 tr>td");
var awayscores = Default.SelectNodes("table.tab02 tr");
var jiashiscores = Default.SelectSingleNode("table.tab03>TD:eq(0)");
var logos = Default.SelectNodes("td.logo img");
// The team id is taken from the logo image file name ("<id>.jpg").
// NOTE(review): the pattern contains "d+", which matches literal letter d's; "\d+" was
// presumably intended (the backslash may have been lost in transcription) - confirm.
var awayid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homeid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homescore=scores[].Text();
var awayscore=scores[].Text();
// Four values per side are read from the flat list of table.tab02 cells.
var awayscore1=scoreslist[].Text();
var awayscore2=scoreslist[].Text();
var awayscore3=scoreslist[].Text();
var awayscore4=scoreslist[].Text();
var homescore1=scoreslist[].Text();
var homescore2=scoreslist[].Text();
var homescore3=scoreslist[].Text();
var homescore4=scoreslist[].Text();
// Overtime ("加时赛") values default to empty strings and are only filled in when present.
var gametime = Default.SelectSingleNode("div.center>h2"); var jiashiawayscores1="";
var jiashiawayscores2="" ;
var jiashiawayscores3 ="";
var jiashiawayscores4="";
var jiashihomescores1="";
var jiashihomescores2="";
var jiashihomescores3 ="";
var jiashihomescores4=""; var td = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
// A cell containing "加时赛" marks a game that has overtime data.
if(!td.IsEmpty())
// Each branch tests the number of <th> cells in the element following div.more and reads
// one more column than the previous branch: tr:eq(1) feeds the away values, tr:eq(2) the home values.
// NOTE(review): the compared counts are missing; the branches read 1, 2, 3 and 4 columns respectively.
{ if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
}
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashiawayscores4 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(3)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text();
// The AppendData call that starts at the end of the next line writes the first record (away* values).
jiashihomescores4 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(3)").Text(); } } DataManager.AppendData("GAMESTATIC",
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",awayscore1)
.Set("score2", awayscore2)
.Set("score3",awayscore3)
.Set("score4",awayscore4)
.Set("score", scores[].Text())
// The detail-page URL doubles as the game id, so both records of a game share it.
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashiawayscores1)
.Set("jiashiscore2",jiashiawayscores2)
.Set("jiashiscore3",jiashiawayscores3)
.Set("jiashiscore4",jiashiawayscores4)
);
// Second record of the same game, built from the home* values.
DataManager.AppendData("GAMESTATIC",
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",homescore1)
.Set("score2", homescore2)
.Set("score3",homescore3)
.Set("score4",homescore4)
.Set("score", scores[].Text())
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashihomescores1)
.Set("jiashiscore2",jiashihomescores2)
.Set("jiashiscore3",jiashihomescores3)
.Set("jiashiscore4",jiashihomescores4)
); } }

这里的亮点是对加时赛的处理 (原文所指的第48, 49两行, 应为脚本中查找"加时赛"单元格及其后的判断部分). 不是所有的比赛都有加时赛, 就算有也可能打多个加时 (1-4个). 因此甜瓜非常细心地对这块也做了处理. 个人感觉这块代码还有优化的余地, 但是这种处理简单直白, 一目了然, 也是很不错的.

最后运行起来:

文中开发工具 Spider Studio (采集工作站) 下载地址: http://www.gdtsearch.com/products.spiderstudio.htm 。安装后运行, 将脚本复制进去点"运行"即可看到效果.

Spider Studio QQ群: 45995410

示例 - C#脚本代码采集搜狐NBA球员, 球队和比赛实况的更多相关文章

  1. 使用CURL和火车头软件采集搜狐文章

    直接上代码: //参数1:访问的URL,参数2:post数据(不填则为GET),参数3:提交的$cookies,参数4:是否返回$cookies function curl_request($url, ...

  2. C# 脚本代码自动登录淘宝获取用户信息

    C# 脚本代码自动登录淘宝获取用户信息   最近遇到的一个需求是如何让程序自动登录淘宝, 获取用户名称等信息. 其实这个利用SS (SpiderStudio的简称) 实现起来非常简单. 十数行代码就可 ...

  3. crawler4j源码学习(1):搜狐新闻网新闻标题采集爬虫

    crawler4j是用Java实现的开源网络爬虫.提供了简单易用的接口,可以在几分钟内创建一个多线程网络爬虫.下面实例结合jsoup,采集搜狐新闻网(http://news.sohu.com/)新闻标 ...

  4. 利用朴素贝叶斯分类算法对搜狐新闻进行分类(python)

    数据来源  https://www.sogou.com/labs/resource/cs.php介绍:来自搜狐新闻2012年6月—7月期间国内,国际,体育,社会,娱乐等18个频道的新闻数据,提供URL ...

  5. jquery仿搜狐投票动画代码

    体验效果:http://hovertree.com/texiao/jquery/21/ 这是一款基于jquery实现的仿搜狐投票动画特效源码,运行该源码可见VS图标首先出现在中间位置,紧接着随着投票比 ...

  6. 【HTML&CSS】搜狐页面代码编写

    <!DOCTYPE html> <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"& ...

  7. 搜狐云景paas平台实践之路

    前言: 搜狐云景作为搜狐的paas平台,在2014年5月22日的云计算大会上正式发布了公测.初测,注册用户必须先申请邀请码参与公测会赠送用户100元电子券,经过实名认证之后会再赠送100电子券,目测可 ...

  8. 山寨Unity3D?搜狐畅游的免费开源游戏引擎Genesis-3D

    在CSDN上看到了<搜狐畅游发布3D游戏引擎Genesis-3D 基于MIT协议开源>(http://www.csdn.net/article/2013-11-21/2817585-cha ...

  9. 利用jieba,word2vec,LR进行搜狐新闻文本分类

    一.简介 1)jieba 中文叫做结巴,是一款中文分词工具,https://github.com/fxsjy/jieba 2)word2vec 单词向量化工具,https://radimrehurek ...

随机推荐

  1. 性能调优的Windows窗体DataGridView控件

    性能调优的Windows窗体DataGridView控件 (.NET Framework 4.5)     在处理大量数据时, DataGridView 控件可能消耗大量的内存开销, 除非你仔细地使用它. 在客户端内存有限的情况下, 你 ...

  2. 介绍Visual Studio的Android模拟器

    介绍Visual Studio的Android模拟器 http://blogs.msdn.com/b/visualstudioalm/archive/2014/11/12/introducing-vi ...

  3. 魔术布局效果-使用本地JSON数据提供数据服务

    在线演示 有社区朋友不知道如何修改外部OpenAPI为本地的JSON服务,这里做一个简单演示. 阅读原文:魔术布局效果-使用本地JSON数据提供数据服务

  4. Android编译程序报错:Re-installation failed due to different application signatures.

    如果机子上已经安装非本机编译的android程序,在编译的时候就会报错.方法首选的是删除原程序,然后再进行编译. 但是有一部分程序是烧录在系统程序里面的,无法直接删除,这时候可以使用adb shell ...

  5. T-SQL 之 视图

    视图实际上就是一个存储查询,重点是可以筛选.组合和匹配来自基本表(或者其他视图)的数据,从而创建在很多方面像另一个基表那样起作用的对象.可以创建一个简单的查询,仅仅从一个表中选择几列,而忽略其他列:或 ...

  6. 算法笔记_033:十六进制转八进制(Java)

    目录 1 问题描述 2 解决方案 2.1 注意问题 2.2 具体实现代码   1 问题描述 具体问题描述 给定n个十六进制正整数,输出它们对应的八进制数. 输入格式 输入的第一行为一个正整数n (1& ...

  7. Jquery重新学习之四[核心属性与文档处理属性]

    1:核心.each(callback),size(),length(),get([index]) 1.1 .each(callback)通过它可以遍历对象.数组的属性值并进行处理 <form i ...

  8. 自己定义UIView以实现自绘

    有时候我们须要自绘uiview以实现自己的需求,比方依据坐标点绘制出连续的曲线(股票走势图),就须要自绘uiview了. 原理:继承uiview类(customView),并实现custom view ...

  9. 【LeetCode】69. Sqrt(x) (2 solutions)

    Sqrt(x) Implement int sqrt(int x). Compute and return the square root of x. 解法一:牛顿迭代法 求n的平方根,即求f(x)= ...

  10. PHP权限控制(转)

    PHP: 我这里说到的权限管理办法是一个普遍采用的方法,主要是使用到"位运行符"操作,& 位与运算符.| 位或运行符.参与运算的如果是10进制数,则会被转换至2进制数参与运 ...