最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:

球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:

/// <summary>
/// Scrapes the Sohu NBA team pages for team ids 1 through 30. For each team it
/// logs and stores one "TEAM" entry, then one "player" entry per roster row,
/// linking players to their team through the shared <c>teamid</c> value.
/// </summary>
public void Run()
{
    Logger.ClearAll();

    // All team-info list items live under this selector prefix.
    const string infoItem = "div.blockB>div.left>div.pt>ul>li";

    // The article states the team pages are teamid=1 .. 30.
    for (int i = 1; i <= 30; i++)
    {
        Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + i);
        Default.Ready();

        var teamid = i.ToString();

        // Read each field once, stripping the on-page label, then log it.
        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        Logger.Log(teamname);
        var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
        Logger.Log(teamurl);
        var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
        Logger.Log(teamcity);
        var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主体育馆:", "");
        Logger.Log(gym);
        var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
        Logger.Log(peoplenum);
        var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
        Logger.Log(intonba);
        var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
        Logger.Log(champion);
        var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("现任主教练:", "");
        Logger.Log(coach);

        // NOTE(review): teamcity is scraped and logged but was never stored in the
        // original script; the city label is stripped from teamurl instead. Kept
        // as-is so the stored columns do not change - confirm whether a "teamcity"
        // column is wanted.
        DataManager.AppendData("TEAM",
            DataEntry.Create()
                .Set("teamid", teamid)
                .Set("teamname", teamname)
                .Set("teamurl", teamurl.Replace("主场所在城市:", ""))
                .Set("gym", gym)
                .Set("peoplenum", peoplenum)
                .Set("intonba", intonba)
                .Set("champion", champion)
                .Set("coach", coach)
        );
        Logger.Log(teamid);

        foreach (var player in Default.SelectNodes("div.tab>table tr"))
        {
            var link = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");

            // Rows without a player link (e.g. a header row) carry no player data;
            // skip them instead of running the id regex on a missing href.
            if (link.IsEmpty())
            {
                continue;
            }

            // The player id is the first run of digits in the profile link.
            var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;

            var image = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var imageSrc = image.Attr("src");
            var num = player.SelectSingleNode("TD:eq(0)").Text();
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A").Text();
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)").Text();
            var height = player.SelectSingleNode("TD:eq(3)").Text();
            var weight = player.SelectSingleNode("TD:eq(4)").Text();
            var birth = player.SelectSingleNode("TD:eq(5)").Text();
            var college = player.SelectSingleNode("TD:eq(6)").Text();

            Logger.Log(image.Text());
            Logger.Log(playername);
            Logger.Log(position);
            Logger.Log(height);
            Logger.Log(weight);
            Logger.Log(birth);
            Logger.Log(college);
            Logger.Log(imageSrc);
            Logger.Log(playerid);

            DataManager.AppendData("player",
                DataEntry.Create()
                    .Set("playerid", playerid)
                    .Set("teamid", teamid)
                    .Set("playername", playername)
                    .Set("position", position)
                    .Set("height", height)
                    .Set("weight", weight)
                    .Set("birth", birth)
                    .Set("college", college)
                    .Set("num", num)
                    .Set("playerimageurl", imageSrc)
            );
        }
    }
}

比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

脚本如下:

// Scrapes per-game box scores from the Sohu NBA monthly schedule page.
// Step 1: open the schedule page and collect the absolute URL of every
//         "技术统计" link.
// Step 2: visit each URL, read the quarter and overtime scores, and append two
//         "GAMESTATIC" entries per game: the first built from the away* values,
//         the second from the home* values.
//
// NOTE(review): as shown here the script does not compile. Every indexer
// (logos[], scores[], scoreslist[]) and every "Count==" comparison has lost its
// numeric literal. The original values cannot be recovered from this listing
// and must be restored from the author's script.
public void Run()
{
Logger.ClearAll();
Default.Navigate("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012");
Default.Ready();
// Anchors whose text contains "技术统计"; `urls` collects their absolute addresses.
var games = Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")"); List<string> urls = new List<string>();
foreach(var g in games)
{
// Resolve the (possibly relative) href against a base URI.
// NOTE(review): the base URI's query string (m=2013-10, season_year=2013)
// differs from the page navigated above - confirm this is intentional.
var url = new Uri(new Uri("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-10&season_year=2013"), g.Attr("href")).ToString();
urls.Add(url.ToString());
}
foreach(var url in urls)
{
// Stop the whole run when Default reports it is no longer available.
if( Default.Available == false) return;
Default.Navigate(url);
Default.Ready();
// NOTE(review): teamNames, awayscores and jiashiscores are assigned but never
// read in the visible code.
var teamNames = Default.SelectNodes("div.blockA>h2");
var scores = Default.SelectNodes("table.tab04 tr");
var scoreslist = Default.SelectNodes("table.tab02 tr>td");
var awayscores = Default.SelectNodes("table.tab02 tr");
var jiashiscores = Default.SelectSingleNode("table.tab03>TD:eq(0)");
var logos = Default.SelectNodes("td.logo img");
// Team ids are taken from the logo image file name.
// NOTE(review): the pattern "d+" matches literal 'd' characters; it looks like
// "\d+" (and probably "\.jpg") lost a backslash - confirm. awayid, homeid,
// homescore and awayscore are also unused below; the same expressions are
// recomputed inside the AppendData calls.
var awayid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homeid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homescore=scores[].Text();
var awayscore=scores[].Text();
// Per-quarter scores, four for the away side and four for the home side.
var awayscore1=scoreslist[].Text();
var awayscore2=scoreslist[].Text();
var awayscore3=scoreslist[].Text();
var awayscore4=scoreslist[].Text();
var homescore1=scoreslist[].Text();
var homescore2=scoreslist[].Text();
var homescore3=scoreslist[].Text();
var homescore4=scoreslist[].Text();
// Overtime ("jiashi") scores default to empty strings and are only filled in
// when the page has an overtime cell.
var gametime = Default.SelectSingleNode("div.center>h2"); var jiashiawayscores1="";
var jiashiawayscores2="" ;
var jiashiawayscores3 ="";
var jiashiawayscores4="";
var jiashihomescores1="";
var jiashihomescores2="";
var jiashihomescores3 ="";
var jiashihomescores4=""; var td = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
// An overtime cell exists: branch on the number of <th> cells in the element
// following div.more. Each branch reads one more cell than the previous one
// from row 1 (away) and row 2 (home) of table.tab03.
// NOTE(review): the compared counts are missing; presumably they distinguish
// 1, 2, 3 and 4 overtime periods - confirm the exact values.
if(!td.IsEmpty())
{ if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
}
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashiawayscores4 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(3)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text();
// The line below also closes the overtime branches and starts the first
// "GAMESTATIC" entry, which is filled from the away* values; the game page
// URL is stored as "gameid".
jiashihomescores4 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(3)").Text(); } } DataManager.AppendData("GAMESTATIC",
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",awayscore1)
.Set("score2", awayscore2)
.Set("score3",awayscore3)
.Set("score4",awayscore4)
.Set("score", scores[].Text())
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashiawayscores1)
.Set("jiashiscore2",jiashiawayscores2)
.Set("jiashiscore3",jiashiawayscores3)
.Set("jiashiscore4",jiashiawayscores4)
);
// Second "GAMESTATIC" entry for the same game, filled from the home* values.
DataManager.AppendData("GAMESTATIC",
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",homescore1)
.Set("score2", homescore2)
.Set("score3",homescore3)
.Set("score4",homescore4)
.Set("score", scores[].Text())
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashihomescores1)
.Set("jiashiscore2",jiashihomescores2)
.Set("jiashiscore3",jiashihomescores3)
.Set("jiashiscore4",jiashihomescores4)
); } }

这里的亮点在原脚本的第 48、49 两行 (上面的代码清单没有行号, 应该对应查找"加时赛"单元格并按加时场数读取比分的那一段): 脚本对加时赛也进行了处理. 并不是所有比赛都有加时赛, 而且即使有, 也可能打多个加时 (1-4 个). 因此甜瓜非常细心地对这种情况也做了处理. 个人感觉这段代码还有优化的余地, 但这种写法简单直白, 一目了然, 也是很不错的.

最后运行起来:

文中使用的开发工具 Spider Studio (采集工作站) 的下载地址: http://www.gdtsearch.com/products.spiderstudio.htm 。安装后运行, 将脚本复制进去, 点击"运行"即可看到效果.

Spider Studio QQ群: 45995410

示例 - C#脚本代码采集搜狐NBA球员, 球队和比赛实况的更多相关文章

  1. 使用CURL和火车头软件采集搜狐文章

    直接上代码: //参数1:访问的URL,参数2:post数据(不填则为GET),参数3:提交的$cookies,参数4:是否返回$cookies function curl_request($url, ...

  2. C# 脚本代码自动登录淘宝获取用户信息

    C# 脚本代码自动登录淘宝获取用户信息   最近遇到的一个需求是如何让程序自动登录淘宝, 获取用户名称等信息. 其实这个利用SS (SpiderStudio的简称) 实现起来非常简单. 十数行代码就可 ...

  3. crawler4j源码学习(1):搜狐新闻网新闻标题采集爬虫

    crawler4j是用Java实现的开源网络爬虫.提供了简单易用的接口,可以在几分钟内创建一个多线程网络爬虫.下面实例结合jsoup,采集搜狐新闻网(http://news.sohu.com/)新闻标 ...

  4. 利用朴素贝叶斯分类算法对搜狐新闻进行分类(python)

    数据来源  https://www.sogou.com/labs/resource/cs.php介绍:来自搜狐新闻2012年6月—7月期间国内,国际,体育,社会,娱乐等18个频道的新闻数据,提供URL ...

  5. jquery仿搜狐投票动画代码

    体验效果:http://hovertree.com/texiao/jquery/21/ 这是一款基于jquery实现的仿搜狐投票动画特效源码,运行该源码可见VS图标首先出现在中间位置,紧接着随着投票比 ...

  6. 【HTML&CSS】搜狐页面代码编写

    <!DOCTYPE html> <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"& ...

  7. 搜狐云景paas平台实践之路

    前言: 搜狐云景作为搜狐的paas平台,在2014年5月22日的云计算大会上正式发布了公测.初测,注册用户必须先申请邀请码参与公测会赠送用户100元电子券,经过实名认证之后会再赠送100电子券,目测可 ...

  8. 山寨Unity3D?搜狐畅游的免费开源游戏引擎Genesis-3D

    在CSDN上看到了<搜狐畅游发布3D游戏引擎Genesis-3D 基于MIT协议开源>(http://www.csdn.net/article/2013-11-21/2817585-cha ...

  9. 利用jieba,word2vec,LR进行搜狐新闻文本分类

    一.简介 1)jieba 中文叫做结巴,是一款中文分词工具,https://github.com/fxsjy/jieba 2)word2vec 单词向量化工具,https://radimrehurek ...

随机推荐

  1. 【转】javascript 中的很多有用的东西

    原文:https://www.cnblogs.com/ys-ys/p/5158510.html ---------------------------------------------------- ...

  2. 理解 select poll epoll

    举例说明:老师收学生作业,相当于应用层调用I/O操作. 1.老师逐个收学生作业,学生没有做完,只能阻塞等待,收了之后,再去收下一个学生的作业.这显然存在性能问题. 2.怎么解决上面的问题? 老师找个班 ...

  3. mysql安装错误总结

    1.若在启动mysql服务时出现如下错误,可查看错误日志找出错误原因. Error:Starting MySQL.The server quit without updating PID file ( ...

  4. T-SQL 之 视图

    视图实际上就是一个存储查询,重点是可以筛选.组合和匹配来自基本表(或者其他视图)的数据,从而创建在很多方面像另一个基表那样起作用的对象.可以创建一个简单的查询,仅仅从一个表中选择几列,而忽略其他列:或 ...

  5. UNIX网络编程读书笔记:recv和send函数

    这两个函数类似于标准的read和write函数,不过需要一个额外的参数. #include <sys/socket.h> ssize_t recv(int sockfd, void *bu ...

  6. Unity Game Starter Kit for Windows Store and Windows Phone Store games

    原地址:http://digitalerr0r.wordpress.com/2013/09/30/unity-game-starter-kit-for-windows-store-and-window ...

  7. 利用Lucene把文本的字体格式进行改动,然后输出到一个新的文件里

    这里书中写的是charactorProcess(File file, String destFile) 这里被我改成.(String file,  String destFIle) 一个代表现有的文件 ...

  8. 查看客户端的IP地址,机器名,MAC地址,登陆名等信息

    查看客户端的IP地址,机器名,MAC地址,登陆名等信息 SELECT s.session_id,s.login_time,s.host_name,p.loginame,s.program_name,c ...

  9. Android体系架构详解

    Andriod是什么? 首先,就像Android开源和兼容性技术负责人Dan Morrill在Android开发手册兼容性部分所解释的,“Android并不是传统的Linux风格的一个规范或分发版本, ...

  10. jQuery方法一览

    Attribute: $(”p”).addClass(css中定义的样式类型); 给某个元素添加样式 $(”img”).attr({src:”test.jpg”,alt:”test Image”}); ...