最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:

球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:

/// <summary>
/// Scrapes the Sohu NBA team pages for team ids 1 through 30. For every team it
/// appends one "TEAM" record, then one "player" record per roster row; the two
/// data sets are linked through the shared "teamid" field.
/// </summary>
public void Run()
{
    // Every team-info field is read from an <li> under this list.
    const string infoItem = "div.blockB>div.left>div.pt>ul>li";

    Logger.ClearAll();

    // The team pages are addressed by teamid=1 .. 30 (see the URL pattern above the script).
    for (int i = 1; i <= 30; i++)
    {
        Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + i);
        Default.Ready();

        var teamid = i.ToString();

        // Read each field once, strip its label, log it, and reuse the cleaned
        // value for the record below instead of repeating the Replace call.
        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        Logger.Log(teamname);
        var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
        Logger.Log(teamurl);
        var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
        Logger.Log(teamcity);
        var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主体育馆:", "");
        Logger.Log(gym);
        var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
        Logger.Log(peoplenum);
        var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
        Logger.Log(intonba);
        var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
        Logger.Log(champion);
        var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("现任主教练:", "");
        Logger.Log(coach);

        // The original stored "teamurl" with the city label stripped (a copy/paste
        // slip) and never saved the city it had scraped; both are stored here.
        DataManager.AppendData("TEAM",
            DataEntry.Create()
                .Set("teamid", teamid)
                .Set("teamname", teamname)
                .Set("teamurl", teamurl)
                .Set("teamcity", teamcity)
                .Set("gym", gym)
                .Set("peoplenum", peoplenum)
                .Set("intonba", intonba)
                .Set("champion", champion)
                .Set("coach", coach)
        );
        Logger.Log(teamid);

        var roster = Default.SelectNodes("div.tab>table tr");
        foreach (var row in roster)
        {
            var link = row.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");

            // Rows without a player link (presumably the table header) carry no
            // player data and have no href to extract an id from, so skip them.
            if (link.IsEmpty())
            {
                continue;
            }

            // The player id is the first run of digits in the profile link.
            var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;

            var image = row.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var imageSrc = image.Attr("src");
            var num = row.SelectSingleNode("TD:eq(0)").Text();
            var playername = row.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A").Text();
            var position = row.SelectSingleNode("TD:eq(2)>SPAN:eq(0)").Text();
            var height = row.SelectSingleNode("TD:eq(3)").Text();
            var weight = row.SelectSingleNode("TD:eq(4)").Text();
            var birth = row.SelectSingleNode("TD:eq(5)").Text();
            var college = row.SelectSingleNode("TD:eq(6)").Text();

            Logger.Log(image.Text());
            Logger.Log(playername);
            Logger.Log(position);
            Logger.Log(height);
            Logger.Log(weight);
            Logger.Log(birth);
            Logger.Log(college);
            Logger.Log(imageSrc);
            Logger.Log(playerid);

            DataManager.AppendData("player",
                DataEntry.Create()
                    .Set("playerid", playerid)
                    .Set("teamid", teamid)
                    .Set("playername", playername)
                    .Set("position", position)
                    .Set("height", height)
                    .Set("weight", weight)
                    .Set("birth", birth)
                    .Set("college", college)
                    .Set("num", num)
                    .Set("playerimageurl", imageSrc)
            );
        }
    }
}

比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

脚本如下:

// Scrapes the monthly schedule page, collects every link whose text contains
// "技术统计", then visits each collected URL and appends two "GAMESTATIC" records
// per game: the first built from the away* values, the second from the home* values.
//
// NOTE(review): as shown here the script does not compile. The numeric literals
// inside every indexer (logos[], scores[], scoreslist[]) and after every
// "Count==" are missing and must be restored from the original script.
public void Run()
{
Logger.ClearAll();
Default.Navigate("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012");
Default.Ready();
// Collect all target URLs first; the loop further down navigates Default away from this page.
var games = Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")"); List<string> urls = new List<string>();
foreach(var g in games)
{
// Resolve the link's href against a base URL.
// NOTE(review): the base uses m=2013-10&season_year=2013, not the page navigated
// above; presumably harmless for resolving relative links - confirm.
var url = new Uri(new Uri("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-10&season_year=2013"), g.Attr("href")).ToString();
urls.Add(url.ToString());
}
foreach(var url in urls)
{
// Abort the whole run as soon as Default reports it is not available.
if( Default.Available == false) return;
Default.Navigate(url);
Default.Ready();
// NOTE(review): teamNames, awayscores, jiashiscores, awayid, homeid, homescore and
// awayscore are assigned below but never read again in this script.
var teamNames = Default.SelectNodes("div.blockA>h2");
var scores = Default.SelectNodes("table.tab04 tr");
var scoreslist = Default.SelectNodes("table.tab02 tr>td");
var awayscores = Default.SelectNodes("table.tab02 tr");
var jiashiscores = Default.SelectSingleNode("table.tab03>TD:eq(0)");
var logos = Default.SelectNodes("td.logo img");
// NOTE(review): in this pattern "d+" matches literal 'd' characters and "." matches
// any character; presumably "\d+" and "\.jpg" were intended (backslashes lost?) - confirm.
// The same pattern is repeated in both AppendData calls below.
var awayid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homeid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homescore=scores[].Text();
var awayscore=scores[].Text();
// Per-period scores are picked out of the flat table.tab02 cell list by index (indices missing).
var awayscore1=scoreslist[].Text();
var awayscore2=scoreslist[].Text();
var awayscore3=scoreslist[].Text();
var awayscore4=scoreslist[].Text();
var homescore1=scoreslist[].Text();
var homescore2=scoreslist[].Text();
var homescore3=scoreslist[].Text();
var homescore4=scoreslist[].Text();
// Overtime values default to empty strings and are only filled in when an overtime cell is found.
var gametime = Default.SelectSingleNode("div.center>h2"); var jiashiawayscores1="";
var jiashiawayscores2="" ;
var jiashiawayscores3 ="";
var jiashiawayscores4="";
var jiashihomescores1="";
var jiashihomescores2="";
var jiashihomescores3 ="";
var jiashihomescores4=""; var td = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
// Overtime handling: when a cell containing "加时赛" exists, the number of th elements
// in the node following div.more selects the branch. Each successive branch reads one
// more table.tab03 column; row tr:eq(1) feeds the away values, row tr:eq(2) the home values.
// NOTE(review): the four "Count==" comparands are missing; the article text mentions
// 1-4 overtime periods, so presumably 1, 2, 3 and 4 - confirm.
if(!td.IsEmpty())
{ if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
}
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashiawayscores4 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(3)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text();
jiashihomescores4 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(3)").Text(); } } DataManager.AppendData("GAMESTATIC",
// First record of the game: away period and overtime scores. The game's URL serves as "gameid".
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",awayscore1)
.Set("score2", awayscore2)
.Set("score3",awayscore3)
.Set("score4",awayscore4)
.Set("score", scores[].Text())
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashiawayscores1)
.Set("jiashiscore2",jiashiawayscores2)
.Set("jiashiscore3",jiashiawayscores3)
.Set("jiashiscore4",jiashiawayscores4)
);
// Second record of the game: same layout, filled from the home values.
DataManager.AppendData("GAMESTATIC",
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",homescore1)
.Set("score2", homescore2)
.Set("score3",homescore3)
.Set("score4",homescore4)
.Set("score", scores[].Text())
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashihomescores1)
.Set("jiashiscore2",jiashihomescores2)
.Set("jiashiscore3",jiashihomescores3)
.Set("jiashiscore4",jiashihomescores4)
); } }

这里的亮点是对加时赛的处理 (原脚本的第48, 49两行, 即查找"加时赛"单元格并判断 !td.IsEmpty() 的那一段). 不是所有的比赛都有加时赛, 就算有也可能打多个加时 (1-4个). 因此甜瓜非常细心地对这块也做了处理. 个人感觉这块代码还有优化的余地, 但是这种处理方式简单直白, 一目了然, 也是很不错的.

最后运行起来:

文中开发工具Spider Studio (采集工作站)下载地址: http://www.gdtsearch.com/products.spiderstudio.htm. 安装后运行, 将脚本复制进去点"运行"即可看到效果.

Spider Studio QQ群: 45995410

示例 - C#脚本代码采集搜狐NBA球员, 球队和比赛实况的更多相关文章

  1. 使用CURL和火车头软件采集搜狐文章

    直接上代码: //参数1:访问的URL,参数2:post数据(不填则为GET),参数3:提交的$cookies,参数4:是否返回$cookies function curl_request($url, ...

  2. C# 脚本代码自动登录淘宝获取用户信息

    C# 脚本代码自动登录淘宝获取用户信息   最近遇到的一个需求是如何让程序自动登录淘宝, 获取用户名称等信息. 其实这个利用SS (SpiderStudio的简称) 实现起来非常简单. 十数行代码就可 ...

  3. crawler4j源码学习(1):搜狐新闻网新闻标题采集爬虫

    crawler4j是用Java实现的开源网络爬虫.提供了简单易用的接口,可以在几分钟内创建一个多线程网络爬虫.下面实例结合jsoup,采集搜狐新闻网(http://news.sohu.com/)新闻标 ...

  4. 利用朴素贝叶斯分类算法对搜狐新闻进行分类(python)

    数据来源  https://www.sogou.com/labs/resource/cs.php介绍:来自搜狐新闻2012年6月—7月期间国内,国际,体育,社会,娱乐等18个频道的新闻数据,提供URL ...

  5. jquery仿搜狐投票动画代码

    体验效果:http://hovertree.com/texiao/jquery/21/ 这是一款基于jquery实现的仿搜狐投票动画特效源码,运行该源码可见VS图标首先出现在中间位置,紧接着随着投票比 ...

  6. 【HTML&CSS】搜狐页面代码编写

    <!DOCTYPE html> <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"& ...

  7. 搜狐云景paas平台实践之路

    前言: 搜狐云景作为搜狐的paas平台,在2014年5月22日的云计算大会上正式发布了公测.初测,注册用户必须先申请邀请码参与公测会赠送用户100元电子券,经过实名认证之后会再赠送100电子券,目测可 ...

  8. 山寨Unity3D?搜狐畅游的免费开源游戏引擎Genesis-3D

    在CSDN上看到了<搜狐畅游发布3D游戏引擎Genesis-3D 基于MIT协议开源>(http://www.csdn.net/article/2013-11-21/2817585-cha ...

  9. 利用jieba,word2vec,LR进行搜狐新闻文本分类

    一.简介 1)jieba 中文叫做结巴,是一款中文分词工具,https://github.com/fxsjy/jieba 2)word2vec 单词向量化工具,https://radimrehurek ...

随机推荐

  1. [ES6] 04. The let keyword -- 2 Fiald case

    Fiald case 1: let can work in it's block { let a = 10; var b = 1; } a // ReferenceError: a is not de ...

  2. Servlet对文件的读写操作

    (1)怎样在serlvet中读取文件的内容 package com.tsinghua; import java.io.*; import javax.servlet.http.*; public cl ...

  3. sqlserver并发用户数

    http://zhidao.baidu.com/question/291231462.html http://dev.mysql.com/downloads/mysql/

  4. Android API Guides---Tasks and Back Stack

    一个应用程序通常包括多个活动.每一个活动应环绕行动的用户能够运行,而且能够启动其它活动的特定种类进行设计.比如,电子邮件应用程序可能具有一个活动,以显示新的消息的列表.当用户选择一个信息.一个新的活动 ...

  5. GO!自制一款【不丑】的名片

    大概每一个人都有自己的名片.也见过不少名片. 我敢打赌,你常常认为很多名片"不咋地".尽管不是全部人都具备一定的审美眼光,但实际上每一个人都具备较高的"审丑"眼 ...

  6. Android Studio之多个Activity的滑动切换(二)

    1.因为Android界面上的全部控件一般都位于Layout控件(比方RelativeLayout)之上,而布局控件能够设置响应touch事件,所以能够通过布局控件的setOnTouchListen来 ...

  7. 算法笔记_136:交替字符串(Java)

    目录 1 问题描述 2 解决方案   1 问题描述 输入三个字符串s1.s2和s3,判断第三个字符串s3是否由前两个字符串s1和s2交错而成且不改变s1和s2中各个字符原有的相对顺序. 2 解决方案 ...

  8. qs.js库 使用方法

    1.qs.js库说明 qs是一个url参数转化(parse和stringify)的js库. https://www.npmjs.com/package/qs 2.使用(以vue文件做示例) (1)基本 ...

  9. 自己主动化測试使用mybatis更新数据库信息实例

    代码例如以下: mybatis配置文件: <? xml version="1.0" encoding="UTF-8"?> <!DOCTYPE ...

  10. oracle 存储过程 示例

      oracle 存储过程 示例 CreationTime--2018年9月4日09点49分 Author:Marydon 1.情景展示 对VIRTUAL_QRCODELOG表的静态二维码,动态二维码 ...