解析html教程(重点) http://www.cnblogs.com/kissdodog/archive/2013/02/28/2936950.html

完整的教程 http://www.cnblogs.com/kissdodog/category/453229.html

1 解析html

路径
//div 属于平行路径:匹配文档中任意层级的所有 div 节点
/html/body/div/ul 属于xml类型的路径:从根节点开始逐级向下的绝对路径
//table/tr 平行路径+xml类型路径,混合使用
//*[@id='div1'] 可以根据id选择,也可以根据其它的属性
*代表匹配所有类型的标签,也可以换成其它的标签,如div等
如果要选择多个使用:var nodes = doc.DocumentNode.SelectNodes("//*[@class='a']");
按节点的ChildNodes选择
divInfo.ChildNodes[0].ChildNodes[0].Attributes["src"].Value

1 选择网页中的所有的div
doc.DocumentNode.SelectNodes("//div")

2 按绝对路径选择单个节点
doc.DocumentNode.SelectSingleNode("/html/body/div/ul")

3 根据属性id选择节点
HtmlNode node8 = doc.DocumentNode.SelectSingleNode("//*[@id='div1']");
Response.Write(node8.Id);
Response.Write(node8.InnerText);

属性
Name
InnerHtml
InnerText
OuterHtml
ParentNode
XPath

2 Get/Post请求网页

 using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Net;
using System.Configuration;
using System.IO;
using System.Text; namespace MyLibrary.Common
{
/// <summary>
/// Small helper for downloading pages over HTTP: GET overloads that return the
/// response body as text, and form-encoded POST overloads that also report
/// whether any cookie is available for the request URI afterwards.
/// </summary>
public class BaseParser
{
    // Name of the encoding used by every overload that takes no explicit Encoding.
    private readonly string _encode = "utf-8";

    #region Shared helpers

    /// <summary>Resolves the default encoding from <c>_encode</c>.</summary>
    private Encoding DefaultEncoding
    {
        get { return Encoding.GetEncoding(_encode); }
    }

    /// <summary>
    /// Creates a GET request for <paramref name="strUrl"/> that follows redirects
    /// and uses <paramref name="cc"/> (which may be null) as its cookie container.
    /// </summary>
    private static HttpWebRequest CreateGetRequest(string strUrl, CookieContainer cc)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
        request.AllowAutoRedirect = true;
        request.CookieContainer = cc;
        return request;
    }

    /// <summary>
    /// Sends <paramref name="request"/> and returns the whole response body decoded
    /// with <paramref name="encode"/>. The response, its stream and the reader are
    /// disposed even when reading fails.
    /// </summary>
    private static string ReadResponse(HttpWebRequest request, Encoding encode)
    {
        // Fail before touching the network rather than after the response arrived.
        if (encode == null)
        {
            throw new ArgumentNullException("encode");
        }

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, encode))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Joins the parameters as <c>key=value</c> pairs separated by <c>&amp;</c>.
    /// Keys and values are written verbatim (no URL-encoding is applied), so callers
    /// must pass values that are already safe for a form-encoded body.
    /// Returns an empty string for a null or empty dictionary.
    /// </summary>
    private static string BuildPostData(IDictionary<string, string> dicParams)
    {
        if (dicParams == null)
        {
            return string.Empty;
        }

        StringBuilder builder = new StringBuilder();
        foreach (KeyValuePair<string, string> pair in dicParams)
        {
            if (builder.Length > 0)
            {
                builder.Append('&');
            }
            builder.Append(pair.Key).Append('=').Append(pair.Value);
        }
        return builder.ToString();
    }

    /// <summary>
    /// Sends a form-encoded POST and returns the response body.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="dicParams">Form parameters; may be null.</param>
    /// <param name="container">
    /// Cookie container to use. When null, a new container is created for the
    /// request and handed back through this parameter once a response is received.
    /// </param>
    /// <param name="encode">Encoding for both the request body and the response text.</param>
    /// <param name="cookieCount">
    /// Number of cookies the container holds for the request URI after the response.
    /// </param>
    private static string SendPost(
        string strUrl,
        IDictionary<string, string> dicParams,
        ref CookieContainer container,
        Encoding encode,
        out int cookieCount)
    {
        if (encode == null)
        {
            throw new ArgumentNullException("encode");
        }

        byte[] body = encode.GetBytes(BuildPostData(dicParams));

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
        request.AllowAutoRedirect = true;
        request.KeepAlive = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = body.Length;
        request.CookieContainer = container ?? new CookieContainer();

        using (Stream requestStream = request.GetRequestStream())
        {
            requestStream.Write(body, 0, body.Length);
        }

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // Count and hand back the cookies before reading the body, so the caller
            // keeps the container even if reading the body throws.
            cookieCount = request.CookieContainer.GetCookies(request.RequestUri).Count;
            if (container == null)
            {
                container = request.CookieContainer;
            }

            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, encode))
            {
                return reader.ReadToEnd();
            }
        }
    }

    #endregion

    #region 1.0 Download the HTML of a URL (default encoding) + string GetHtml(string strUrl)
    /// <summary>
    /// Downloads the HTML of the given URL using the default encoding and a fresh
    /// cookie container.
    /// </summary>
    /// <param name="strUrl">Target page URL.</param>
    /// <returns>The HTML of the target URL.</returns>
    public string GetHtml(string strUrl)
    {
        return ReadResponse(CreateGetRequest(strUrl, new CookieContainer()), DefaultEncoding);
    }
    #endregion

    #region 1.1 Download the HTML of a URL (default encoding, with try/catch) + string GetHtml2(string strUrl)
    /// <summary>
    /// Downloads the HTML of the given URL using the default encoding, returning a
    /// sentinel instead of throwing when the request or the read fails.
    /// </summary>
    /// <param name="strUrl">Target page URL.</param>
    /// <returns>The HTML of the target URL, or the string "error" on failure.</returns>
    public string GetHtml2(string strUrl)
    {
        // Request creation stays outside the try block, so an invalid URL still throws.
        HttpWebRequest request = CreateGetRequest(strUrl, new CookieContainer());
        try
        {
            return ReadResponse(request, DefaultEncoding);
        }
        catch (Exception)
        {
            // Deliberate best-effort: the documented contract is to report any
            // failure as the "error" sentinel.
            return "error";
        }
    }
    #endregion

    #region 2.0 Download the HTML of a URL + string GetHtml(string strUrl, Encoding encode)
    /// <summary>
    /// Downloads the HTML of the given URL using a fresh cookie container.
    /// </summary>
    /// <param name="strUrl">Target page URL.</param>
    /// <param name="encode">Encoding used to decode the response.</param>
    /// <returns>The HTML of the target URL.</returns>
    public string GetHtml(string strUrl, Encoding encode)
    {
        return ReadResponse(CreateGetRequest(strUrl, new CookieContainer()), encode);
    }
    #endregion

    #region 3.0 Download the HTML of a login-restricted URL with cookies (default encoding) + string GetHtml(string strUrl, CookieContainer cc)
    /// <summary>
    /// Downloads the HTML of the given URL, sending the supplied cookies, using the
    /// default encoding.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="cc">Cookie container assigned to the request.</param>
    /// <returns>The HTML of the target URL.</returns>
    public string GetHtml(string strUrl, CookieContainer cc)
    {
        return ReadResponse(CreateGetRequest(strUrl, cc), DefaultEncoding);
    }
    #endregion

    #region 4.0 Download the HTML of a login-restricted URL with cookies + string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
    /// <summary>
    /// Downloads the HTML of the given URL, sending the supplied cookies.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="cc">Cookie container assigned to the request.</param>
    /// <param name="encode">Encoding used to decode the response.</param>
    /// <returns>The HTML of the target URL.</returns>
    public string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
    {
        return ReadResponse(CreateGetRequest(strUrl, cc), encode);
    }
    #endregion

    #region 5.0 Simulated POST with cookies (default encoding) + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    /// <summary>
    /// Sends a form-encoded POST with the supplied cookies, using the default encoding.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="dicParams">Form parameters; may be null.</param>
    /// <param name="container">
    /// Cookie container; when null it is replaced by the container created for the request.
    /// </param>
    /// <returns>
    /// The response HTML when the container holds at least one cookie for the request
    /// URI after the response; otherwise the string "error".
    /// </returns>
    public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    {
        int cookieCount;
        string html = SendPost(strUrl, dicParams, ref container, DefaultEncoding, out cookieCount);
        return cookieCount > 0 ? html : "error";
    }
    #endregion

    #region 5.1 Simulated POST with cookies (default encoding, HTML returned even on failure) + string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    /// <summary>
    /// Sends a form-encoded POST with the supplied cookies, using the default encoding,
    /// and returns the HTML even when the cookie check fails.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="dicParams">Form parameters; may be null.</param>
    /// <param name="container">
    /// Cookie container; when null it is replaced by the container created for the request.
    /// </param>
    /// <returns>
    /// The response HTML when the container holds at least one cookie for the request
    /// URI after the response; otherwise "error|" followed by the response HTML.
    /// </returns>
    public string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    {
        int cookieCount;
        string html = SendPost(strUrl, dicParams, ref container, DefaultEncoding, out cookieCount);
        return cookieCount > 0 ? html : "error|" + html;
    }
    #endregion

    #region 6.0 Simulated POST with cookies + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
    /// <summary>
    /// Sends a form-encoded POST with the supplied cookies.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="dicParams">Form parameters; may be null.</param>
    /// <param name="container">
    /// Cookie container; when null it is replaced by the container created for the request.
    /// </param>
    /// <param name="encode">Encoding for both the request body and the response text.</param>
    /// <returns>
    /// The response HTML when the container holds at least one cookie for the request
    /// URI after the response; otherwise the string "error".
    /// </returns>
    public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
    {
        int cookieCount;
        string html = SendPost(strUrl, dicParams, ref container, encode, out cookieCount);
        return cookieCount > 0 ? html : "error";
    }
    #endregion
}
}

HtmlAgilityPack教程的更多相关文章

  1. 【转】黄聪:HtmlAgilityPack教程案例

    [转]黄聪:HtmlAgilityPack教程案例 HtmlAgilityPack中的HtmlNode类与XmlNode类差不多,提供的功能也大同小异.下面来看看该类提供功能. 一.静态属性 publ ...

  2. 黄聪:HtmlAgilityPack教程案例

    HtmlAgilityPack中的HtmlNode类与XmlNode类差不多,提供的功能也大同小异.下面来看看该类提供功能. 一.静态属性 public static Dictionary<st ...

  3. HTML解析利器 - HtmlAgilityPack

    HtmlAgilityPack 是CodePlex 上的一个开源项目.它提供了标准的DOM API 和XPath 导航--即使 HTML 不是适当的格式! 使用HtmlAgilityPack操作HTM ...

  4. C#:使用HtmlAgilityPack解析Html

    推荐阅读: HtmlAgilityPack 入门教程1 HtmlAgilityPack入门教程2 向HtmlAgilityPack道歉:解析HTML还是你好用 获取html中meta标签中的conte ...

  5. 网页采集(通过HtmlAgilityPack+XPath)

    有HtmlAgilityPack这个类库可以更方便地对HTML内容进行分析和提取.因此今天特别学习和实践了一下HtmlAgilityPack和XPath,并作下笔记. 1.下载HtmlAgilityP ...

  6. 史林枫:开源HtmlAgilityPack公共小类库封装 - 网页采集(爬虫)辅助解析利器【附源码+可视化工具推荐】

    做开发的,可能都做过信息采集相关的程序,史林枫也经常做一些数据采集或某些网站的业务办理自动化操作软件. 获取目标网页的信息很简单,使用网络编程,利用HttpWebResponse.HttpWebReq ...

  7. Angular2入门系列教程7-HTTP(一)-使用Angular2自带的http进行网络请求

    上一篇:Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数 感觉这篇不是很好写,因为涉及到网络请求,如果采用真实的网络请求,这个例子大家拿到手估计还要自己写一个web ...

  8. Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数

    上一篇:Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数 之前介绍了简单的路由以及传参,这篇文章我们将要学习复杂一些的路由以及传递其他附加参数.一个好的路由系统可以使我们 ...

  9. Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数

    上一篇:Angular2入门系列教程-服务 上一篇文章我们将Angular2的数据服务分离出来,学习了Angular2的依赖注入,这篇文章我们将要学习Angualr2的路由 为了编写样式方便,我们这篇 ...

随机推荐

  1. php 数组操作

    <?php $vegetables[0] = "corn"; $vegetables[1] = "broccoli"; $vegetables[2] = ...

  2. php phpeclipse + xampp 配置安装过程

    就想test是否能配置成功,下载apache,php5.3,安装开始 apache的安装,一路next,遇到Server Information,随便填写即可,安装路径自己可选 php的安装,将下载的 ...

  3. java--测体重练习

    public class tz{ public static void main(String[] args){ int sg=165,tz=52;bz = sg-115 if (tz-bz>3 ...

  4. io资料

    jitsi red5 apache meeting2 openmeeting2 openfire http://www.onlycoder.net/ 在视频会议领域,有许多可以值得参考的开源项目,这些 ...

  5. 小红伞和virtualbox5.0.10冲突

    win7 sp1 64bit 旗舰版:virtual box 5.0.10 提示 error in supr3hardNtChildWaitFor……Timed out after 60001 ms ...

  6. HDU 1789 贪心经典

    题意 给出n门作业的截止时间与分数 如果不能在那天结束前做完就扣掉相应分数 问怎么安排能让扣分最少 思路 先按分数从大到小排序 先研究大的 做好标记 一开始每天都能放作业 全是true 如果这一天已经 ...

  7. Bungie Interview with Halo3 Developer

    http://www.realtimerendering.com/blog/tag/bungie/ Digital Foundry interview with Halo: Reach develop ...

  8. LR中HTTP协议录制模式选择

    在LR中使用HTML/HTTP协议进行脚本录制时面临正确选择HTTP-based script / URL-base script 录制模式的问题,以下是比较官方的建议:1)基于浏览器的应用程序推荐使 ...

  9. Defining Stored Programs

    ok DROP PROCEDURE IF EXISTS truncate_insert_rank_month; DELIMITER /w/ CREATE PROCEDURE truncate_inse ...

  10. Squid 操作实践

    Squid简介 Squid可以做什么 性能要素 Squid安装 Squid快速体验 Squid配置 Squid简介 Squid is a caching proxy for the Web suppo ...