解析html教程(重点) http://www.cnblogs.com/kissdodog/archive/2013/02/28/2936950.html

完整的教程 http://www.cnblogs.com/kissdodog/category/453229.html

1 解析html

路径
//div 相对路径:匹配文档中任意层级的所有div节点
/html/body/div/ul 绝对路径:从根节点开始逐级匹配(与xml的路径写法相同)
//table/tr 两者混合使用:匹配任意层级的table下的直接子节点tr
//*[@id='div1'] 可以根据id选择,也可以根据其它的属性
*代表匹配所有类型的标签,也可以换成其它的标签,如div等
如果要选择多个使用:var nodes = doc.DocumentNode.SelectNodes("//*[@class='a']");
按节点的ChildNodes选择
divInfo.ChildNodes[0].ChildNodes[0].Attributes["src"].Value

1 选择网页中的所有的div
doc.DocumentNode.SelectNodes("//div")

2 按绝对路径选择单个节点
doc.DocumentNode.SelectSingleNode("/html/body/div/ul")

3 根据属性id选择节点
HtmlNode node8 = doc.DocumentNode.SelectSingleNode("//*[@id='div1']");
Response.Write(node8.Id);
Response.Write(node8.InnerText);

属性
Name
InnerHtml
InnerText
OuterHtml
ParentNode
XPath

2 Get/Post请求网页

 using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Net;
using System.Configuration;
using System.IO;
using System.Text; namespace MyLibrary.Common
{
/// <summary>
/// Helper for downloading HTML over HTTP with <see cref="HttpWebRequest"/>:
/// plain GET requests, GET requests that carry a caller-supplied cookie
/// container, and form-encoded POST requests.
/// </summary>
public class BaseParser
{
    // Name of the encoding used by every overload that takes no Encoding argument.
    private readonly string _encode = "utf-8";

    #region 1.0 Download the HTML of a URL (default encoding) + string GetHtml(string strUrl)
    /// <summary>
    /// Downloads the HTML of the given URL and decodes it with the default encoding.
    /// </summary>
    /// <param name="strUrl">URL of the target page.</param>
    /// <returns>The HTML of the target page.</returns>
    public string GetHtml(string strUrl)
    {
        return GetHtml(strUrl, new CookieContainer(), Encoding.GetEncoding(_encode));
    }
    #endregion

    #region 1.1 Download the HTML of a URL (default encoding, with try/catch) + string GetHtml2(string strUrl)
    /// <summary>
    /// Downloads the HTML of the given URL and decodes it with the default encoding.
    /// Failures while fetching or reading the response are reported through the
    /// return value instead of an exception.
    /// </summary>
    /// <param name="strUrl">URL of the target page.</param>
    /// <returns>The HTML of the target page, or the literal string "error" if the request fails.</returns>
    public string GetHtml2(string strUrl)
    {
        // The request is built outside the try block, so a malformed URL
        // still throws instead of yielding "error".
        HttpWebRequest httpReq = CreateGetRequest(strUrl, new CookieContainer());
        try
        {
            using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
            {
                return ReadResponseText(httpResp, Encoding.GetEncoding(_encode));
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: this method's contract is to return "error" on any failure.
            return "error";
        }
    }
    #endregion

    #region 2.0 Download the HTML of a URL + string GetHtml(string strUrl, Encoding encode)
    /// <summary>
    /// Downloads the HTML of the given URL and decodes it with the given encoding.
    /// </summary>
    /// <param name="strUrl">URL of the target page.</param>
    /// <param name="encode">Encoding used to decode the response body.</param>
    /// <returns>The HTML of the target page.</returns>
    public string GetHtml(string strUrl, Encoding encode)
    {
        return GetHtml(strUrl, new CookieContainer(), encode);
    }
    #endregion

    #region 3.0 Download the HTML of a login-restricted URL with cookies (default encoding) + string GetHtml(string strUrl, CookieContainer cc)
    /// <summary>
    /// Downloads the HTML of a login-restricted URL, sending the supplied cookies,
    /// and decodes it with the default encoding.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="cc">Cookie container attached to the request.</param>
    /// <returns>The HTML of the target page.</returns>
    public string GetHtml(string strUrl, CookieContainer cc)
    {
        return GetHtml(strUrl, cc, Encoding.GetEncoding(_encode));
    }
    #endregion

    #region 4.0 Download the HTML of a login-restricted URL with cookies + string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
    /// <summary>
    /// Downloads the HTML of a login-restricted URL, sending the supplied cookies,
    /// and decodes it with the given encoding.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="cc">Cookie container attached to the request.</param>
    /// <param name="encode">Encoding used to decode the response body.</param>
    /// <returns>The HTML of the target page.</returns>
    public string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
    {
        HttpWebRequest httpReq = CreateGetRequest(strUrl, cc);
        using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
        {
            return ReadResponseText(httpResp, encode);
        }
    }
    #endregion

    #region 5.0 Send a POST request with cookies (default encoding) + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    /// <summary>
    /// Sends a form-encoded POST request with cookies, using the default encoding.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="dicParams">Form parameters; may be null. Keys and values are sent as-is and are not URL-encoded here.</param>
    /// <param name="container">Cookie container for the request. If null, a new container is created and assigned back once the response has been received.</param>
    /// <returns>
    /// The response HTML if the cookie container holds at least one cookie for the
    /// request URI after the response; otherwise the literal string "error".
    /// </returns>
    public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    {
        bool hasCookies;
        string html = SendPost(strUrl, dicParams, ref container, Encoding.GetEncoding(_encode), out hasCookies);
        return hasCookies ? html : "error";
    }
    #endregion

    #region 5.1 Send a POST request with cookies (default encoding, HTML returned even on failure) + string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    /// <summary>
    /// Sends a form-encoded POST request with cookies, using the default encoding.
    /// The response HTML is returned even when the request is judged to have failed.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="dicParams">Form parameters; may be null. Keys and values are sent as-is and are not URL-encoded here.</param>
    /// <param name="container">Cookie container for the request. If null, a new container is created and assigned back once the response has been received.</param>
    /// <returns>
    /// The response HTML if the cookie container holds at least one cookie for the
    /// request URI after the response; otherwise "error|" followed by the response HTML.
    /// </returns>
    public string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
    {
        bool hasCookies;
        string html = SendPost(strUrl, dicParams, ref container, Encoding.GetEncoding(_encode), out hasCookies);
        return hasCookies ? html : "error|" + html;
    }
    #endregion

    #region 6.0 Send a POST request with cookies + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
    /// <summary>
    /// Sends a form-encoded POST request with cookies, using the given encoding
    /// for both the request body and the response body.
    /// </summary>
    /// <param name="strUrl">Target URL.</param>
    /// <param name="dicParams">Form parameters; may be null. Keys and values are sent as-is and are not URL-encoded here.</param>
    /// <param name="container">Cookie container for the request. If null, a new container is created and assigned back once the response has been received.</param>
    /// <param name="encode">Encoding used to encode the form data and decode the response.</param>
    /// <returns>
    /// The response HTML if the cookie container holds at least one cookie for the
    /// request URI after the response; otherwise the literal string "error".
    /// </returns>
    public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
    {
        bool hasCookies;
        string html = SendPost(strUrl, dicParams, ref container, encode, out hasCookies);
        return hasCookies ? html : "error";
    }
    #endregion

    #region Private helpers
    /// <summary>Creates a GET request that follows redirects and uses the given cookie container.</summary>
    private static HttpWebRequest CreateGetRequest(string strUrl, CookieContainer cc)
    {
        HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
        httpReq.AllowAutoRedirect = true;
        httpReq.CookieContainer = cc;
        return httpReq;
    }

    /// <summary>Reads the whole response body as text; the stream and reader are disposed even if reading fails.</summary>
    private static string ReadResponseText(HttpWebResponse httpResp, Encoding encode)
    {
        using (Stream respStream = httpResp.GetResponseStream())
        using (StreamReader reader = new StreamReader(respStream, encode))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>Joins the parameters as "key=value" pairs separated by '&amp;'; returns an empty string for null or empty input.</summary>
    private static string BuildPostData(IDictionary<string, string> dicParams)
    {
        StringBuilder postData = new StringBuilder();
        if (dicParams != null)
        {
            foreach (KeyValuePair<string, string> pair in dicParams)
            {
                if (postData.Length > 0)
                {
                    postData.Append('&');
                }
                postData.Append(pair.Key).Append('=').Append(pair.Value);
            }
        }
        return postData.ToString();
    }

    /// <summary>
    /// Sends the form-encoded POST shared by all PostWebRequest variants and returns the response HTML.
    /// <paramref name="hasCookies"/> is set to whether the request's cookie container holds
    /// at least one cookie for the request URI once the response has arrived.
    /// </summary>
    private static string SendPost(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode, out bool hasCookies)
    {
        byte[] byteArray = encode.GetBytes(BuildPostData(dicParams));

        HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
        httpReq.AllowAutoRedirect = true;
        httpReq.KeepAlive = true;
        httpReq.Method = "POST";
        httpReq.ContentType = "application/x-www-form-urlencoded";
        httpReq.ContentLength = byteArray.Length;
        httpReq.CookieContainer = container ?? new CookieContainer();

        using (Stream reqStream = httpReq.GetRequestStream())
        {
            // Write the form parameters as the request body.
            reqStream.Write(byteArray, 0, byteArray.Length);
        }

        using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
        {
            hasCookies = httpReq.CookieContainer.GetCookies(httpReq.RequestUri).Count > 0;

            // Hand the freshly created container back only after the response was received.
            if (container == null)
            {
                container = httpReq.CookieContainer;
            }

            return ReadResponseText(httpResp, encode);
        }
    }
    #endregion
}
}

HtmlAgilityPack教程的更多相关文章

  1. 【转】黄聪:HtmlAgilityPack教程案例

    [转]黄聪:HtmlAgilityPack教程案例 HtmlAgilityPack中的HtmlNode类与XmlNode类差不多,提供的功能也大同小异.下面来看看该类提供功能. 一.静态属性 publ ...

  2. 黄聪:HtmlAgilityPack教程案例

    HtmlAgilityPack中的HtmlNode类与XmlNode类差不多,提供的功能也大同小异.下面来看看该类提供功能. 一.静态属性 public static Dictionary<st ...

  3. HTML解析利器 - HtmlAgilityPack

    HtmlAgilityPack 是CodePlex 上的一个开源项目.它提供了标准的DOM API 和XPath 导航--即使 HTML 不是适当的格式! 使用HtmlAgilityPack操作HTM ...

  4. C#:使用HtmlAgilityPack解析Html

    推荐阅读: HtmlAgilityPack 入门教程1 HtmlAgilityPack入门教程2 向HtmlAgilityPack道歉:解析HTML还是你好用 获取html中meta标签中的conte ...

  5. 网页采集(通过HtmlAgilityPack+XPath)

    有HtmlAgilityPack这个类库可以更方便地对HTML内容进行分析和提取.因此今天特别学习和实践了一下HtmlAgilityPack和XPath,并作下笔记. 1.下载HtmlAgilityP ...

  6. 史林枫:开源HtmlAgilityPack公共小类库封装 - 网页采集(爬虫)辅助解析利器【附源码+可视化工具推荐】

    做开发的,可能都做过信息采集相关的程序,史林枫也经常做一些数据采集或某些网站的业务办理自动化操作软件. 获取目标网页的信息很简单,使用网络编程,利用HttpWebResponse.HttpWebReq ...

  7. Angular2入门系列教程7-HTTP(一)-使用Angular2自带的http进行网络请求

    上一篇:Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数 感觉这篇不是很好写,因为涉及到网络请求,如果采用真实的网络请求,这个例子大家拿到手估计还要自己写一个web ...

  8. Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数

    上一篇:Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数 之前介绍了简单的路由以及传参,这篇文章我们将要学习复杂一些的路由以及传递其他附加参数.一个好的路由系统可以使我们 ...

  9. Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数

    上一篇:Angular2入门系列教程-服务 上一篇文章我们将Angular2的数据服务分离出来,学习了Angular2的依赖注入,这篇文章我们将要学习Angualr2的路由 为了编写样式方便,我们这篇 ...

随机推荐

  1. HDU 4031 Attack(线段树/树状数组区间更新单点查询+暴力)

    Attack Time Limit: 5000/3000 MS (Java/Others)    Memory Limit: 65768/65768 K (Java/Others) Total Sub ...

  2. CodeForces 670E Correct Bracket Sequence Editor(list和迭代器函数模拟)

    E. Correct Bracket Sequence Editor time limit per test 2 seconds memory limit per test 256 megabytes ...

  3. shopnc nginx优化配置文件

    user www; worker_processes 2; error_log /var/log/nginx/error.log error; #error_log logs/error.log no ...

  4. Oracle11g创建表空间语句

    在plsql工具中执行以下语句,可建立Oracle表空间. /*分为四步 *//*第1步:创建临时表空间  */create temporary tablespace yuhang_temp temp ...

  5. java.sql.SQLException: 关闭的连接 解决办法

    程序如果长时间不进行数据库操作,那么数据源中的 Connection 很可能已经断开.其原因有可能是防火墙,或者连接的数据库设置的超时时间.这里使用的是 C3P0 连接 oracle 数据库,引起的异 ...

  6. context:component-scan 分析

    <context:component-scan> Web.xml中 <servlet> <servlet-name>springMVC</servlet-na ...

  7. twitter storm源码走读之2 -- tuple消息发送场景分析

    欢迎转载,转载请注明出处源自徽沪一郎.本文尝试分析tuple发送时的具体细节,本博的另一篇文章<bolt消息传递路径之源码解读>主要从消息接收方面来阐述问题,两篇文章互为补充. worke ...

  8. 奥迪--Q3

    -型号:Q3 -价格:23-35W -动力:1.4T/2.0T -变速箱:6挡双离合/7挡双离合 -长宽高:4.40,1.84,1.59 -油箱:64L -发动机:EA888 -大灯:氙气(选装LED ...

  9. 区分super和this

    Java关键字this.super使用总结 一.this Java关键字this只能用于方法方法体内.当一个对象创建后,Java虚拟机(JVM)就会给这个对象分配一个引用自身的指针,这个指针的名字就是 ...

  10. Nobel Lecture, December 12, 1929 Thermionic phenomena and the laws which govern them

    http://www.nobelprize.org/nobel_prizes/physics/laureates/1928/richardson-lecture.pdf OWEN W. RICHARD ...