HtmlAgilityPack教程
解析html教程(重点) http://www.cnblogs.com/kissdodog/archive/2013/02/28/2936950.html
完整的教程 http://www.cnblogs.com/kissdodog/category/453229.html
1 解析html
路径
//div 属于平行路径(匹配文档中任意层级的div,不限定其父节点)
/html/body/div/ul 属于xml类型的路径(从根节点开始逐级写出的完整路径)
//table/tr 平行路径+xml类型路径,混合使用(先匹配任意层级的table,再取它的直接子节点tr)
//*[@id='div1'] 可以根据id选择,也可以根据其它的属性
*代表匹配所有类型的标签,也可以换成其它的标签,如div等
如果要选择多个使用:var nodes = doc.DocumentNode.SelectNodes("//*[@class='a']");
按节点的ChildNodes选择
divInfo.ChildNodes[0].ChildNodes[0].Attributes["src"].Value
1 选择网页中的所有的div
doc.DocumentNode.SelectNodes("//div")
2 按完整路径选择单个节点:doc.DocumentNode.SelectSingleNode("/html/body/div/ul")
3 根据属性id选择节点
HtmlNode node8 = doc.DocumentNode.SelectSingleNode("//*[@id='div1']");
Response.Write(node8.Id);
Response.Write(node8.InnerText);
属性
Name
InnerHtml
InnerText
OuterHtml
ParentNode
XPath
2 Get/Post请求网页
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Net;
using System.Configuration;
using System.IO;
using System.Text; namespace MyLibrary.Common
{
public class BaseParser
{
private string _encode = "utf-8"; // Name of the encoding used when the caller does not supply one.

#region 1.0 Download the HTML of a URL (default encoding) + string GetHtml(string strUrl)
/// <summary>
/// Downloads the body of <paramref name="strUrl"/> and decodes it with the
/// default encoding held in <c>_encode</c>.
/// </summary>
/// <param name="strUrl">Absolute URL of the page to download.</param>
/// <returns>The response body as a string.</returns>
public string GetHtml(string strUrl)
{
    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    // Every call gets its own, initially empty cookie container.
    httpReq.CookieContainer = new CookieContainer();

    // The using blocks release the response, its stream and the reader
    // even when reading the body throws.
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    using (Stream respStream = httpResp.GetResponseStream())
    using (StreamReader reader = new StreamReader(respStream, Encoding.GetEncoding(_encode)))
    {
        return reader.ReadToEnd();
    }
}
#endregion

#region 1.1 Download the HTML of a URL (default encoding, with try/catch) + string GetHtml2(string strUrl)
/// <summary>
/// Downloads the body of <paramref name="strUrl"/> and decodes it with the
/// default encoding held in <c>_encode</c>, reporting a failed request as a
/// sentinel string instead of an exception.
/// </summary>
/// <param name="strUrl">Absolute URL of the page to download.</param>
/// <returns>
/// The response body, or the literal string <c>"error"</c> when obtaining or
/// reading the response throws.
/// </returns>
public string GetHtml2(string strUrl)
{
    // The request is built outside the try block, so an invalid URL still throws.
    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    // Every call gets its own, initially empty cookie container.
    httpReq.CookieContainer = new CookieContainer();

    try
    {
        // The using blocks release the response, its stream and the reader
        // even when reading the body throws.
        using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
        using (Stream respStream = httpResp.GetResponseStream())
        using (StreamReader reader = new StreamReader(respStream, Encoding.GetEncoding(_encode)))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Best-effort contract of this overload: any failure becomes "error".
        return "error";
    }
}
#endregion

#region 2.0 Download the HTML of a URL + string GetHtml(string strUrl, Encoding encode)
/// <summary>
/// Downloads the body of <paramref name="strUrl"/> and decodes it with the
/// supplied encoding.
/// </summary>
/// <param name="strUrl">Absolute URL of the page to download.</param>
/// <param name="encode">Encoding used to decode the response body.</param>
/// <returns>The response body as a string.</returns>
public string GetHtml(string strUrl, Encoding encode)
{
    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    // Every call gets its own, initially empty cookie container.
    httpReq.CookieContainer = new CookieContainer();

    // The using blocks release the response, its stream and the reader
    // even when reading the body throws.
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    using (Stream respStream = httpResp.GetResponseStream())
    using (StreamReader reader = new StreamReader(respStream, encode))
    {
        return reader.ReadToEnd();
    }
}
#endregion

#region 3.0 Download the HTML of a login-restricted URL with cookie credentials (default encoding) + string GetHtml(string strUrl, CookieContainer cc)
/// <summary>
/// Downloads the body of <paramref name="strUrl"/> using the caller's cookie
/// container and decodes it with the default encoding held in <c>_encode</c>.
/// </summary>
/// <param name="strUrl">Absolute URL of the page to download.</param>
/// <param name="cc">Cookie container attached to the request.</param>
/// <returns>The response body as a string.</returns>
public string GetHtml(string strUrl, CookieContainer cc)
{
    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    httpReq.CookieContainer = cc;

    // The using blocks release the response, its stream and the reader
    // even when reading the body throws.
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    using (Stream respStream = httpResp.GetResponseStream())
    using (StreamReader reader = new StreamReader(respStream, Encoding.GetEncoding(_encode)))
    {
        return reader.ReadToEnd();
    }
}
#endregion

#region 4.0 Download the HTML of a login-restricted URL with cookie credentials + string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
/// <summary>
/// Downloads the body of <paramref name="strUrl"/> using the caller's cookie
/// container and decodes it with the supplied encoding.
/// </summary>
/// <param name="strUrl">Absolute URL of the page to download.</param>
/// <param name="cc">Cookie container attached to the request.</param>
/// <param name="encode">Encoding used to decode the response body.</param>
/// <returns>The response body as a string.</returns>
public string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
{
    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    httpReq.CookieContainer = cc;

    // The using blocks release the response, its stream and the reader
    // even when reading the body throws.
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    using (Stream respStream = httpResp.GetResponseStream())
    using (StreamReader reader = new StreamReader(respStream, encode))
    {
        return reader.ReadToEnd();
    }
}
#endregion

#region 5.0 Send a POST request with cookie credentials (default encoding) + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
/// <summary>
/// Posts <paramref name="dicParams"/> to <paramref name="strUrl"/> as an
/// <c>application/x-www-form-urlencoded</c> body, encoding the request and
/// decoding the response with the default encoding held in <c>_encode</c>.
/// </summary>
/// <param name="strUrl">Absolute target URL.</param>
/// <param name="dicParams">Form fields to send; when null the request body is empty.</param>
/// <param name="container">
/// Cookie container attached to the request. When null, a new container is
/// created for the request and assigned back to the caller.
/// </param>
/// <returns>
/// The response body when the container holds at least one cookie for the
/// request URI after the response arrives; otherwise the literal string
/// <c>"error"</c>.
/// </returns>
public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
{
    // Build "key=value&key=value".
    // NOTE(review): keys and values are written as-is, without URL-encoding;
    // callers have to pre-encode reserved characters such as '&' and '='.
    string postData = string.Empty;
    if (dicParams != null)
    {
        postData = string.Join(
            "&",
            dicParams.Select(pair => string.Format("{0}={1}", pair.Key, pair.Value)).ToArray());
    }
    byte[] byteArray = Encoding.GetEncoding(_encode).GetBytes(postData);

    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    httpReq.KeepAlive = true;
    httpReq.Method = "POST";
    httpReq.ContentType = "application/x-www-form-urlencoded";
    httpReq.ContentLength = byteArray.Length;
    httpReq.CookieContainer = container ?? new CookieContainer();

    // Write the form body; the using block closes the stream even on failure.
    using (Stream reqStream = httpReq.GetRequestStream())
    {
        reqStream.Write(byteArray, 0, byteArray.Length);
    }

    string html;
    int cookies;
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    {
        httpResp.Cookies = httpReq.CookieContainer.GetCookies(httpReq.RequestUri);
        cookies = httpResp.Cookies.Count;

        // Hand the freshly created container back so the caller can reuse it.
        if (container == null)
        {
            container = httpReq.CookieContainer;
        }

        using (StreamReader reader = new StreamReader(httpResp.GetResponseStream(), Encoding.GetEncoding(_encode)))
        {
            html = reader.ReadToEnd();
        }

        // Retained from the original implementation.
        httpReq.Abort();
    }

    // A response that leaves no cookies for the request URI counts as a failure.
    return cookies > 0 ? html : "error";
}
#endregion

#region 5.1 Send a POST request with cookie credentials (default encoding, returns the HTML even on failure) + string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
/// <summary>
/// Posts <paramref name="dicParams"/> to <paramref name="strUrl"/> as an
/// <c>application/x-www-form-urlencoded</c> body, encoding the request and
/// decoding the response with the default encoding held in <c>_encode</c>.
/// Unlike <c>PostWebRequest</c>, the response body is returned in the failure
/// case as well.
/// </summary>
/// <param name="strUrl">Absolute target URL.</param>
/// <param name="dicParams">Form fields to send; when null the request body is empty.</param>
/// <param name="container">
/// Cookie container attached to the request. When null, a new container is
/// created for the request and assigned back to the caller.
/// </param>
/// <returns>
/// The response body when the container holds at least one cookie for the
/// request URI after the response arrives; otherwise <c>"error|"</c> followed
/// by the response body.
/// </returns>
public string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
{
    // Build "key=value&key=value".
    // NOTE(review): keys and values are written as-is, without URL-encoding;
    // callers have to pre-encode reserved characters such as '&' and '='.
    string postData = string.Empty;
    if (dicParams != null)
    {
        postData = string.Join(
            "&",
            dicParams.Select(pair => string.Format("{0}={1}", pair.Key, pair.Value)).ToArray());
    }
    byte[] byteArray = Encoding.GetEncoding(_encode).GetBytes(postData);

    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    httpReq.KeepAlive = true;
    httpReq.Method = "POST";
    httpReq.ContentType = "application/x-www-form-urlencoded";
    httpReq.ContentLength = byteArray.Length;
    httpReq.CookieContainer = container ?? new CookieContainer();

    // Write the form body; the using block closes the stream even on failure.
    using (Stream reqStream = httpReq.GetRequestStream())
    {
        reqStream.Write(byteArray, 0, byteArray.Length);
    }

    string html;
    int cookies;
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    {
        httpResp.Cookies = httpReq.CookieContainer.GetCookies(httpReq.RequestUri);
        cookies = httpResp.Cookies.Count;

        // Hand the freshly created container back so the caller can reuse it.
        if (container == null)
        {
            container = httpReq.CookieContainer;
        }

        using (StreamReader reader = new StreamReader(httpResp.GetResponseStream(), Encoding.GetEncoding(_encode)))
        {
            html = reader.ReadToEnd();
        }

        // Retained from the original implementation.
        httpReq.Abort();
    }

    // A response that leaves no cookies for the request URI counts as a failure,
    // but the body is still passed back behind the "error|" prefix.
    return cookies > 0 ? html : "error|" + html;
}
#endregion

#region 6.0 Send a POST request with cookie credentials + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
/// <summary>
/// Posts <paramref name="dicParams"/> to <paramref name="strUrl"/> as an
/// <c>application/x-www-form-urlencoded</c> body, using
/// <paramref name="encode"/> both to encode the request body and to decode
/// the response.
/// </summary>
/// <param name="strUrl">Absolute target URL.</param>
/// <param name="dicParams">Form fields to send; when null the request body is empty.</param>
/// <param name="container">
/// Cookie container attached to the request. When null, a new container is
/// created for the request and assigned back to the caller.
/// </param>
/// <param name="encode">Encoding applied to the request body and the response body.</param>
/// <returns>
/// The response body when the container holds at least one cookie for the
/// request URI after the response arrives; otherwise the literal string
/// <c>"error"</c>.
/// </returns>
public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
{
    // Build "key=value&key=value".
    // NOTE(review): keys and values are written as-is, without URL-encoding;
    // callers have to pre-encode reserved characters such as '&' and '='.
    string postData = string.Empty;
    if (dicParams != null)
    {
        postData = string.Join(
            "&",
            dicParams.Select(pair => string.Format("{0}={1}", pair.Key, pair.Value)).ToArray());
    }
    byte[] byteArray = encode.GetBytes(postData);

    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
    httpReq.AllowAutoRedirect = true;
    httpReq.KeepAlive = true;
    httpReq.Method = "POST";
    httpReq.ContentType = "application/x-www-form-urlencoded";
    httpReq.ContentLength = byteArray.Length;
    httpReq.CookieContainer = container ?? new CookieContainer();

    // Write the form body; the using block closes the stream even on failure.
    using (Stream reqStream = httpReq.GetRequestStream())
    {
        reqStream.Write(byteArray, 0, byteArray.Length);
    }

    string html;
    int cookies;
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    {
        httpResp.Cookies = httpReq.CookieContainer.GetCookies(httpReq.RequestUri);
        cookies = httpResp.Cookies.Count;

        // Hand the freshly created container back so the caller can reuse it.
        if (container == null)
        {
            container = httpReq.CookieContainer;
        }

        using (StreamReader reader = new StreamReader(httpResp.GetResponseStream(), encode))
        {
            html = reader.ReadToEnd();
        }

        // Retained from the original implementation.
        httpReq.Abort();
    }

    // A response that leaves no cookies for the request URI counts as a failure.
    return cookies > 0 ? html : "error";
}
#endregion
}
}
HtmlAgilityPack教程的更多相关文章
- 【转】黄聪:HtmlAgilityPack教程案例
[转]黄聪:HtmlAgilityPack教程案例 HtmlAgilityPack中的HtmlNode类与XmlNode类差不多,提供的功能也大同小异.下面来看看该类提供功能. 一.静态属性 publ ...
- 黄聪:HtmlAgilityPack教程案例
HtmlAgilityPack中的HtmlNode类与XmlNode类差不多,提供的功能也大同小异.下面来看看该类提供功能. 一.静态属性 public static Dictionary<st ...
- HTML解析利器 - HtmlAgilityPack
HtmlAgilityPack 是CodePlex 上的一个开源项目.它提供了标准的DOM API 和XPath 导航--即使 HTML 不是适当的格式! 使用HtmlAgilityPack操作HTM ...
- C#:使用HtmlAgilityPack解析Html
推荐阅读: HtmlAgilityPack 入门教程1 HtmlAgilityPack入门教程2 向HtmlAgilityPack道歉:解析HTML还是你好用 获取html中meta标签中的conte ...
- 网页采集(通过HtmlAgilityPack+XPath)
有HtmlAgilityPack这个类库可以更方便地对HTML内容进行分析和提取.因此今天特别学习和实践了一下HtmlAgilityPack和XPath,并作下笔记. 1.下载HtmlAgilityP ...
- 史林枫:开源HtmlAgilityPack公共小类库封装 - 网页采集(爬虫)辅助解析利器【附源码+可视化工具推荐】
做开发的,可能都做过信息采集相关的程序,史林枫也经常做一些数据采集或某些网站的业务办理自动化操作软件. 获取目标网页的信息很简单,使用网络编程,利用HttpWebResponse.HttpWebReq ...
- Angular2入门系列教程7-HTTP(一)-使用Angular2自带的http进行网络请求
上一篇:Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数 感觉这篇不是很好写,因为涉及到网络请求,如果采用真实的网络请求,这个例子大家拿到手估计还要自己写一个web ...
- Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数
上一篇:Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数 之前介绍了简单的路由以及传参,这篇文章我们将要学习复杂一些的路由以及传递其他附加参数.一个好的路由系统可以使我们 ...
- Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数
上一篇:Angular2入门系列教程-服务 上一篇文章我们将Angular2的数据服务分离出来,学习了Angular2的依赖注入,这篇文章我们将要学习Angualr2的路由 为了编写样式方便,我们这篇 ...
随机推荐
- linux的信号机制
软中断信号(signal,又简称为信号)用来通知进程发生了异步事件.进程之间可以互相通过系统调用kill发送软中断信号.内核也可以因为内部事件而给进程发送信号,通知进程发生了某个事件.注意,信号只是用 ...
- 分布式架构高可用架构篇_01_zookeeper集群的安装、配置、高可用测试
参考: 龙果学院http://www.roncoo.com/share.html?hamc=hLPG8QsaaWVOl2Z76wpJHp3JBbZZF%2Bywm5vEfPp9LbLkAjAnB%2B ...
- A trip through the Graphics Pipeline 2011_03
At this point, we’ve sent draw calls down from our app all the way through various driver layers and ...
- mysql字段额外属性,除去字段类型外的其他属性
如果你不想字段为 NULL 可以设置字段的属性为 NOT NULL, 在操作数据库时如果输入该字段的数据为NULL ,就会报错. AUTO_INCREMENT定义列为自增的属性,一般用于主键,数值会自 ...
- 验证码 mewebstudio/captcha
composer require mews/captcha https://github.com/mewebstudio/captcha
- MYSQL PASSWORD()
https://www.pythian.com/blog/hashing-algorithm-in-mysql-password-2/ SELECT PASSWORD ("this_is_a ...
- he time that it takes to bring a block from disk into main memory
DATABASE SYSTEM CONCEPTS, SIXTH EDITION There is a trade-off that the system designer must make betw ...
- 【转】C# 解析JSON方法总结
http://blog.csdn.net/jjhua/article/details/51438317 主要参考http://blog.csdn.NET/joyhen/article/details/ ...
- JS中的工厂模式
.一个栗子: var BicycleShop = function(){}; BicycleShop.prototype = { sellBicycle : function( model ){ va ...
- zabbix-agent passive
http://www.cnblogs.com/mysql-dba/p/5010902.html http://blog.chinaunix.net/uid-29155617-id-4668602.ht ...