HTML 转文本及HTML内容提取(C#)

//1、HTML直接转文本
//使用方法
HtmlToText convert = new HtmlToText();
textBox2.Text = convert.Convert(textBox1.Text);
//代码
/// <summary>
/// Converts HTML to plain text.
/// </summary>
class HtmlToText
{
// Static data tables
protected static Dictionary<string, string> _tags;
protected static HashSet<string> _ignoreTags;
// Instance variables
protected TextBuilder _text;
protected string _html;
protected int _pos;
// Static constructor (one time only)
static HtmlToText()
{
_tags = new Dictionary<string, string>();
_tags.Add("address", "\n");
_tags.Add("blockquote", "\n");
_tags.Add("div", "\n");
_tags.Add("dl", "\n");
_tags.Add("fieldset", "\n");
_tags.Add("form", "\n");
_tags.Add("h1", "\n");
_tags.Add("/h1", "\n");
_tags.Add("h2", "\n");
_tags.Add("/h2", "\n");
_tags.Add("h3", "\n");
_tags.Add("/h3", "\n");
_tags.Add("h4", "\n");
_tags.Add("/h4", "\n");
_tags.Add("h5", "\n");
_tags.Add("/h5", "\n");
_tags.Add("h6", "\n");
_tags.Add("/h6", "\n");
_tags.Add("p", "\n");
_tags.Add("/p", "\n");
_tags.Add("table", "\n");
_tags.Add("/table", "\n");
_tags.Add("ul", "\n");
_tags.Add("/ul", "\n");
_tags.Add("ol", "\n");
_tags.Add("/ol", "\n");
_tags.Add("/li", "\n");
_tags.Add("br", "\n");
_tags.Add("/td", "\t");
_tags.Add("/tr", "\n");
_tags.Add("/pre", "\n");
_ignoreTags = new HashSet<string>();
_ignoreTags.Add("script");
_ignoreTags.Add("noscript");
_ignoreTags.Add("style");
_ignoreTags.Add("object");
}
/// <summary>
/// Converts the given HTML to plain text and returns the result.
/// </summary>
/// <param name="html">HTML to be converted</param>
/// <returns>Resulting plain text</returns>
public string Convert(string html)
{
// Initialize state variables
_text = new TextBuilder();
_html = html;
_pos = 0;
// Process input
while (!EndOfText)
{
if (Peek() == '<')
{
// HTML tag
bool selfClosing;
string tag = ParseTag(out selfClosing);
// Handle special tag cases
if (tag == "body")
{
// Discard content before <body>
_text.Clear();
}
else if (tag == "/body")
{
// Discard content after </body>
_pos = _html.Length;
}
else if (tag == "pre")
{
// Enter preformatted mode
_text.Preformatted = true;
EatWhitespaceToNextLine();
}
else if (tag == "/pre")
{
// Exit preformatted mode
_text.Preformatted = false;
}
string value;
if (_tags.TryGetValue(tag, out value))
_text.Write(value);
if (_ignoreTags.Contains(tag))
EatInnerContent(tag);
}
else if (Char.IsWhiteSpace(Peek()))
{
// Whitespace (treat all as space)
_text.Write(_text.Preformatted ? Peek() : ' ');
MoveAhead();
}
else
{
// Other text
_text.Write(Peek());
MoveAhead();
}
}
// Return result
return HttpUtility.HtmlDecode(_text.ToString());
}
// Eats all characters that are part of the current tag
// and returns information about that tag
protected string ParseTag(out bool selfClosing)
{
string tag = String.Empty;
selfClosing = false;
if (Peek() == '<')
{
MoveAhead();
// Parse tag name
EatWhitespace();
int start = _pos;
if (Peek() == '/')
MoveAhead();
while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
Peek() != '/' && Peek() != '>')
MoveAhead();
tag = _html.Substring(start, _pos - start).ToLower();
// Parse rest of tag
while (!EndOfText && Peek() != '>')
{
if (Peek() == '"' || Peek() == '\'')
EatQuotedValue();
else
{
if (Peek() == '/')
selfClosing = true;
MoveAhead();
}
}
MoveAhead();
}
return tag;
}
// Consumes inner content from the current tag
protected void EatInnerContent(string tag)
{
string endTag = "/" + tag;
while (!EndOfText)
{
if (Peek() == '<')
{
// Consume a tag
bool selfClosing;
if (ParseTag(out selfClosing) == endTag)
return;
// Use recursion to consume nested tags
if (!selfClosing && !tag.StartsWith("/"))
EatInnerContent(tag);
}
else MoveAhead();
}
}
// Returns true if the current position is at the end of
// the string
protected bool EndOfText
{
get { return (_pos >= _html.Length); }
}
// Safely returns the character at the current position
protected char Peek()
{
return (_pos < _html.Length) ? _html[_pos] : (char)0;
}
// Safely advances to current position to the next character
protected void MoveAhead()
{
_pos = Math.Min(_pos + 1, _html.Length);
}
// Moves the current position to the next non-whitespace
// character.
protected void EatWhitespace()
{
while (Char.IsWhiteSpace(Peek()))
MoveAhead();
}
// Moves the current position to the next non-whitespace
// character or the start of the next line, whichever
// comes first
protected void EatWhitespaceToNextLine()
{
while (Char.IsWhiteSpace(Peek()))
{
char c = Peek();
MoveAhead();
if (c == '\n')
break;
}
}
// Moves the current position past a quoted value
protected void EatQuotedValue()
{
char c = Peek();
if (c == '"' || c == '\'')
{
// Opening quote
MoveAhead();
// Find end of value
int start = _pos;
_pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
if (_pos < 0)
_pos = _html.Length;
else
MoveAhead(); // Closing quote
}
}
/// <summary>
/// A StringBuilder class that helps eliminate excess whitespace.
/// </summary>
protected class TextBuilder
{
private StringBuilder _text;
private StringBuilder _currLine;
private int _emptyLines;
private bool _preformatted;
// Construction
public TextBuilder()
{
_text = new StringBuilder();
_currLine = new StringBuilder();
_emptyLines = 0;
_preformatted = false;
}
/// <summary>
/// Normally, extra whitespace characters are discarded.
/// If this property is set to true, they are passed
/// through unchanged.
/// </summary>
public bool Preformatted
{
get
{
return _preformatted;
}
set
{
if (value)
{
// Clear line buffer if changing to
// preformatted mode
if (_currLine.Length > 0)
FlushCurrLine();
_emptyLines = 0;
}
_preformatted = value;
}
}
/// <summary>
/// Clears all current text.
/// </summary>
public void Clear()
{
_text.Length = 0;
_currLine.Length = 0;
_emptyLines = 0;
}
/// <summary>
/// Writes the given string to the output buffer.
/// </summary>
/// <param name="s"></param>
public void Write(string s)
{
foreach (char c in s)
Write(c);
}
/// <summary>
/// Writes the given character to the output buffer.
/// </summary>
/// <param name="c">Character to write</param>
public void Write(char c)
{
if (_preformatted)
{
// Write preformatted character
_text.Append(c);
}
else
{
if (c == '\r')
{
// Ignore carriage returns. We'll process
// '\n' if it comes next
}
else if (c == '\n')
{
// Flush current line
FlushCurrLine();
}
else if (Char.IsWhiteSpace(c))
{
// Write single space character
int len = _currLine.Length;
if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
_currLine.Append(' ');
}
else
{
// Add character to current line
_currLine.Append(c);
}
}
}
// Appends the current line to output buffer
protected void FlushCurrLine()
{
// Get current line
string line = _currLine.ToString().Trim();
// Determine if line contains non-space characters
string tmp = line.Replace(" ", String.Empty);
if (tmp.Length == 0)
{
// An empty line
_emptyLines++;
if (_emptyLines < 2 && _text.Length > 0)
_text.AppendLine(line);
}
else
{
// A non-empty line
_emptyLines = 0;
_text.AppendLine(line);
}
// Reset current line
_currLine.Length = 0;
}
/// <summary>
/// Returns the current output as a string.
/// </summary>
public override string ToString()
{
if (_currLine.Length > 0)
FlushCurrLine();
return _text.ToString();
}
}
}
//2、提取html的正文类
using System;
using System.Text;
namespace HtmlStrip
{
class MainClass
{
public static void Main (string[] args)
{
string str = "<div>abc</div><span>efg</span><br /><script>888</script>oo";
//System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html");
//str=rd.ReadToEnd ();
HtmlParser t = new HtmlParser (str); //
t.KeepTag (new string[] { "br" }); //设置br标签不过虑
Console.Write (t.Text ());
}
}
class HtmlParser
{
private string[] htmlcode; //把html转为数组形式用于分析
private StringBuilder result = new StringBuilder (); //输出的结果
private int seek; //分析文本时候的指针位置
private string[] keepTag; //用于保存要保留的尖括号内容
private bool _inTag; //标记现在的指针是不是在尖括号内
private bool needContent = true; //是否要提取正文
private string tagName; //当前尖括号的名字
private string[] specialTag = new string[] { "script", "style", "!--" }; //特殊的尖括号内容，一般这些标签的正文是不要的
/// <summary>
/// 当指针进入尖括号内，就会触发这个属性。这里主要逻辑是提取尖括号里的标签名字
/// </summary>
public bool inTag {
get { return _inTag; }
set {
_inTag = value;
if (!value)
return;
bool ok = true;
tagName = "";
while (ok) {
string word = read ();
if (word != " " && word != ">") {
tagName += word;
} else if (word == " " && tagName.Length > 0) {
ok = false;
} else if (word == ">") {
ok = false;
inTag = false;
seek -= 1;
}
}
}
}
/// <summary>
/// 初始化类
/// </summary>
/// <param name="html">
/// 要分析的html代码
/// </param>
public HtmlParser (string html)
{
htmlcode = new string[html.Length];
for (int i = 0; i < html.Length; i++) {
htmlcode[i] = html[i].ToString ();
}
KeepTag (new string[] { });
}
/// <summary>
/// 设置要保存那些标签不要被过滤掉
/// </summary>
/// <param name="tags">
///
/// </param>
public void KeepTag (string[] tags)
{
keepTag = tags;
}
/// <summary>
///
/// </summary>
/// <returns>
/// 输出处理后的文本
/// </returns>
public string Text ()
{
int startTag = 0;
int endTag = 0;
while (seek < htmlcode.Length) {
string word = read ();
if (word.ToLower () == "<") {
startTag = seek;
inTag = true;
} else if (word.ToLower () == ">") {
endTag = seek;
inTag = false;
if (iskeepTag (tagName.Replace ("/", ""))) {
for (int i = startTag - 1; i < endTag; i++) {
result.Append (htmlcode[i].ToString ());
}
} else if (tagName.StartsWith ("!--")) {
bool ok = true;
while (ok) {
if (read () == "-") {
if (read () == "-") {
if (read () == ">") {
ok = false;
} else {
seek -= 1;
}
}
}
}
} else {
foreach (string str in specialTag) {
if (tagName == str) {
needContent = false;
break;
} else
needContent = true;
}
}
} else if (!inTag && needContent) {
result.Append (word);
}
}
return result.ToString ();
}
/// <summary>
/// 判断是否要保存这个标签
/// </summary>
/// <param name="tag">
/// A <see cref="System.String"/>
/// </param>
/// <returns>
/// A <see cref="System.Boolean"/>
/// </returns>
private bool iskeepTag (string tag)
{
foreach (string ta in keepTag) {
if (tag.ToLower () == ta.ToLower ()) {
return true;
}
}
return false;
}
private string read ()
{
return htmlcode[seek++];
}
}
}

引文原址：http://blog.csdn.net/cjh200102/article/details/6824895

HTML 转文本及HTML内容提取(C#)的更多相关文章

python利用正则表达式提取文本中特定内容
正则表达式是一个特殊的字符序列,它能帮助你方便的检查一个字符串是否与某种模式匹配. Python 自1.5版本起增加了re 模块,它提供 Perl 风格的正则表达式模式. re 模块使 Python ...
关于MFC文本框输入内容的获取与设置文本框的内容
八月要开始做界面了<(￣︶￣)/,然而目前只会用MFC╮(╯▽╰)╭ 好吧,言归正传,设置好文本框后,要获取用户输入的内容,可以用: GetDlgItemText() ; 这个函数有两个参数,第 ...
jquery获取文本框的内容
使用jquery获取文本框的内容有以下几种: 1.根据ID取值(id属性): // javascript <script type="text/javascript"> ...
jQuery清除文本框，内容并设置不可用
JQuery清除文本框,内容并设置不可用如果是设置只读,则将disabled换成readonly function CleanText(textid) { $("#"+text ...
Python即时网络爬虫项目: 内容提取器的定义(Python2.7版本)
1. 项目背景在Python即时网络爬虫项目启动说明中我们讨论一个数字:程序员浪费在调测内容提取规则上的时间太多了(见上图),从而我们发起了这个项目,把程序员从繁琐的调测规则中解放出来,投入到更高端 ...
Python即时网络爬虫项目: 内容提取器的定义
1. 项目背景在python 即时网络爬虫项目启动说明中我们讨论一个数字:程序员浪费在调测内容提取规则上的时间,从而我们发起了这个项目,把程序员从繁琐的调测规则中解放出来,投入到更高端的数据处理工作 ...
API例子：用Java/JavaScript下载内容提取器
1,引言本文讲解怎样用Java和JavaScript使用 GooSeeker API 接口下载内容提取器,这是一个示例程序.什么是内容提取器?为什么用这种方式?源自Python即时网络爬虫开源项目: ...
.Net 文本框实现内容提示(仿Google、Baidu)
原文:.Net 文本框实现内容提示(仿Google.Baidu) 1.Demo下载: 文本框实现内容提示(仿Google.Baidu).rar 2.创建数据库.表(我用的sqlserver2008数据 ...
shell编程系列11--文本处理三剑客之sed利用sed删除文本中的内容
shell编程系列11--文本处理三剑客之sed利用sed删除文本中的内容删除命令对照表命令含义 1d 删除第一行内容 ,10d 删除1行到10行的内容 ,+5d 删除10行到16行的内容 /p ...

随机推荐

memmove 的实现
baidu的笔试题目用C语言实现一个公用库函数void * memmove(void *dest,const void *src,size_t n).该函数的功能是拷贝src所指的内存内容前n个字节 ...
思科27亿美元收购网络安全公司Sourcefire
据国外媒体报道,思科于今日宣布将以27亿美元的总价收购网络安全公司Sourcefire,以加强自身在网络安全业务领域的优势,该交易将于今年下半年完成. [IT商业新闻网讯](记者张良)7月23日消息 ...
CentOS 7 之几个新特性（转）
上篇我们讲到默认没有ifconfig是centos7的新特性,所以我特意上网搜索了一下其新特性,找到一篇文章,现转过来. centos最小好化安装没有ifconfig命令刚安装了centos7.0, ...
Thinkphp 模版
1.显示模版在Home/Controller/MainController.class.php中写一个方法来显示对应的模版 function text() { //变量输出 $this->as ...
PHPCMS v9构建模块
■补课: 1.phpcms v9帮助文件,上面会写关于二次开发的一些方法. http://v9.help.phpcms.cn/ 2.找一个后台还没安装的模块,先把代码看一边.比如dianping模块 ...
mysql innerjoin left join right join 解析
毕业半年多时间,一直都没有学习好join 之前一直是先从一个表里面取出数据然后,然后再从另外一个表里面取出数据,然后再写一个函数循环格式化数据. 还是先写一下学到的东西吧! 转载自w3school ...
从客户端(xxxxxxxxxxxxxxxxxxxxxx)中检测到有潜在危险的 Request.Form 值。
在项目中用到了富文本编辑器,当将编辑器中的值从视图传递到控制器时,控制器就会向浏览器返回“从客户端(xxxxxxxxxxxxxxxxxxxxxx)中检测到有潜在危险的 Request.Form 值.” ...
Ubuntu 查看和杀死进程[转]
今天在netbeans中关闭webrick时,发现没有关闭掉,打入localhost:3000 依然显示页面,发现无法从nb中再次关闭只有进入ubuntu的进程下关闭查看进程:1法,ps -e 命令 ...
Web API的CPU占用100%
我用Web API做了一个网站,网站很简单,请求就是几个普通的参数,提交到服务器后,在Web API里做一下参数验证,然后去访问Redis里的TIME命令,最后把TIME命令返回的结果计算出yyyy- ...
JDBC连接mysql，TOMCAT错误： Cannot convert value '0000-00-00 00:00:00' from column 10 to TIMESTAMP
解决思路依据如下URL: http://blog.csdn.net/stail111/article/details/5640109 问题:MySQL数据库,如果数据库中日期字段为空为值为'0000- ...

HTML 转文本及HTML内容提取(C#)

HTML 转文本及HTML内容提取(C#)的更多相关文章

随机推荐

热门专题