HTML 转文本及HTML内容提取(C#)
- //1、HTML直接转文本
- //使用方法
- HtmlToText convert = new HtmlToText();
- textBox2.Text = convert.Convert(textBox1.Text);
- //代码
- /// <summary>
- /// Converts HTML to plain text.
- /// </summary>
- class HtmlToText
- {
- // Static data tables
- protected static Dictionary<string, string> _tags;
- protected static HashSet<string> _ignoreTags;
- // Instance variables
- protected TextBuilder _text;
- protected string _html;
- protected int _pos;
- // Static constructor (one time only)
- static HtmlToText()
- {
- _tags = new Dictionary<string, string>();
- _tags.Add("address", "\n");
- _tags.Add("blockquote", "\n");
- _tags.Add("div", "\n");
- _tags.Add("dl", "\n");
- _tags.Add("fieldset", "\n");
- _tags.Add("form", "\n");
- _tags.Add("h1", "\n");
- _tags.Add("/h1", "\n");
- _tags.Add("h2", "\n");
- _tags.Add("/h2", "\n");
- _tags.Add("h3", "\n");
- _tags.Add("/h3", "\n");
- _tags.Add("h4", "\n");
- _tags.Add("/h4", "\n");
- _tags.Add("h5", "\n");
- _tags.Add("/h5", "\n");
- _tags.Add("h6", "\n");
- _tags.Add("/h6", "\n");
- _tags.Add("p", "\n");
- _tags.Add("/p", "\n");
- _tags.Add("table", "\n");
- _tags.Add("/table", "\n");
- _tags.Add("ul", "\n");
- _tags.Add("/ul", "\n");
- _tags.Add("ol", "\n");
- _tags.Add("/ol", "\n");
- _tags.Add("/li", "\n");
- _tags.Add("br", "\n");
- _tags.Add("/td", "\t");
- _tags.Add("/tr", "\n");
- _tags.Add("/pre", "\n");
- _ignoreTags = new HashSet<string>();
- _ignoreTags.Add("script");
- _ignoreTags.Add("noscript");
- _ignoreTags.Add("style");
- _ignoreTags.Add("object");
- }
- /// <summary>
- /// Converts the given HTML to plain text and returns the result.
- /// </summary>
- /// <param name="html">HTML to be converted</param>
- /// <returns>Resulting plain text</returns>
- public string Convert(string html)
- {
- // Initialize state variables
- _text = new TextBuilder();
- _html = html;
- _pos = 0;
- // Process input
- while (!EndOfText)
- {
- if (Peek() == '<')
- {
- // HTML tag
- bool selfClosing;
- string tag = ParseTag(out selfClosing);
- // Handle special tag cases
- if (tag == "body")
- {
- // Discard content before <body>
- _text.Clear();
- }
- else if (tag == "/body")
- {
- // Discard content after </body>
- _pos = _html.Length;
- }
- else if (tag == "pre")
- {
- // Enter preformatted mode
- _text.Preformatted = true;
- EatWhitespaceToNextLine();
- }
- else if (tag == "/pre")
- {
- // Exit preformatted mode
- _text.Preformatted = false;
- }
- string value;
- if (_tags.TryGetValue(tag, out value))
- _text.Write(value);
- if (_ignoreTags.Contains(tag))
- EatInnerContent(tag);
- }
- else if (Char.IsWhiteSpace(Peek()))
- {
- // Whitespace (treat all as space)
- _text.Write(_text.Preformatted ? Peek() : ' ');
- MoveAhead();
- }
- else
- {
- // Other text
- _text.Write(Peek());
- MoveAhead();
- }
- }
- // Return result
- return HttpUtility.HtmlDecode(_text.ToString());
- }
- // Eats all characters that are part of the current tag
- // and returns information about that tag
- protected string ParseTag(out bool selfClosing)
- {
- string tag = String.Empty;
- selfClosing = false;
- if (Peek() == '<')
- {
- MoveAhead();
- // Parse tag name
- EatWhitespace();
- int start = _pos;
- if (Peek() == '/')
- MoveAhead();
- while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
- Peek() != '/' && Peek() != '>')
- MoveAhead();
- tag = _html.Substring(start, _pos - start).ToLower();
- // Parse rest of tag
- while (!EndOfText && Peek() != '>')
- {
- if (Peek() == '"' || Peek() == '\'')
- EatQuotedValue();
- else
- {
- if (Peek() == '/')
- selfClosing = true;
- MoveAhead();
- }
- }
- MoveAhead();
- }
- return tag;
- }
- // Consumes inner content from the current tag
- protected void EatInnerContent(string tag)
- {
- string endTag = "/" + tag;
- while (!EndOfText)
- {
- if (Peek() == '<')
- {
- // Consume a tag
- bool selfClosing;
- if (ParseTag(out selfClosing) == endTag)
- return;
- // Use recursion to consume nested tags
- if (!selfClosing && !tag.StartsWith("/"))
- EatInnerContent(tag);
- }
- else MoveAhead();
- }
- }
- // Returns true if the current position is at the end of
- // the string
- protected bool EndOfText
- {
- get { return (_pos >= _html.Length); }
- }
- // Safely returns the character at the current position
- protected char Peek()
- {
- return (_pos < _html.Length) ? _html[_pos] : (char)0;
- }
- // Safely advances to current position to the next character
- protected void MoveAhead()
- {
- _pos = Math.Min(_pos + 1, _html.Length);
- }
- // Moves the current position to the next non-whitespace
- // character.
- protected void EatWhitespace()
- {
- while (Char.IsWhiteSpace(Peek()))
- MoveAhead();
- }
- // Moves the current position to the next non-whitespace
- // character or the start of the next line, whichever
- // comes first
- protected void EatWhitespaceToNextLine()
- {
- while (Char.IsWhiteSpace(Peek()))
- {
- char c = Peek();
- MoveAhead();
- if (c == '\n')
- break;
- }
- }
- // Moves the current position past a quoted value
- protected void EatQuotedValue()
- {
- char c = Peek();
- if (c == '"' || c == '\'')
- {
- // Opening quote
- MoveAhead();
- // Find end of value
- int start = _pos;
- _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
- if (_pos < 0)
- _pos = _html.Length;
- else
- MoveAhead(); // Closing quote
- }
- }
- /// <summary>
- /// A StringBuilder class that helps eliminate excess whitespace.
- /// </summary>
- protected class TextBuilder
- {
- private StringBuilder _text;
- private StringBuilder _currLine;
- private int _emptyLines;
- private bool _preformatted;
- // Construction
- public TextBuilder()
- {
- _text = new StringBuilder();
- _currLine = new StringBuilder();
- _emptyLines = 0;
- _preformatted = false;
- }
- /// <summary>
- /// Normally, extra whitespace characters are discarded.
- /// If this property is set to true, they are passed
- /// through unchanged.
- /// </summary>
- public bool Preformatted
- {
- get
- {
- return _preformatted;
- }
- set
- {
- if (value)
- {
- // Clear line buffer if changing to
- // preformatted mode
- if (_currLine.Length > 0)
- FlushCurrLine();
- _emptyLines = 0;
- }
- _preformatted = value;
- }
- }
- /// <summary>
- /// Clears all current text.
- /// </summary>
- public void Clear()
- {
- _text.Length = 0;
- _currLine.Length = 0;
- _emptyLines = 0;
- }
- /// <summary>
- /// Writes the given string to the output buffer.
- /// </summary>
- /// <param name="s"></param>
- public void Write(string s)
- {
- foreach (char c in s)
- Write(c);
- }
- /// <summary>
- /// Writes the given character to the output buffer.
- /// </summary>
- /// <param name="c">Character to write</param>
- public void Write(char c)
- {
- if (_preformatted)
- {
- // Write preformatted character
- _text.Append(c);
- }
- else
- {
- if (c == '\r')
- {
- // Ignore carriage returns. We'll process
- // '\n' if it comes next
- }
- else if (c == '\n')
- {
- // Flush current line
- FlushCurrLine();
- }
- else if (Char.IsWhiteSpace(c))
- {
- // Write single space character
- int len = _currLine.Length;
- if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
- _currLine.Append(' ');
- }
- else
- {
- // Add character to current line
- _currLine.Append(c);
- }
- }
- }
- // Appends the current line to output buffer
- protected void FlushCurrLine()
- {
- // Get current line
- string line = _currLine.ToString().Trim();
- // Determine if line contains non-space characters
- string tmp = line.Replace(" ", String.Empty);
- if (tmp.Length == 0)
- {
- // An empty line
- _emptyLines++;
- if (_emptyLines < 2 && _text.Length > 0)
- _text.AppendLine(line);
- }
- else
- {
- // A non-empty line
- _emptyLines = 0;
- _text.AppendLine(line);
- }
- // Reset current line
- _currLine.Length = 0;
- }
- /// <summary>
- /// Returns the current output as a string.
- /// </summary>
- public override string ToString()
- {
- if (_currLine.Length > 0)
- FlushCurrLine();
- return _text.ToString();
- }
- }
- }
- //2、提取html的正文 类
- using System;
- using System.Text;
- namespace HtmlStrip
- {
- class MainClass
- {
- public static void Main (string[] args)
- {
- string str = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo";
- //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html");
- //str=rd.ReadToEnd ();
- HtmlParser t = new HtmlParser (str); //
- t.KeepTag (new string[] { "br" }); //设置br标签不过虑
- Console.Write (t.Text ());
- }
- }
- class HtmlParser
- {
- private string[] htmlcode; //把html转为数组形式用于分析
- private StringBuilder result = new StringBuilder (); //输出的结果
- private int seek; //分析文本时候的指针位置
- private string[] keepTag; //用于保存要保留的尖括号内容
- private bool _inTag; //标记现在的指针是不是在尖括号内
- private bool needContent = true; //是否要提取正文
- private string tagName; //当前尖括号的名字
- private string[] specialTag = new string[] { "script", "style", "!--" }; //特殊的尖括号内容,一般这些标签的正文是不要的
- /// <summary>
- /// 当指针进入尖括号内,就会触发这个属性。这里主要逻辑是提取尖括号里的标签名字
- /// </summary>
- public bool inTag {
- get { return _inTag; }
- set {
- _inTag = value;
- if (!value)
- return;
- bool ok = true;
- tagName = "";
- while (ok) {
- string word = read ();
- if (word != " " && word != ">") {
- tagName += word;
- } else if (word == " " && tagName.Length > 0) {
- ok = false;
- } else if (word == ">") {
- ok = false;
- inTag = false;
- seek -= 1;
- }
- }
- }
- }
- /// <summary>
- /// 初始化类
- /// </summary>
- /// <param name="html">
- /// 要分析的html代码
- /// </param>
- public HtmlParser (string html)
- {
- htmlcode = new string[html.Length];
- for (int i = 0; i < html.Length; i++) {
- htmlcode[i] = html[i].ToString ();
- }
- KeepTag (new string[] { });
- }
- /// <summary>
- /// 设置要保存那些标签不要被过滤掉
- /// </summary>
- /// <param name="tags">
- ///
- /// </param>
- public void KeepTag (string[] tags)
- {
- keepTag = tags;
- }
- /// <summary>
- ///
- /// </summary>
- /// <returns>
- /// 输出处理后的文本
- /// </returns>
- public string Text ()
- {
- int startTag = 0;
- int endTag = 0;
- while (seek < htmlcode.Length) {
- string word = read ();
- if (word.ToLower () == "<") {
- startTag = seek;
- inTag = true;
- } else if (word.ToLower () == ">") {
- endTag = seek;
- inTag = false;
- if (iskeepTag (tagName.Replace ("/", ""))) {
- for (int i = startTag - 1; i < endTag; i++) {
- result.Append (htmlcode[i].ToString ());
- }
- } else if (tagName.StartsWith ("!--")) {
- bool ok = true;
- while (ok) {
- if (read () == "-") {
- if (read () == "-") {
- if (read () == ">") {
- ok = false;
- } else {
- seek -= 1;
- }
- }
- }
- }
- } else {
- foreach (string str in specialTag) {
- if (tagName == str) {
- needContent = false;
- break;
- } else
- needContent = true;
- }
- }
- } else if (!inTag && needContent) {
- result.Append (word);
- }
- }
- return result.ToString ();
- }
- /// <summary>
- /// 判断是否要保存这个标签
- /// </summary>
- /// <param name="tag">
- /// A <see cref="System.String"/>
- /// </param>
- /// <returns>
- /// A <see cref="System.Boolean"/>
- /// </returns>
- private bool iskeepTag (string tag)
- {
- foreach (string ta in keepTag) {
- if (tag.ToLower () == ta.ToLower ()) {
- return true;
- }
- }
- return false;
- }
- private string read ()
- {
- return htmlcode[seek++];
- }
- }
- }
引文原址:http://blog.csdn.net/cjh200102/article/details/6824895
HTML 转文本及HTML内容提取(C#)的更多相关文章
- python利用正则表达式提取文本中特定内容
正则表达式是一个特殊的字符序列,它能帮助你方便的检查一个字符串是否与某种模式匹配. Python 自1.5版本起增加了re 模块,它提供 Perl 风格的正则表达式模式. re 模块使 Python ...
- 关于MFC文本框输入内容的获取 与 设置文本框的内容
八月要开始做界面了<( ̄︶ ̄)/,然而目前只会用MFC╮(╯▽╰)╭ 好吧,言归正传,设置好文本框后,要获取用户输入的内容,可以用: GetDlgItemText() ; 这个函数有两个参数,第 ...
- jquery获取文本框的内容
使用jquery获取文本框的内容有以下几种: 1.根据ID取值(id属性): // javascript <script type="text/javascript"> ...
- jQuery清除文本框,内容并设置不可用
JQuery清除文本框,内容并设置不可用 如果是设置只读,则将disabled换成readonly function CleanText(textid) { $("#"+text ...
- Python即时网络爬虫项目: 内容提取器的定义(Python2.7版本)
1. 项目背景 在Python即时网络爬虫项目启动说明中我们讨论一个数字:程序员浪费在调测内容提取规则上的时间太多了(见上图),从而我们发起了这个项目,把程序员从繁琐的调测规则中解放出来,投入到更高端 ...
- Python即时网络爬虫项目: 内容提取器的定义
1. 项目背景 在python 即时网络爬虫项目启动说明中我们讨论一个数字:程序员浪费在调测内容提取规则上的时间,从而我们发起了这个项目,把程序员从繁琐的调测规则中解放出来,投入到更高端的数据处理工作 ...
- API例子:用Java/JavaScript下载内容提取器
1,引言 本文讲解怎样用Java和JavaScript使用 GooSeeker API 接口下载内容提取器,这是一个示例程序.什么是内容提取器?为什么用这种方式?源自Python即时网络爬虫开源项目: ...
- .Net 文本框实现内容提示(仿Google、Baidu)
原文:.Net 文本框实现内容提示(仿Google.Baidu) 1.Demo下载: 文本框实现内容提示(仿Google.Baidu).rar 2.创建数据库.表(我用的sqlserver2008数据 ...
- shell编程系列11--文本处理三剑客之sed利用sed删除文本中的内容
shell编程系列11--文本处理三剑客之sed利用sed删除文本中的内容 删除命令对照表 命令 含义 1d 删除第一行内容 ,10d 删除1行到10行的内容 ,+5d 删除10行到16行的内容 /p ...
随机推荐
- QT5-控件-QProgressBar-进度条-用来做下载进度,文件读取进度还不错
#ifndef MAINWINDOW_H #define MAINWINDOW_H #include <QMainWindow> #include <QProgressBar> ...
- 解决百度Ueditor编辑器表格不显示边框问题
一.主要内容 CMS使用百度Ueditor编辑器中的表格功能,在编辑模式下可以正常显示边框,而文章发布之后表格不能显示边框.本博文经过查阅相关资料,最终解决了该问题. 二.使用平台 1. dedecm ...
- jquery validation插件使用
首先需要引入jQuery.js和jquery.validate.js 以下面代码为例: <form id="mainform"> <fieldset> &l ...
- smarty 自定义函数
自定义函数:<{方法名称}> 在lib/plugins中新建文件,命名方式是固定的:function.方法名称.php 或者 block.方法名称.php 1.<{literal}& ...
- PHP生成数字+字符混合型字符串
以下是一个用PHP随机生成字符+数字混合型的随机字符串,可用来生成会员ID.用户密码/密钥等内容,函数简单,代码如下: <?php function generate_rand($l){ $c= ...
- bat判断某个目录是否存在
使用批处理判断某个目录是否存在的语句. if not exist xxx_folder_name md xxx_folder_name
- 文成小盆友python-num11-(2) python操作Memcache Redis
本部分主要内容: python操作memcache python操作redis 一.python 操作 memcache memcache是一套分布式的高速缓存系统,由LiveJournal的Brad ...
- 针对access数据库的增删改查
1.执行查询操作:(ExecuteReader方法) string myConnectionString = "Provider = Microsoft.Jet.OLEDB.4.0;Data ...
- restful-----------------------------接口设计方式(一种风格)
http动词: GET(SELECT):从服务器取出资源(一项或者多项) POST(CREATE):在服务器创建一个资源 PUT(UPDATE):在服务器更新资源(客户端提供改变后的完整资源)完整更新 ...
- SVM详解
SVM入门(一)至(三)Refresh 按:之前的文章重新汇编一下,修改了一些错误和不当的说法,一起复习,然后继续SVM之旅. (一)SVM的简介 支持向量机(Support Vector Machi ...