[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法

背景

在做网页数据分析时，我们真正关注的是页面中的文字内容，因此需要把 HTML 标签、JavaScript、CSS 等代码过滤掉。

目标输入

<b>Hello World.</b><br/><p><i>Is there anyone out there?</i><p>

输出结果

Hello World. Is there anyone out there?

开发工具

Html Agility Pack
http://html-agility-pack.net/

实现方案1：（过滤规则严谨，保留HTML版式，推荐使用！）

//small but important modification to class https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs

/// <summary>
/// Converts HTML into plain text with Html Agility Pack. Comments and the text
/// content of script/style elements are dropped, block-level elements are
/// surrounded by line breaks, and list items, links and images get a simple
/// textual representation.
/// </summary>
public static class HtmlToText
{
    /// <summary>Loads the HTML document at <paramref name="path"/> and returns its plain-text rendering.</summary>
    /// <param name="path">Path handed to <c>HtmlDocument.Load</c>.</param>
    /// <returns>The text written for the document's root node.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>Parses <paramref name="html"/> and returns its plain-text rendering.</summary>
    /// <param name="html">HTML markup handed to <c>HtmlDocument.LoadHtml</c>.</param>
    /// <returns>The text written for the document's root node.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Returns the plain-text rendering of an already loaded document.</summary>
    /// <param name="doc">Document whose <c>DocumentNode</c> is converted.</param>
    /// <returns>Everything the conversion wrote, as a single string.</returns>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Converts every child of <paramref name="node"/>, in order, sharing one state object.</summary>
    /// <param name="node">Node whose children are converted.</param>
    /// <param name="outText">Writer receiving the text.</param>
    /// <param name="textInfo">Whitespace/list state passed unchanged to each child.</param>
    internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>
    /// Converts <paramref name="node"/> to text starting from fresh state
    /// (the "first text written" flag starts out false).
    /// </summary>
    /// <param name="node">Node to convert.</param>
    /// <param name="outText">Writer receiving the text.</param>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>
    /// Converts a single node (and, for documents and elements, its children) to text.
    /// </summary>
    /// <param name="node">Node to convert.</param>
    /// <param name="outText">Writer receiving the text.</param>
    /// <param name="textInfo">
    /// State describing what was written before this node: whether leading whitespace
    /// should be kept, whether the last character written was a space, whether any text
    /// has been written yet, and the next list number (0 means "not in a numbered list").
    /// </param>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // nothing to write for an empty text node
                if (html.Length == 0)
                {
                    break;
                }

                // Drop leading whitespace when it must not be written or when the previous
                // write already ended in a space; whitespace-only text then produces nothing.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        break;
                    }
                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }

                // Collapse whitespace runs to one space and decode entities. Trailing
                // whitespace is trimmed here and re-emitted below as a single space.
                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));

                // html is non-empty at this point, so indexing the last character is safe.
                textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
                if (textInfo.LastCharWasSpace)
                {
                    outText.Write(' ');
                }
                break;

            case HtmlNodeType.Element:
                string endElementString = null; // text appended after the element's children, if any
                bool isInline;                  // inline elements share textInfo with their children
                bool skip = false;              // true: do not descend into children
                int listIndex = 0;              // 0 = bulleted; "ol" switches to numbering from 1
                switch (node.Name)
                {
                    case "nav":
                        // navigation content is omitted entirely
                        skip = true;
                        isInline = false;
                        break;

                    case "body":
                    case "section":
                    case "article":
                    case "aside":
                    case "h1":
                    case "h2":
                    case "header":
                    case "footer":
                    case "address":
                    case "main":
                    case "div":
                    case "p": // stylistic - adjust as you tend to use
                        // Only start a new line once some text exists, so the output
                        // does not begin with blank lines.
                        if (textInfo.IsFirstTextOfDocWritten)
                        {
                            outText.Write("\r\n");
                        }
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "br":
                        outText.Write("\r\n");
                        skip = true;
                        // whitespace at the start of the next line is dropped
                        textInfo.WritePrecedingWhiteSpace = false;
                        isInline = true;
                        break;

                    case "a":
                        if (node.Attributes.Contains("href"))
                        {
                            string href = node.Attributes["href"].Value.Trim();
                            // Append the target only when the link text does not already contain it.
                            if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                            {
                                endElementString = "<" + href + ">";
                            }
                        }
                        isInline = true;
                        break;

                    case "li":
                        if (textInfo.ListIndex > 0)
                        {
                            // numbered list: write the current number, then advance it
                            outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                        }
                        else
                        {
                            outText.Write("\r\n*\t"); //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                        }
                        isInline = false;
                        break;

                    case "ol":
                        listIndex = 1;
                        goto case "ul";

                    case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "img": //inline-block in reality
                        if (node.Attributes.Contains("alt"))
                        {
                            outText.Write('[' + node.Attributes["alt"].Value);
                            endElementString = "]";
                        }
                        if (node.Attributes.Contains("src"))
                        {
                            outText.Write('<' + node.Attributes["src"].Value + '>');
                        }
                        isInline = true;
                        break;

                    default:
                        isInline = true;
                        break;
                }

                if (!skip && node.HasChildNodes)
                {
                    // Block-level elements give their children fresh whitespace state (still
                    // sharing the "first text written" flag) and the list numbering chosen above.
                    ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
                }

                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
                break;
        }
    }
}

/// <summary>
/// State carried along while HtmlToText walks the DOM: what has been written so far
/// and how the next piece of text or list item should be emitted.
/// </summary>
internal class PreceedingDomTextInfo
{
    /// <summary>
    /// Flag object supplied by the creator of this instance; HtmlToText passes the same
    /// object on to the instances it creates for block-level children.
    /// </summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Creates the state object around the given "first text written" flag.</summary>
    /// <param name="isFirstTextOfDocWritten">Flag object stored as-is (not copied).</param>
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
    {
        this.IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
    }

    /// <summary>Number for the next list item; HtmlToText treats values above zero as "numbered list".</summary>
    public int ListIndex { get; set; }

    /// <summary>Whether the most recently written text ended in whitespace.</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>Whether leading whitespace of the next text node should be kept.</summary>
    public bool WritePrecedingWhiteSpace { get; set; }
}

/// <summary>
/// Reference-type holder for a single bool, so that several objects can observe
/// and change the same flag. Converts implicitly to and from <c>bool</c>.
/// </summary>
internal class BoolWrapper
{
    /// <summary>The wrapped flag; false for a newly constructed instance.</summary>
    public bool Value { get; set; }

    /// <summary>Creates a wrapper whose <see cref="Value"/> is false.</summary>
    public BoolWrapper() { }

    /// <summary>Wraps <paramref name="flag"/> in a new instance.</summary>
    public static implicit operator BoolWrapper(bool flag)
    {
        BoolWrapper wrapped = new BoolWrapper();
        wrapped.Value = flag;
        return wrapped;
    }

    /// <summary>Reads the wrapped flag out of <paramref name="wrapper"/>.</summary>
    public static implicit operator bool(BoolWrapper wrapper)
    {
        return wrapper.Value;
    }
}

实现方案2：（过滤规则不严谨，适用于结构简单的HTML）

/// <summary>
/// Removes everything that looks like a tag (a '&lt;', one or more non-'&gt;' characters, then '&gt;')
/// from <paramref name="HTMLText"/>, and optionally HTML-decodes what is left.
/// </summary>
/// <param name="HTMLText">Markup to strip.</param>
/// <param name="decode">When true (the default) the stripped text is passed through <c>HttpUtility.HtmlDecode</c>.</param>
/// <returns>The text with tag-like spans removed, decoded if requested.</returns>
public static string StripHTML(string HTMLText, bool decode = true)
{
    string withoutTags = Regex.Replace(HTMLText, "<[^>]+>", "", RegexOptions.IgnoreCase);

    if (!decode)
    {
        return withoutTags;
    }

    return HttpUtility.HtmlDecode(withoutTags);
}

参考资料

https://stackoverflow.com/a/25178738
https://stackoverflow.com/a/732110

[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法的更多相关文章

编写高质量代码改善C#程序的157个建议——建议49：在Dispose模式中应提取一个受保护的虚方法
建议49:在Dispose模式中应提取一个受保护的虚方法在标准的Dispose模式中,真正的IDisposable接口的Dispose方法并没有做实际的清理工作,它其实是调用了下面的这个带bool参 ...
Cocos2dx 代码中包含中文导致编译错误的问题解决方法
从网上下载一个cocos2dx的源码,是IOS版本的,我将其迁移到windows 7下 ,用VS2010编译,出现一堆的C2001错误: 1>d:\cocos2d-x-2.2.6\mygame\ ...
在带（继承）TextView的控件中，在代码中动态更改TextView的文字颜色
今天由于公司项目需求,须要实现一种类似tab的选项卡,当时直接想到的就是使用RadioGroup和RadioButton来实现. 这种方法全然没问题.可是在后来的开发过程中,却遇到了一些困扰非常久的小 ...
在java代码中,用xslt处理xml文件
http://blog.csdn.net/zhou_lei/article/details/2661735 ********************************************** ...
java代码中fastjson生成字符串和解析字符串的方法和javascript文件中字符串和json数组之间的转换方法
1.java代码中fastjson生成字符串和解析字符串的方法 List<TemplateFull> templateFulls = new ArrayList<TemplateFu ...
Java基础学习总结（81）——如何尽可能的减少Java代码中bug
Java编程语言的人气自然无需质疑,从Web应用到Android应用,这款语言已经被广泛用于开发各类应用及代码中的复杂功能. 不过在编写代码时,bug永远是困扰每一位从业者的头号难题.在今天的文章中, ...
实际案例：在现有代码中通过async/await实现并行
一项新技术或者一个新特性,只有你用它解决实际问题后,才能真正体会到它的魅力,真正理解它.也期待大家能够多分享解一些解决实际问题的内容. 在我们遭遇“黑色30秒”问题的过程中,切身体会到了异步的巨大作用 ...
在现有代码中通过async/await实现并行
在现有代码中通过async/await实现并行一项新技术或者一个新特性,只有你用它解决实际问题后,才能真正体会到它的魅力,真正理解它.也期待大家能够多分享解一些解决实际问题的内容. 在我们遭遇“黑色 ...
struts2框架之OGNL表达式概述（在代码中使用OGNL表达式）
1. OGNL是Object Graphic Navigation Language(对象图导航语言)的缩写 * 所谓对象图,即以任意一个对象为根,通过OGNL可以访问与这个对象关联的其它对象 * 通 ...

随机推荐

再做一遍floyed
#include<bits/stdc++.h> #define R register int using namespace std; const int inf=0x3f3f3f3f; ...
Pytest权威教程21-API参考-07-配置选项(Configuration Options)
目录配置选项(Configuration Options) addopts cache_dir confcutdir console_output_style doctest_encoding do ...
JS-选项卡制作解释部分
<!DOCTYPE html> <html> <head> <meta name="author" content "郭菊锋,7 ...
Codeforces Round #576 (div.1 + div.2)
Div2 A 长度为$n(n≤10^5)$的数组,每个元素不同,求有多少个位置$d$满足\(d - x \le j < d \And d < j \le d + y a_d< ...
mysql连接数
如何实时查看mysql当前连接数? 如何实时查看mysql当前连接数? 1.查看当前所有连接的详细资料: ./mysqladmin -uadmin -p -h10.140.1.1 processlis ...
【2019.10.30】SDN上机第1次作业
用字符命令搭建如下拓扑,要求写出命令题目一: 字符命令如下: 题目二: 字符命令如下: 利用可视化工具搭建如下拓扑要求支持OpenFlow 1.0 1.1 1.2 1.3,设置h1(10.0.0. ...
UMD、CommonJS、ES Module、AMD、CMD模块的写法
AMD异步模块规范 RequireJS就是AMD的一个典型的实现. 以下是一个只依赖与jQuery的模块代码: // foo.js define(['jquery'], function($){ // ...
go语言中type的几种使用
type是go语法里的重要而且常用的关键字,type绝不只是对应于C/C++中的typedef.搞清楚type的使用,就容易理解go语言中的核心概念struct.interface.函数等的使用.以下 ...
addEventListener与attachEvent区别
DOM2级事件处理程序 DOM2级事件定义了两个方法用于处理指定和删除事件处理程序的操作: addEventListener removeEventListener 所有的DOM节点都包含这两个方法, ...
eQTL | Expression quantitative trait loci | 表达数量性状基因座 | QTL | 数量性状位点
到底什么是eQTL? eQTL和QTL之间有什么联系?为什么说QTL比eQTL难很多? QTL和GWAS有什么关系? GTEx数据库里的eQTL数据如何利用? 说eQTL之前必须先解释QTL,QTL, ...

[C#] - 从 HTML 代码中 转换 / 提取 可读文字（PlainText）的方法

[C#] - 从 HTML 代码中 转换 / 提取 可读文字（PlainText）的方法的更多相关文章

随机推荐

热门专题

[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法

[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法的更多相关文章