C# 关于utf-8的研究

前提

如果一不小心把一段字节按 UTF-8 解码成了字符串，结果却得到了乱码，这个时候要么换用其他编码重新解码，要么干脆不要这段数据了，直接把它过滤掉。

这里说的是直接过滤的办法。

参考链接

https://netvignettes.wordpress.com/2011/07/03/how-to-detect-encoding/

大概的代码解释

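其实主要的思路就是对照 UTF-8 的编码表（不过貌似它也不是严格对照的）：首字节形如 0xxxxxxx 表示单字节字符，110xxxxx、1110xxxx、11110xxx 分别表示 2、3、4 字节序列的首字节，而后续字节都形如 10xxxxxx。比如下面的代码就是根据首字节判断一个字符占用的 bytes 数量，并识别后续字节：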

/// <summary>
/// Tests whether a byte has the bit pattern 11110xxx (0xF0-0xF7), i.e. the lead byte
/// of a four-byte UTF-8 sequence.
/// </summary>
/// <param name="b">The byte to test.</param>
/// <returns><c>true</c> when the top five bits of <paramref name="b"/> are 11110.</returns>
private static bool IsLead4(byte b) => (b & 0xF8) == 0xF0;

/// <summary>
/// Tests whether a byte has the bit pattern 1110xxxx (0xE0-0xEF), i.e. the lead byte
/// of a three-byte UTF-8 sequence.
/// </summary>
/// <param name="b">The byte to test.</param>
/// <returns><c>true</c> when the top four bits of <paramref name="b"/> are 1110.</returns>
private static bool IsLead3(byte b) => (b & 0xF0) == 0xE0;

/// <summary>
/// Tests whether a byte has the bit pattern 110xxxxx (0xC0-0xDF), i.e. the lead byte
/// of a two-byte UTF-8 sequence.
/// </summary>
/// <param name="b">The byte to test.</param>
/// <returns><c>true</c> when the top three bits of <paramref name="b"/> are 110.</returns>
private static bool IsLead2(byte b) => (b & 0xE0) == 0xC0;

/// <summary>
/// Tests whether a byte is a UTF-8 continuation byte, i.e. has the bit pattern 10xxxxxx
/// (0x80-0xBF inclusive).
/// </summary>
/// <remarks>
/// 0x80 itself is a valid continuation byte (for example U+0080 is encoded as C2 80),
/// so the lower bound is inclusive.
/// </remarks>
/// <param name="b">The byte to test.</param>
/// <returns><c>true</c> when the top two bits of <paramref name="b"/> are 10.</returns>
private static bool IsExtendedByte(byte b) => (b & 0xC0) == 0x80;

接下来需要注意一下特殊情况：先检查缓冲区开头的字节顺序标记（BOM），再看第一个字符是不是“<”，最后才按 UTF-8 的规则逐字节校验。

// Excerpt from GetTextEncoding: look for a byte order mark (BOM) first.
// Each BOM is tested independently so that buffers of four or more bytes are still
// checked for the shorter UTF-8 and UTF-16 marks.
if (length >= 4)
{
    var one = bytes[offset];
    var two = bytes[offset + 1];
    var three = bytes[offset + 2];
    var four = bytes[offset + 3];

    // UTF-7: "+/v" followed by one of '8', '9', '+', '/'.
    if (one == 0x2B &&
        two == 0x2F &&
        three == 0x76 &&
        (four == 0x38 || four == 0x39 || four == 0x2B || four == 0x2F))
    {
        return UTF7;
    }

    // UTF-32 little endian: FF FE 00 00. Must be tested before UTF-16 LE,
    // whose BOM (FF FE) is a prefix of this one.
    if (one == 0xFF && two == 0xFE && three == 0x00 && four == 0x00)
    {
        return UTF32;
    }

    // UTF-32 big endian: 00 00 FE FF.
    if (one == 0x00 && two == 0x00 && three == 0xFE && four == 0xFF)
    {
        return new UTF32Encoding(true, true);
    }
}

// UTF-8: EF BB BF (needs three bytes).
if (length >= 3 &&
    bytes[offset] == 0xEF && bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF)
{
    return UTF8;
}

// UTF-16: FF FE (little endian) or FE FF (big endian).
if (length >= 2)
{
    var one = bytes[offset];
    var two = bytes[offset + 1];

    if (one == 0xFF && two == 0xFE)
    {
        return Unicode;
    }

    if (one == 0xFE && two == 0xFF)
    {
        return BigEndianUnicode;
    }
}

if (length > 1)
{
    // Look for a leading < sign (markup such as XML/HTML):
    if (bytes[offset] == 0x3C)
    {
        // '<' followed by a zero byte looks like UTF-16 little endian.
        return bytes[offset + 1] == 0x00 ? Unicode : UTF8;
    }

    // A zero byte followed by '<' looks like UTF-16 big endian.
    if (bytes[offset] == 0x00 && bytes[offset + 1] == 0x3C)
    {
        return BigEndianUnicode;
    }
}

// No marker found: validate the examined range (not the whole buffer) as UTF-8.
if (IsUtf8(bytes, offset, length))
{
    return UTF8;
}

接下来就是测试

/// <summary>
/// Demo: encodes a few sample strings as UTF-8, then runs the detector over those bytes
/// and over a deliberately malformed buffer, printing what was detected.
/// </summary>
static void Main(string[] args)
{
    string ch = "金";
    string Ja = "らなくちゃ";
    string Re = "фыввфывфывфв";

    // Well-formed UTF-8 should be recognised as such.
    foreach (string sample in new[] { ch, Ja, Re })
    {
        byte[] byteArrayUTF8 = UTF8.GetBytes(sample);
        Console.WriteLine("{0}: IsUtf8 = {1}", sample, IsUtf8(byteArrayUTF8));
    }

    // Illustrative malformed input: 209 (0xD1) announces a two-byte sequence but the second
    // occurrence is followed by 34, which is not a continuation byte.
    // NOTE(review): the article's original 16 byte values are not present in this copy.
    byte[] Rom = { 209, 132, 209, 34, 90, 121, 5, 34, 208, 100, 200, 3, 4, 5, 6, 7 };

    // Decoding anyway produces replacement characters, i.e. the garbled text discussed above.
    string Name = Encoding.UTF8.GetString(Rom, 0, Rom.Length);
    Console.WriteLine("Decoded as UTF-8: {0}", Name);

    // The detector returns null when it cannot determine an encoding.
    var y = GetTextEncoding(Rom);
    Console.WriteLine("Detected encoding: {0}", y == null ? "(unknown)" : y.EncodingName);
}

全部代码

using System;

using System.Text;

using static System.Text.Encoding;

namespace ConsoleApp1

{

    /// <summary>
    /// Heuristics for guessing the text encoding of a byte buffer, centred on checking
    /// whether the bytes match the UTF-8 bit patterns.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Length classification of the UTF-8 sequence found at a buffer position.
        /// The numeric value of each member is the sequence length in bytes, which
        /// <see cref="IsUtf8"/> relies on when advancing through the buffer.
        /// </summary>
        public enum MultibyteRank
        {
            None = 0,
            One = 1,
            Two = 2,
            Three = 3,
            Four = 4
        }

        /// <summary>
        /// Determines whether the bytes in this buffer at the specified offset match the bit
        /// pattern of a UTF-8 character, and how many bytes that character occupies.
        /// </summary>
        /// <remarks>
        /// It is not guaranteed that these bytes represent a sensical character - only that the
        /// binary pattern matches UTF-8 encoding.
        /// </remarks>
        /// <param name="bytes">This buffer.</param>
        /// <param name="offset">The position in the buffer to check.</param>
        /// <param name="length">The number of bytes available to check (1-4), or 4 if not specified.</param>
        /// <returns>
        /// The length of the sequence starting at <paramref name="offset"/>, or
        /// <see cref="MultibyteRank.None"/> when the bytes do not match any UTF-8 pattern
        /// within <paramref name="length"/> bytes.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="offset"/> or <paramref name="length"/> is outside the buffer, or
        /// <paramref name="length"/> is not in the range 1-4.
        /// </exception>
        public static MultibyteRank GetUtf8MultibyteRank(byte[] bytes, int offset = 0, int length = 4)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            if (offset < 0 || offset > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
            }

            // At least one byte is always read below, so zero is rejected as well.
            if (length < 1 || length > 4)
            {
                throw new ArgumentOutOfRangeException(nameof(length), "Only values 1-4 are valid.");
            }

            if (length > bytes.Length - offset)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
            }

            byte lead = bytes[offset];

            // Single-byte (ASCII) character.
            if (lead < 0x80)
            {
                return MultibyteRank.One;
            }

            // Possible 4 byte sequence
            if (length > 3 && IsLead4(lead))
            {
                return IsExtendedByte(bytes[offset + 1]) && IsExtendedByte(bytes[offset + 2]) && IsExtendedByte(bytes[offset + 3])
                    ? MultibyteRank.Four
                    : MultibyteRank.None;
            }

            // Possible 3 byte sequence
            if (length > 2 && IsLead3(lead))
            {
                return IsExtendedByte(bytes[offset + 1]) && IsExtendedByte(bytes[offset + 2])
                    ? MultibyteRank.Three
                    : MultibyteRank.None;
            }

            // Possible 2 byte sequence
            if (length > 1 && IsLead2(lead) && IsExtendedByte(bytes[offset + 1]))
            {
                return MultibyteRank.Two;
            }

            // Stray continuation byte, truncated sequence or invalid lead byte.
            return MultibyteRank.None;
        }

        /// <summary>Tests for the bit pattern 11110xxx (0xF0-0xF7): lead byte of a four-byte sequence.</summary>
        private static bool IsLead4(byte b) => (b & 0xF8) == 0xF0;

        /// <summary>Tests for the bit pattern 1110xxxx (0xE0-0xEF): lead byte of a three-byte sequence.</summary>
        private static bool IsLead3(byte b) => (b & 0xF0) == 0xE0;

        /// <summary>Tests for the bit pattern 110xxxxx (0xC0-0xDF): lead byte of a two-byte sequence.</summary>
        private static bool IsLead2(byte b) => (b & 0xE0) == 0xC0;

        /// <summary>
        /// Tests for the bit pattern 10xxxxxx (0x80-0xBF inclusive): a continuation byte.
        /// 0x80 is included; for example U+0080 is encoded as C2 80.
        /// </summary>
        private static bool IsExtendedByte(byte b) => (b & 0xC0) == 0x80;

        /// <summary>
        /// Checks whether every byte in the given range belongs to a sequence matching the
        /// UTF-8 bit patterns.
        /// </summary>
        /// <param name="bytes">The buffer to examine.</param>
        /// <param name="offset">The offset at which to start, or 0 if not specified.</param>
        /// <param name="length">The number of bytes to examine, or everything from <paramref name="offset"/> to the end if not specified.</param>
        /// <returns>
        /// <c>true</c> if the whole range is made of matching sequences (an empty range yields
        /// <c>true</c>); <c>false</c> as soon as a non-matching position is found.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The range is negative or outside the buffer.</exception>
        public static bool IsUtf8(byte[] bytes, int offset = 0, int? length = null)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            if (offset < 0 || offset > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
            }

            int count = length ?? (bytes.Length - offset);

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(length));
            }

            if (count > bytes.Length - offset)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
            }

            int position = offset;
            int bytesRemaining = count;

            while (bytesRemaining > 0)
            {
                // Never look further ahead than the bytes that are left in the range.
                MultibyteRank rank = GetUtf8MultibyteRank(bytes, position, Math.Min(4, bytesRemaining));

                if (rank == MultibyteRank.None)
                {
                    return false;
                }

                // The enum value is the number of bytes the sequence occupies.
                int consumed = (int)rank;
                position += consumed;
                bytesRemaining -= consumed;
            }

            return true;
        }

        /// <summary>
        /// Uses various discovery techniques to guess the encoding used for a byte buffer presumably containing text characters.
        /// </summary>
        /// <remarks>
        /// Checks, in order: a byte order mark (UTF-7, UTF-32, UTF-8, UTF-16), a leading
        /// <c>&lt;</c> character, and finally whether the range matches the UTF-8 bit patterns.
        /// Note that this is only a guess and could be incorrect.  Be prepared to catch exceptions
        /// while using the decoder (<see cref="Encoding.GetDecoder"/>) of the encoding returned by this method.
        /// </remarks>
        /// <param name="bytes">The buffer containing the bytes to examine.</param>
        /// <param name="offset">The offset into the buffer to begin examination, or 0 if not specified.</param>
        /// <param name="length">The number of bytes to examine, or everything from <paramref name="offset"/> to the end if not specified.</param>
        /// <returns>An encoding, or <see langword="null"/> if one cannot be determined.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The range is negative or outside the buffer.</exception>
        public static Encoding GetTextEncoding(byte[] bytes, int offset = 0, int? length = null)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            if (offset < 0 || offset > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
            }

            // Default to "the rest of the buffer" so that a non-zero offset works without an explicit length.
            int count = length ?? (bytes.Length - offset);

            if (count < 0 || count > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(length), "Length is out of range.");
            }

            if (count > bytes.Length - offset)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
            }

            // Look for a byte order mark. Each mark is tested independently so that buffers of
            // four or more bytes are still checked for the shorter UTF-8 and UTF-16 marks.
            if (count >= 4)
            {
                byte one = bytes[offset];
                byte two = bytes[offset + 1];
                byte three = bytes[offset + 2];
                byte four = bytes[offset + 3];

                // UTF-7: "+/v" followed by one of '8', '9', '+', '/'.
                if (one == 0x2B &&
                    two == 0x2F &&
                    three == 0x76 &&
                    (four == 0x38 || four == 0x39 || four == 0x2B || four == 0x2F))
                {
                    return UTF7;
                }

                // UTF-32 little endian: FF FE 00 00. Tested before UTF-16 LE, whose mark
                // (FF FE) is a prefix of this one.
                if (one == 0xFF && two == 0xFE && three == 0x00 && four == 0x00)
                {
                    return UTF32;
                }

                // UTF-32 big endian: 00 00 FE FF.
                if (one == 0x00 && two == 0x00 && three == 0xFE && four == 0xFF)
                {
                    return new UTF32Encoding(true, true);
                }
            }

            // UTF-8: EF BB BF.
            if (count >= 3 &&
                bytes[offset] == 0xEF && bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF)
            {
                return UTF8;
            }

            // UTF-16: FF FE (little endian) or FE FF (big endian).
            if (count >= 2)
            {
                byte one = bytes[offset];
                byte two = bytes[offset + 1];

                if (one == 0xFF && two == 0xFE)
                {
                    return Unicode;
                }

                if (one == 0xFE && two == 0xFF)
                {
                    return BigEndianUnicode;
                }
            }

            if (count > 1)
            {
                // Look for a leading < sign:
                if (bytes[offset] == 0x3C)
                {
                    // '<' followed by a zero byte looks like UTF-16 little endian.
                    return bytes[offset + 1] == 0x00 ? Unicode : UTF8;
                }

                // A zero byte followed by '<' looks like UTF-16 big endian.
                if (bytes[offset] == 0x00 && bytes[offset + 1] == 0x3C)
                {
                    return BigEndianUnicode;
                }
            }

            // No marker found: validate the examined range (not the whole buffer) as UTF-8.
            if (IsUtf8(bytes, offset, count))
            {
                return UTF8;
            }

            // Impossible to tell.
            return null;
        }

        /// <summary>
        /// Demo: encodes a few sample strings as UTF-8, then runs the detector over those bytes
        /// and over a deliberately malformed buffer, printing what was detected.
        /// </summary>
        static void Main(string[] args)
        {
            string ch = "金";
            string Ja = "らなくちゃ";
            string Re = "фыввфывфывфв";

            // Well-formed UTF-8 should be recognised as such.
            foreach (string sample in new[] { ch, Ja, Re })
            {
                byte[] byteArrayUTF8 = UTF8.GetBytes(sample);
                Console.WriteLine("{0}: IsUtf8 = {1}", sample, IsUtf8(byteArrayUTF8));
            }

            // Illustrative malformed input: 209 (0xD1) announces a two-byte sequence but the second
            // occurrence is followed by 34, which is not a continuation byte.
            // NOTE(review): the article's original 16 byte values are not present in this copy.
            byte[] Rom = { 209, 132, 209, 34, 90, 121, 5, 34, 208, 100, 200, 3, 4, 5, 6, 7 };

            // Decoding anyway produces replacement characters, i.e. the garbled text discussed in the article.
            string Name = Encoding.UTF8.GetString(Rom, 0, Rom.Length);
            Console.WriteLine("Decoded as UTF-8: {0}", Name);

            // The detector returns null when it cannot determine an encoding.
            var y = GetTextEncoding(Rom);
            Console.WriteLine("Detected encoding: {0}", y == null ? "(unknown)" : y.EncodingName);
        }
    }

}

C# 关于utf-8的研究的更多相关文章

java io 源码研究记录（一）
Java IO 源码研究: 一.输入流 1 基类 InputStream 简介: 这是Java中所有输入流的基类,它是一个抽象类,下面我们简单来了解一下它的基本方法和抽象方法. 基本方法: publ ...
闲来无聊，研究一下Web服务器的源程序
web服务器是如何工作的 1989年的夏天,蒂姆.博纳斯-李开发了世界上第一个web服务器和web客户机.这个浏览器程序是一个简单的电话号码查询软件.最初的web服务器程序就是一个利用浏览器和web服 ...
SQLSERVER聚集索引与非聚集索引的再次研究（上）
SQLSERVER聚集索引与非聚集索引的再次研究(上) 上篇主要说聚集索引下篇的地址:SQLSERVER聚集索引与非聚集索引的再次研究(下) 由于本人还是SQLSERVER菜鸟一枚,加上一些实验的逻 ...
深入研究Visual studio 2017 RC新特性
在[Xamarin+Prism开发详解三:Visual studio 2017 RC初体验]中分享了Visual studio 2017RC的大致情况,同时也发现大家对新的Visual Studio很 ...
【初码干货】使用阿里云对Web开发中的资源文件进行CDN加速的深入研究和实践
提示:阅读本文需提前了解的相关知识 1.阿里云(https://www.aliyun.com) 2.阿里云CDN(https://www.aliyun.com/product/cdn) 3.阿里云OS ...
对一致性Hash算法，Java代码实现的深入研究
一致性Hash算法关于一致性Hash算法,在我之前的博文中已经有多次提到了,MemCache超详细解读一文中"一致性Hash算法"部分,对于为什么要使用一致性Hash算法.一致性 ...
SQLSERVER聚集索引与非聚集索引的再次研究（下）
SQLSERVER聚集索引与非聚集索引的再次研究(下) 上篇主要说了聚集索引和简单介绍了一下非聚集索引,相信大家一定对聚集索引和非聚集索引开始有一点了解了. 这篇文章只是作为参考,里面的观点不一定正确 ...
开源Word读写组件DocX 的深入研究和问题总结
一. 前言前两天看到了asxinyu大神的[原创]开源Word读写组件DocX介绍与入门,正好我也有类似的自动生成word文档得需求,于是便仔细的研究了这个DocX. 我也把它融入到我的项目当中并进 ...
【移动端兼容问题研究】javascript事件机制详解（涉及移动兼容）
前言这篇博客有点长,如果你是高手请您读一读,能对其中的一些误点提出来,以免我误人子弟,并且帮助我提高如果你是javascript菜鸟,建议您好好读一读,真的理解下来会有不一样的收获在下才疏学浅, ...
从Java String实例来理解ANSI、Unicode、BMP、UTF等编码概念
转(http://www.codeceo.com/article/java-string-ansi-unicode-bmp-utf.html#0-tsina-1-10971-397232819ff9a ...

随机推荐

mask rcnn训练自己的数据集参考文章（推荐）
最近用Mask_RCNN训练模型,下面几篇文章提供了不少帮助,汇总出来,方便以后查找,并向几位博主老师表示感谢 https://blog.csdn.net/qq_29462849/article/de ...
Docker学习笔记_删除某个镜像
实验:删除某个镜像 sudo docker rmi [Image ID] 1.查看镜像的ID sudo docker images 2.删除镜像 ...
CentOS双网卡双IP设置
CentOS双网卡双IP设置系统环境:CentOS Linux 网络环境: 两个IP地址,192.168.0.10和10.10.30.2,掩码是255.255.255.0,这两个子网的网关地址分别是 ...
atom markdown报错：AssertionError: html-pdf: Failed to load PhantomJS module.
今天安装markdown-pdf之后运行的时候报错: AssertionError: html-pdf: Failed to load PhantomJS module. You have to se ...
jquery延时刷新
setTimeout(function(){ location.replace(location.href); },1000);
Azure 网站、云服务和虚拟机比较
最后更新时间(英文版):09/24/2014 最后更新时间(中文版):04/11/2015 Azure 提供几种方式托管 web 应用程序,如 Azure 网站.云服务和虚拟机.查看这些不同的选项后, ...
Java 并行和并发
并行:指两个或多个事件在同一时刻点进行. 并发:指两个或多个事件在同一时间段进行.
（转）Asp.Net底层原理(三、Asp.Net请求响应过程)
原文地址:http://www.cnblogs.com/liuhf939/archive/2013/09/16/3324753.html 在之前,我们写了自己的Asp.Net框架,对整个流程有了一个大 ...
【Android学习】Android编码规范
四种常见的命名法比较Java和c#的命名规范的不同点常量用大写 java方法首字母不大写,应该小写函数行数限制不要用拼音参照物,Android源码看源码工具,SourceInsight 和 ...
MongoDB整理笔记の新增Shard Server
1.启动一个新Shard Server 进程 [root@localhost ~]# mkdir /data/shard/s2 [root@localhost ~]# /Apps/mongo/bin/ ...

C# 关于utf-8的研究

前提

参考链接

大概的代码解释

全部代码

C# 关于utf-8的研究的更多相关文章

随机推荐

热门专题