C# 关于utf-8的研究
前提
如果一不小心把字符转成utf8的格式,但是却产生了乱码。这个时候要么就是寻找其他的转码方式,要么就不想要了,直接过滤吧。
这里说的是直接过滤的办法。
参考链接
https://netvignettes.wordpress.com/2011/07/03/how-to-detect-encoding/
大概的代码解释
其实主要的思路就是对照 UTF-8 编码表(不过貌似它也不是严格对照的),比如下面的代码就是根据首字节的取值范围来判断一个字符占用的 bytes 数量,并检查后续字节是否为合法的延续字节

/// <summary>
/// Tests whether <paramref name="b"/> has the bit pattern of a 4-byte UTF-8 lead byte (0xF0-0xF7).
/// </summary>
/// <param name="b">The byte to classify.</param>
/// <returns><c>true</c> when the byte lies in the range 0xF0-0xF7 inclusive.</returns>
private static bool IsLead4(byte b) => 0xF0 <= b && b <= 0xF7;
/// <summary>
/// Tests whether <paramref name="b"/> has the bit pattern of a 3-byte UTF-8 lead byte (0xE0-0xEF).
/// </summary>
/// <param name="b">The byte to classify.</param>
/// <returns><c>true</c> when the byte lies in the range 0xE0-0xEF inclusive.</returns>
private static bool IsLead3(byte b) => 0xE0 <= b && b <= 0xEF;
/// <summary>
/// Tests whether <paramref name="b"/> has the bit pattern of a 2-byte UTF-8 lead byte (0xC0-0xDF).
/// </summary>
/// <param name="b">The byte to classify.</param>
/// <returns><c>true</c> when the byte lies in the range 0xC0-0xDF inclusive.</returns>
private static bool IsLead2(byte b) => 0xC0 <= b && b <= 0xDF;
/// <summary>
/// Tests whether <paramref name="b"/> is a UTF-8 continuation byte, i.e. has the bit pattern 10xxxxxx.
/// </summary>
/// <param name="b">The byte to classify.</param>
/// <returns><c>true</c> when the byte lies in the range 0x80-0xBF inclusive.</returns>
private static bool IsExtendedByte(byte b)
{
    // The lower bound is inclusive: 0x80 (binary 10000000) is a valid continuation byte.
    // Excluding it would reject common characters such as U+0440 (bytes D1 80).
    return b >= 0x80 && b < 0xC0;
}
接下来就是注意一下特殊字符(例如文件开头的 BOM)的边界情况
// Excerpt from the body of GetTextEncoding: `bytes`, `offset` and `length` are that
// method's (already validated) parameters.
//
// Look for a byte order mark. The four-byte signatures are tested first because the
// UTF-32 little-endian BOM (FF FE 00 00) begins with the UTF-16 little-endian BOM (FF FE).
if (length >= 4)
{
    var one = bytes[offset];
    var two = bytes[offset + 1];
    var three = bytes[offset + 2];
    var four = bytes[offset + 3];
    if (one == 0x2B &&
        two == 0x2F &&
        three == 0x76 &&
        (four == 0x38 || four == 0x39 || four == 0x2B || four == 0x2F))
    {
        return UTF7;
    }
    else if (one == 0xFF && two == 0xFE && three == 0x00 && four == 0x00)
    {
        // UTF-32, little endian.
        return UTF32;
    }
    else if (one == 0x00 && two == 0x00 && three == 0xFE && four == 0xFF)
    {
        // UTF-32, big endian.
        return new UTF32Encoding(true, true);
    }
}
// These checks are independent of the four-byte block above so that buffers of four
// or more bytes are still tested for the shorter signatures.
if (length >= 3 &&
    bytes[offset] == 0xEF && bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF)
{
    return UTF8;
}
if (length >= 2)
{
    var one = bytes[offset];
    var two = bytes[offset + 1];
    if (one == 0xFF && two == 0xFE)
    {
        return Unicode;
    }
    else if (one == 0xFE && two == 0xFF)
    {
        return BigEndianUnicode;
    }
}
if (length > 1)
{
    // Look for a leading < sign (XML/HTML); the position of the zero byte next to it
    // distinguishes the two UTF-16 byte orders from a single-byte-compatible encoding.
    if (bytes[offset] == 0x3C)
    {
        if (bytes[offset + 1] == 0x00)
        {
            return Unicode;
        }
        else
        {
            return UTF8;
        }
    }
    else if (bytes[offset] == 0x00 && bytes[offset + 1] == 0x3C)
    {
        return BigEndianUnicode;
    }
}
// No signature found: fall back to structural validation of the examined range only.
if (IsUtf8(bytes, offset, length))
{
    return UTF8;
}
接下来就是测试
/// <summary>
/// Scratch driver: decodes a sample buffer as UTF-8 and asks <c>GetTextEncoding</c> to guess its encoding.
/// Inspect the locals in a debugger; nothing is printed.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Sample texts in different scripts; swap one into GetBytes below to experiment.
    string ch = "金";
    string Ja = "らなくちゃ";
    string Re = "фыввфывфывфв";

    // Sample input: a byte sequence that is not well-formed UTF-8.
    byte[] Rom = { 209, 132, 209, 34, 90, 121, 5, 34, 208 };
    byte[] byteArrayUTF8 = UTF8.GetBytes(Ja);

    // Decoding never throws here; malformed sequences become U+FFFD replacement characters.
    string Name = Encoding.UTF8.GetString(Rom, 0, Rom.Length);
    var y = GetTextEncoding(Rom);
}
全部代码
using System;
using System.Text;
using static System.Text.Encoding; namespace ConsoleApp1
{
/// <summary>
/// Heuristics for guessing the text encoding of a byte buffer: byte-order-mark sniffing,
/// a leading '&lt;' check, and structural UTF-8 validation.
/// </summary>
class Program
{
    /// <summary>
    /// Result of classifying the bytes at one position of a buffer as a UTF-8 sequence.
    /// The numeric value of each member is the number of bytes the sequence occupies,
    /// which <see cref="IsUtf8"/> relies on to advance through the buffer.
    /// </summary>
    public enum MultibyteRank
    {
        None = 0,
        One = 1,
        Two = 2,
        Three = 3,
        Four = 4
    }

    /// <summary>
    /// Determines whether the bytes in this buffer at the specified offset represent a UTF-8 character,
    /// and how many bytes it spans.
    /// </summary>
    /// <remarks>
    /// It is not guaranteed that these bytes represent a sensical character - only that the binary
    /// pattern (lead byte followed by the right number of continuation bytes) matches UTF-8 encoding.
    /// </remarks>
    /// <param name="bytes">This buffer.</param>
    /// <param name="offset">The position in the buffer to check.</param>
    /// <param name="length">The number of bytes to check (1-4), or 4 if not specified.</param>
    /// <returns>
    /// The length of the sequence starting at <paramref name="offset"/>, or
    /// <see cref="MultibyteRank.None"/> when no valid pattern fits into <paramref name="length"/> bytes.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="offset"/> or <paramref name="length"/> is invalid, or the range exceeds the buffer.
    /// </exception>
    public static MultibyteRank GetUtf8MultibyteRank(byte[] bytes, int offset = 0, int length = 4)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }
        if (offset < 0 || offset > bytes.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
        }
        if (length < 1 || length > 4)
        {
            throw new ArgumentOutOfRangeException(nameof(length), "Only values 1-4 are valid.");
        }
        if (length > bytes.Length - offset)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
        }

        byte lead = bytes[offset];

        // Possible 4 byte sequence
        if (length > 3 && IsLead4(lead))
        {
            if (IsExtendedByte(bytes[offset + 1]) && IsExtendedByte(bytes[offset + 2]) && IsExtendedByte(bytes[offset + 3]))
            {
                return MultibyteRank.Four;
            }
        }
        // Possible 3 byte sequence
        else if (length > 2 && IsLead3(lead))
        {
            if (IsExtendedByte(bytes[offset + 1]) && IsExtendedByte(bytes[offset + 2]))
            {
                return MultibyteRank.Three;
            }
        }
        // Possible 2 byte sequence
        else if (length > 1 && IsLead2(lead) && IsExtendedByte(bytes[offset + 1]))
        {
            return MultibyteRank.Two;
        }

        // Anything below 0x80 is a single-byte (ASCII) character; every other byte that
        // reached this point is a stray continuation byte or a truncated/broken sequence.
        return lead < 0x80 ? MultibyteRank.One : MultibyteRank.None;
    }

    /// <summary>Tests for the bit pattern of a 4-byte lead byte (0xF0-0xF7).</summary>
    private static bool IsLead4(byte b)
    {
        return b >= 0xF0 && b < 0xF8;
    }

    /// <summary>Tests for the bit pattern of a 3-byte lead byte (0xE0-0xEF).</summary>
    private static bool IsLead3(byte b)
    {
        return b >= 0xE0 && b < 0xF0;
    }

    /// <summary>Tests for the bit pattern of a 2-byte lead byte (0xC0-0xDF).</summary>
    private static bool IsLead2(byte b)
    {
        return b >= 0xC0 && b < 0xE0;
    }

    /// <summary>Tests for a continuation byte, i.e. the bit pattern 10xxxxxx (0x80-0xBF).</summary>
    private static bool IsExtendedByte(byte b)
    {
        // The lower bound is inclusive: 0x80 is a valid continuation byte (e.g. U+0440 is D1 80).
        return b >= 0x80 && b < 0xC0;
    }

    /// <summary>
    /// Checks whether a range of a buffer consists entirely of structurally valid UTF-8 sequences.
    /// </summary>
    /// <param name="bytes">The buffer to examine.</param>
    /// <param name="offset">The offset into the buffer to begin examination, or 0 if not specified.</param>
    /// <param name="length">The number of bytes to examine, or everything after <paramref name="offset"/> if null.</param>
    /// <returns>
    /// <c>true</c> if every byte in the range belongs to a valid sequence (an empty range is valid);
    /// <c>false</c> as soon as an invalid or truncated sequence is found.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="offset"/> or <paramref name="length"/> is invalid, or the range exceeds the buffer.
    /// </exception>
    public static bool IsUtf8(byte[] bytes, int offset = 0, int? length = null)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }
        if (offset < 0 || offset > bytes.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
        }
        int bytesRemaining = length ?? (bytes.Length - offset);
        if (bytesRemaining < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(length));
        }
        if (bytesRemaining > bytes.Length - offset)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
        }

        int position = offset;
        while (bytesRemaining > 0)
        {
            // Never look further than the examined range, so a sequence cut off at the end is rejected.
            var rank = GetUtf8MultibyteRank(bytes, position, Math.Min(4, bytesRemaining));
            if (rank == MultibyteRank.None)
            {
                return false;
            }

            // The enum value is the number of bytes the sequence occupied.
            int bytesRead = (int)rank;
            position += bytesRead;
            bytesRemaining -= bytesRead;
        }
        return true;
    }

    /// <summary>
    /// Uses various discovery techniques to guess the encoding used for a byte buffer presumably containing text characters.
    /// </summary>
    /// <remarks>
    /// Note that this is only a guess and could be incorrect. Be prepared to catch exceptions while using the
    /// <see cref="Decoder"/> obtained from the encoding returned by this method.
    /// </remarks>
    /// <param name="bytes">The buffer containing the bytes to examine.</param>
    /// <param name="offset">The offset into the buffer to begin examination, or 0 if not specified.</param>
    /// <param name="length">The number of bytes to examine, or everything after <paramref name="offset"/> if null.</param>
    /// <returns>An encoding, or <see langword="null"/> if one cannot be determined.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="offset"/> or <paramref name="length"/> is invalid, or the range exceeds the buffer.
    /// </exception>
    public static Encoding GetTextEncoding(byte[] bytes, int offset = 0, int? length = null)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }
        if (offset < 0 || offset > bytes.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
        }
        // Default to "the rest of the buffer" so that a non-zero offset works without an explicit length.
        int count = length ?? (bytes.Length - offset);
        if (count < 0 || count > bytes.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(length), "Length is out of range.");
        }
        if (count > bytes.Length - offset)
        {
            throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
        }

        // Look for a byte order mark. The four-byte signatures are tested first because the
        // UTF-32 little-endian BOM (FF FE 00 00) begins with the UTF-16 little-endian BOM (FF FE).
        if (count >= 4)
        {
            var one = bytes[offset];
            var two = bytes[offset + 1];
            var three = bytes[offset + 2];
            var four = bytes[offset + 3];
            if (one == 0x2B &&
                two == 0x2F &&
                three == 0x76 &&
                (four == 0x38 || four == 0x39 || four == 0x2B || four == 0x2F))
            {
                return UTF7;
            }
            if (one == 0xFF && two == 0xFE && three == 0x00 && four == 0x00)
            {
                // UTF-32, little endian.
                return UTF32;
            }
            if (one == 0x00 && two == 0x00 && three == 0xFE && four == 0xFF)
            {
                // UTF-32, big endian.
                return new UTF32Encoding(true, true);
            }
        }
        // The shorter signatures are tested independently of the block above so that
        // buffers of four or more bytes are still checked for them.
        if (count >= 3 &&
            bytes[offset] == 0xEF && bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF)
        {
            return UTF8;
        }
        if (count >= 2)
        {
            var one = bytes[offset];
            var two = bytes[offset + 1];
            if (one == 0xFF && two == 0xFE)
            {
                return Unicode;
            }
            if (one == 0xFE && two == 0xFF)
            {
                return BigEndianUnicode;
            }
        }

        if (count > 1)
        {
            // Look for a leading < sign (XML/HTML); the position of the zero byte next to it
            // distinguishes the two UTF-16 byte orders from a single-byte-compatible encoding.
            if (bytes[offset] == 0x3C)
            {
                return bytes[offset + 1] == 0x00 ? Unicode : UTF8;
            }
            if (bytes[offset] == 0x00 && bytes[offset + 1] == 0x3C)
            {
                return BigEndianUnicode;
            }
        }

        // No signature found: validate only the examined range. Null means "impossible to tell".
        return IsUtf8(bytes, offset, count) ? UTF8 : null;
    }

    /// <summary>
    /// Scratch driver: decodes a sample buffer as UTF-8 and asks <see cref="GetTextEncoding"/> to guess
    /// its encoding. Inspect the locals in a debugger; nothing is printed.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Sample texts in different scripts; swap one into GetBytes below to experiment.
        string ch = "金";
        string Ja = "らなくちゃ";
        string Re = "фыввфывфывфв";

        // Sample input: a byte sequence that is not well-formed UTF-8.
        byte[] Rom = { 209, 132, 209, 34, 90, 121, 5, 34, 208 };
        byte[] byteArrayUTF8 = UTF8.GetBytes(Ja);

        // Decoding never throws here; malformed sequences become U+FFFD replacement characters.
        string Name = Encoding.UTF8.GetString(Rom, 0, Rom.Length);
        var y = GetTextEncoding(Rom);
    }
}
}
C# 关于utf-8的研究的更多相关文章
- java io 源码研究记录(一)
Java IO 源码研究: 一.输入流 1 基类 InputStream 简介: 这是Java中所有输入流的基类,它是一个抽象类,下面我们简单来了解一下它的基本方法和抽象方法. 基本方法: publ ...
- 闲来无聊,研究一下Web服务器 的源程序
web服务器是如何工作的 1989年的夏天,蒂姆.博纳斯-李开发了世界上第一个web服务器和web客户机.这个浏览器程序是一个简单的电话号码查询软件.最初的web服务器程序就是一个利用浏览器和web服 ...
- SQLSERVER聚集索引与非聚集索引的再次研究(上)
SQLSERVER聚集索引与非聚集索引的再次研究(上) 上篇主要说聚集索引 下篇的地址:SQLSERVER聚集索引与非聚集索引的再次研究(下) 由于本人还是SQLSERVER菜鸟一枚,加上一些实验的逻 ...
- 深入研究Visual studio 2017 RC新特性
在[Xamarin+Prism开发详解三:Visual studio 2017 RC初体验]中分享了Visual studio 2017RC的大致情况,同时也发现大家对新的Visual Studio很 ...
- 【初码干货】使用阿里云对Web开发中的资源文件进行CDN加速的深入研究和实践
提示:阅读本文需提前了解的相关知识 1.阿里云(https://www.aliyun.com) 2.阿里云CDN(https://www.aliyun.com/product/cdn) 3.阿里云OS ...
- 对一致性Hash算法,Java代码实现的深入研究
一致性Hash算法 关于一致性Hash算法,在我之前的博文中已经有多次提到了,MemCache超详细解读一文中"一致性Hash算法"部分,对于为什么要使用一致性Hash算法.一致性 ...
- SQLSERVER聚集索引与非聚集索引的再次研究(下)
SQLSERVER聚集索引与非聚集索引的再次研究(下) 上篇主要说了聚集索引和简单介绍了一下非聚集索引,相信大家一定对聚集索引和非聚集索引开始有一点了解了. 这篇文章只是作为参考,里面的观点不一定正确 ...
- 开源Word读写组件DocX 的深入研究和问题总结
一. 前言 前两天看到了asxinyu大神的[原创]开源Word读写组件DocX介绍与入门,正好我也有类似的自动生成word文档得需求,于是便仔细的研究了这个DocX. 我也把它融入到我的项目当中并进 ...
- 【移动端兼容问题研究】javascript事件机制详解(涉及移动兼容)
前言 这篇博客有点长,如果你是高手请您读一读,能对其中的一些误点提出来,以免我误人子弟,并且帮助我提高 如果你是javascript菜鸟,建议您好好读一读,真的理解下来会有不一样的收获 在下才疏学浅, ...
- 从Java String实例来理解ANSI、Unicode、BMP、UTF等编码概念
转(http://www.codeceo.com/article/java-string-ansi-unicode-bmp-utf.html#0-tsina-1-10971-397232819ff9a ...
随机推荐
- Solidity notes
1. 查询transaction历史记录 https://forum.ethereum.org/discussion/2116/in-what-ways-can-storage-history-be- ...
- 无返回值的函数如何捕获出错情况(检查errno常量)
在执行这个函数前,先清除errno,函数返回时,检查errno常量. 每次程序调用失败的时候,系统会自动用用错误代码填充errno这个全局变量,这样你只需要读errno这个全局变量就可以获得失败原因了 ...
- 如何实现字符串的翻转,不用php库函数翻转字符串
- URAL 1141. RSA Attack(欧拉定理+扩展欧几里得+快速幂模)
题目链接 题意 : 给你n,e,c,并且知道me ≡ c (mod n),而且n = p*q,pq都为素数. 思路 : 这道题的确与题目名字很相符,是个RSA算法,目前地球上最重要的加密算法.RSA算 ...
- (转)自制AutoMapper实现DTO到持久层Entity的转换
原文地址:http://www.cnblogs.com/qidian10/p/3173907.html 项目中经常涉及到页面DTO更新,保存到数据库的操作,这就必然牵扯到DTO和持久层对象的转换,常见 ...
- tomcat启动startup.bat一闪而过
编辑startup.bat,在文本最后添加PAUSE,保存后打开startup.bat,此时窗口会暂停,并出现错误信息,然后按照错误提示纠正即可!
- mongodb数据库学习【安装及简单增删改查】
//@desn:mongodb数据库学习 //@desn:码字不宜,转载请注明出处 //@author:张慧源 <turing_zhy@163.com> //@date:2018/08/ ...
- C++初始化,之不明白篇 cout<<x<<endl 与 cout<<"x = "<<cout<<x<<endl的输出的值会不一样
代码如下 #include <iostream> using namespace std; class point { public : int x; int y; ...
- webservice不能序列化接口问题,返回值为IList或者参数为接口的解决办法。
1. webservice 不能返回泛型接口集合IList,解决办法如下链接: 参考资料:http://www.cnblogs.com/yinhaiming/articles/1379424.html ...
- IO流-File,字节流,缓冲流
1.1 IO概述 回想之前写过的程序,数据都是在内存中,一旦程序运行结束,这些数据都没有了,等下次再想使用这些数据,可是已经没有了.那怎么办呢?能不能把运算完的数据都保存下来,下次程序启动的时候,再把 ...