https://github.com/BassLC/idUTF8lib

Idiot's UTF-8 Library

A very simple (arguably too simple) UTF-8 library for C++

Usage

#include "lib/idutf8lib.hpp"

Utf8String text; //Empty UTF8 object

Utf8String utf8_text("Héĺĺò Ẃórld"); //std::string compatible constructor

text = "Jello!"; //Supports assignment with std::string AND Utf8String objects

text.to_string(); // == std::string("Jello!")

utf8_text.size_in_chars(); // == 11

utf8_text.size_in_bytes(); // == 18 

utf8_text[0]; // == std::string("H")

utf8_text.sub_utf8str(1,3); // == Utf8String("éĺĺ")

Features

Decodes and parses UTF-8 strings correctly (at least in every case tested so far)
Very lightweight and small: fewer than 200 lines of code in total (not counting the tests)

Requirements

A C++14 compatible compiler

Notes

The Makefile is only used to build and run the tests.

Uses the Catch framework for tests.

Thanks

UTF8-CPP

tiny-utf8

#ifndef UTF8_CPP

#define UTF8_CPP

#include <cstddef>
#include <cstdint>
#include <iosfwd>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

/// A string of UTF-8 encoded text that is stored and addressed one
/// character at a time instead of one byte at a time.
///
/// Positions and lengths in the public interface count characters; the raw
/// bytes of every character are kept together in one element of the
/// underlying container.
class Utf8String {

    /// Storage type: the outer vector has one entry per character, the inner
    /// vector holds the bytes that encode that character.
    using Utf8Struct = std::vector<std::vector<uint8_t>>;

public:

    // ---- Construction / destruction ----

    /// Creates an empty string.
    Utf8String() = default;

    /// Copies another Utf8String.
    Utf8String(const Utf8String &) = default;

    /// Builds a string from already separated per-character byte groups.
    Utf8String(Utf8Struct &&content);

    /// Parses `string` into characters.
    /// Throws std::runtime_error when `string` fails the UTF-8 validity check.
    Utf8String(const std::string &string);

    ~Utf8String() = default;

    // ---- Queries ----

    /// Returns all bytes of the text joined into a single std::string.
    std::string to_string() const;

    /// Number of characters held.
    std::size_t size_in_chars() const;

    /// Number of bytes held, summed over all characters.
    std::size_t size_in_bytes() const;

    /// Removes every character.
    void clear();

    /// Returns the `distance` characters starting at character `initial_pos`;
    /// with the default `distance` the result runs to the end of the text.
    /// Throws std::out_of_range when the requested range does not fit.
    Utf8String sub_utf8str(const std::size_t &initial_pos, const std::size_t &distance = std::string::npos) const;

    // ---- Operators ----

    /// Replaces the content with the characters parsed from `string`.
    void operator=(const std::string &string);

    /// Replaces the content with a copy of `utf8_structure`.
    void operator=(const Utf8String &utf8_structure) noexcept;

    /// Returns the concatenation of this text followed by `utf8_structure`.
    Utf8String operator+(const Utf8String &utf8_structure) const noexcept;

    /// Appends the characters of `utf8_structure` to this text.
    void operator+=(const Utf8String &utf8_structure) noexcept;

    /// Returns the bytes of the character at index `pos` as a std::string.
    /// Throws std::out_of_range when `pos` is not a valid character index.
    std::string operator[](const std::size_t &pos) const;

    /// Writes the text, as produced by to_string(), to `out`.
    friend std::ostream& operator<<(std::ostream &out, const Utf8String &utf8_structure) noexcept;

    /// True when both objects hold the same characters.
    bool operator==(const Utf8String &utf8_structure) const noexcept;

    /// True when to_string() equals `string`.
    bool operator==(const std::string &string) const noexcept;

private:

    /// The characters of the text, in order.
    Utf8Struct content;

    /// Reports whether `string` passes the UTF-8 validity check used by the
    /// std::string constructor.
    bool is_valid_utf8_string(const std::string &string) const;

};

#endif

#include "idutf8lib.hpp"

#include <iostream>

#include <bitset>

#include <exception>

//* Private functions *

/// Checks that `string` is structurally well-formed UTF-8.
///
/// Every byte must be either an ASCII byte (0xxxxxxx) or a lead byte
/// (110xxxxx, 1110xxxx, 11110xxx) followed by exactly one, two or three
/// continuation bytes (10xxxxxx) respectively.
///
/// Only the byte structure is verified: overlong encodings and surrogate
/// code points are not rejected.
///
/// @param string  Bytes to inspect.
/// @return true when every sequence is complete and correctly framed;
///         false on a stray continuation byte, a truncated sequence, a
///         missing continuation byte, or a lead byte announcing more than
///         four bytes (0xF8-0xFF).
bool Utf8String::is_valid_utf8_string(const std::string &string) const {

    for ( std::size_t pos = 0; pos < string.size(); ++pos ) {

        // Work on unsigned values: `char` may be signed, and shifting or
        // masking a negative value would depend on the implementation.
        const auto lead = static_cast<unsigned char>(string[pos]);

        //ASCII character
        if ( (lead & 0x80) == 0x00 ) {
            continue;
        }

        //Continuation character - should NOT be here
        if ( (lead & 0xC0) == 0x80 ) {
            return false;
        }

        //Number of continuation bytes announced by the lead byte
        std::size_t pending = 0;

        if ( (lead & 0xE0) == 0xC0 ) {
            pending = 1;
        } else if ( (lead & 0xF0) == 0xE0 ) {
            pending = 2;
        } else if ( (lead & 0xF8) == 0xF0 ) {
            pending = 3;
        } else {
            // 11111xxx: UTF-8 has no sequences longer than four bytes
            return false;
        }

        for ( ; pending > 0; --pending ) {

            // Sequence cut off by the end of the input
            if ( ++pos >= string.size() ) {
                return false;
            }

            if ( (static_cast<unsigned char>(string[pos]) & 0xC0) != 0x80 ) {
                return false;
            }
        }
    }

    return true;
}

//* Constructors *

/// Builds the string by splitting `string` into one byte group per character.
///
/// @param string  UTF-8 encoded text.
/// @throws std::runtime_error when `string` fails is_valid_utf8_string().
Utf8String::Utf8String(const std::string &string) {

    if ( !is_valid_utf8_string(string) ) {
        throw(std::runtime_error("Invalid UTF8 String in constructor"));
    }

    // Bytes of the multi-byte character currently being collected
    std::vector<uint8_t> utf8_char;

    for ( const auto &chr : string ) {

        // Classify on the unsigned value: `char` may be signed, which would
        // make the result of the shift implementation dependent.
        const auto byte = static_cast<unsigned char>(chr);
        const unsigned start_bits = byte >> 6;   // 0b0x ASCII, 0b10 continuation, 0b11 lead

        if ( (start_bits & 0b10) == 0 ) {

            //ASCII character is pushed after making sure of the character before
            if ( !utf8_char.empty() ) {
                content.push_back(utf8_char);
                utf8_char.clear();
            }

            content.push_back(std::vector<uint8_t>(1, byte));
            continue;

        //If there's more than one byte
        } else if ( start_bits == 0b11 ) {

            //Check to see if it has to flush the last character
            if ( !utf8_char.empty() ) {
                content.push_back(utf8_char);
                utf8_char.clear();
            }
        }

        // Lead and continuation bytes both join the character in progress
        utf8_char.push_back(byte);
    }

    //If last character is non-ASCII
    if ( !utf8_char.empty() ) {
        content.push_back(utf8_char);
    }
}

/// Takes ownership of already separated per-character byte groups.
///
/// `temp` is a named rvalue reference and therefore an lvalue inside the
/// body; it has to be moved explicitly, otherwise the whole structure would
/// be deep-copied.
///
/// @param temp  One inner vector per character; left in a moved-from state.
Utf8String::Utf8String(Utf8Struct &&temp) : content(std::move(temp)) {
}

//* Public Interface *

/// Joins the bytes of every character, in order, into one std::string.
///
/// @return The stored text as a plain byte string.
std::string Utf8String::to_string() const {

    std::string flattened;

    for ( const auto &bytes : content ) {
        flattened.append(bytes.begin(), bytes.end());
    }

    return flattened;
}

/// @return The number of characters stored (one container entry per character).
std::size_t Utf8String::size_in_chars() const {
    const std::size_t character_count = content.size();
    return character_count;
}

/// @return The total number of bytes used by all stored characters.
std::size_t Utf8String::size_in_bytes() const {

    // Running total; must start at zero
    std::size_t size = 0;

    for ( const auto &chr : content ) {
        size += chr.size();
    }

    return size;
}

/// Removes every character, leaving an empty string.
void Utf8String::clear() {
    content.clear();
}

/// Extracts a run of characters.
///
/// @param initial_pos  Index of the first character to copy; must be smaller
///                     than size_in_chars().
/// @param distance     Number of characters to copy; std::string::npos means
///                     "up to the end of the string".
/// @return A new Utf8String holding the selected characters.
/// @throws std::out_of_range when `initial_pos` is not a valid index or the
///         requested run extends past the end of the string.
Utf8String Utf8String::sub_utf8str(const std::size_t &initial_pos, const std::size_t &distance) const {

    const std::size_t total = content.size();

    if ( initial_pos >= total ) {
        throw std::out_of_range("Too big substr access");
    }

    // Characters available from initial_pos onwards (at least 1 here).
    const std::size_t available = total - initial_pos;

    // Compare against the remaining length instead of computing
    // initial_pos + distance: that sum can wrap around for a huge distance,
    // slip past the bounds check and yield a reversed iterator range.
    if ( distance != std::string::npos && distance > available ) {
        throw std::out_of_range("Too big substr access");
    }

    const std::size_t length = (distance == std::string::npos) ? available : distance;
    const auto first = content.begin() + static_cast<Utf8Struct::difference_type>(initial_pos);
    const auto last = first + static_cast<Utf8Struct::difference_type>(length);

    return Utf8String(Utf8Struct(first, last));
}

//* Operators *

/// Replaces the content with the characters parsed from `string`.
///
/// @param string  UTF-8 encoded text.
/// @throws std::runtime_error (from the std::string constructor) when
///         `string` is rejected; the current content is left untouched then.
void Utf8String::operator=(const std::string &string) {

    // Parse into a temporary first so a failure cannot corrupt *this ...
    Utf8String temp(string);

    // ... then take over its storage instead of deep-copying it.
    content = std::move(temp.content);
}

/// Replaces the content with a copy of the characters of `utf8_object`.
///
/// NOTE(review): declared noexcept although copying the container allocates;
/// an allocation failure here would terminate the program.
void Utf8String::operator=(const Utf8String &utf8_object) noexcept {
    const Utf8Struct &source = utf8_object.content;
    content = source;
}

/// Returns the character at index `pos` as a std::string of its bytes.
///
/// @param pos  Zero-based character index.
/// @throws std::out_of_range when `pos` is not smaller than size_in_chars().
std::string Utf8String::operator[](const std::size_t &pos) const {

    if ( pos < content.size() ) {
        const auto &bytes = content[pos];
        return std::string(bytes.begin(), bytes.end());
    }

    throw std::out_of_range("Bad UTF-8 range access with []");
}

/// Concatenation: returns a new string made of this string's characters
/// followed by those of `utf8_structure`. Neither operand is modified.
Utf8String Utf8String::operator+(const Utf8String &utf8_structure) const noexcept {

    const Utf8Struct &right = utf8_structure.content;

    // Start from a copy of the left operand and append the right one to it
    Utf8Struct joined(content);
    joined.insert(joined.end(), right.begin(), right.end());

    return Utf8String(std::move(joined));
}

/// Appends the characters of `utf8_structure` to this string.
///
/// Appending a string to itself (`s += s`) is supported.
void Utf8String::operator+=(const Utf8String &utf8_structure) noexcept {

    // A vector must not be given iterators into itself as the source range
    // of insert(); when both operands are the same object, append from a
    // snapshot instead.
    if ( this == &utf8_structure ) {
        const Utf8Struct snapshot = content;
        content.insert(std::end(content), std::begin(snapshot), std::end(snapshot));
        return;
    }

    content.insert(std::end(content), std::begin(utf8_structure.content), std::end(utf8_structure.content));
}

/// Streams the text to `out` as the byte string produced by to_string().
///
/// @return `out`, to allow chaining.
std::ostream& operator<<(std::ostream &out, const Utf8String &utf8_structure) noexcept{
    const std::string flattened = utf8_structure.to_string();
    return out << flattened;
}

/// @return true when both strings hold exactly the same characters.
bool Utf8String::operator==(const Utf8String &utf8_structure) const noexcept {
    const Utf8Struct &other = utf8_structure.content;
    return content == other;
}

/// @return true when the bytes of this string, as produced by to_string(),
///         are equal to `string`.
bool Utf8String::operator==(const std::string &string) const noexcept {
    const std::string flattened = to_string();
    return flattened == string;
}

utf8 string的更多相关文章

java.lang.ClassFormatError: Illegal UTF8 string in constant pool in class file Server/Request
Linux服务器上,将本地编译好的文件上传后,Tomcat启动时报错: Exception in thread "Thread-2" java.lang.ClassFormatEr ...
C++Builder RAD Studio XE, UTF-8 String 转换为 char * 字符串的最简单方式, 常用于sqlite3开发
前段时间突然使用sqlite3开发,中间需要用中文,XE的缺省char*直接使用中文,在sqlite *.db3的数据库表格中显示是乱码,用数据库管理器来浏览等管理时非常不便. 于是决定还是使用utf ...
SOAP-ERROR: Encoding: string … is not a valid utf-8 string
今天遇到一个错误,看标题就知道是什么错误了.... 最坑爹的是,不是所有的用户会报这个错误.只有少部分.在生产环境又没办法调试. 找了半天都不知道什么原因,字面意思大概是需要一个utf8编码的字符串, ...
在 Perl看来, 字符串只有两种形式. 一种是octets, 即8位序列, 也就是我们通常说的字节数组. 另一种utf8编码的字符串, perl管它叫string. 也就是说: Perl只熟悉两种编
在 Perl看来, 字符串只有两种形式. 一种是octets, 即8位序列, 也就是我们通常说的字节数组. 另一种utf8编码的字符串, perl管它叫string. 也就是说: Perl只熟悉两种编 ...
QString，string，char* 在utf8和gbk不同编码下的相互转化
关于编码简介:ascii编码是最开始的编码规则本,里面只收纳了英文.特殊字符.数字等有限字符,采用的是8位一个字节的方式进行编码对照:unicode在ascii码的基础上进行了升级扩展,立志将全世界所 ...
new String(getBytes(ISO-8859-1),UTF-8)中文编码避免乱码
byte[] b_gbk = "深".getBytes("GBK"); byte[] b_utf8 = "深".getBytes(" ...
构造UTF8的std::string
在VC++的世界里,MS比较鼓励使用_UNICODE,std::wstring.而在Web, XML则提倡用UTF8.当在C++的程序里要保存/读取XML数据,就存在wstring与string之间的 ...
Java读带有BOM的UTF-8文件乱码原因及解决方法
原因: 关于utf-8编码的txt文件,windows以记事本方式保存时会在第一行最开始处自动加入bom格式的相关信息,大概三个字节! 所以java在读取此类文件时第一行时会多出三个不相关的字节,这样 ...
XML编码utf-8有中文无法解析或乱码 C#
XML的encoding="UTF-8" ,含有中文的话(部分)会出现乱码. 网上还是很多这类问题跟解决办法的. 表现为用ie或者infopath之类的xml软件打不开这个xml, ...

随机推荐

Centos7 minimal 系列之Redis集群搭建（六）
一.redis安装借鉴上篇博客:http://www.cnblogs.com/WJ--NET/p/8176071.html 二.集群搭建 2.1.创建文件夹 mkdir redis_cluster ...
sql server 授权相关命令
原文:https://blog.csdn.net/hfdgjhv/article/details/83834076 https://www.cnblogs.com/shi-yongcui/p/7755 ...
JavaScript实现乘法表
JavaScript实现乘法表 <script type="text/javascript"> function c(n,m) { ...
js复制克隆
$(function() {//开始加载updateIndex()}) function add(){ var str = $(".tr_wqxx").first().clone( ...
算法之 aabb
题目描述:输出所有形如aabb的4位完全平方数(即前两位数字相等,后两位数字也相等). 分支和循环结合在一起时功能强大: 下面列举所有可能的结果aabb,然后判断它们是否为完全平方数.注意a的范围是1 ...
mac上卸载node
//卸载方法一有时手贱看到新版本就升级,升级后发现一堆模块不能用了,心情好慢慢调,但也有调不好的时候,只能卸载重装低版本的node了. 我的机器环境如下 1. Mac OSX 10.10.3 2. ...
ZBrush中物体的显示与隐藏
在ZBrush®中除了遮罩功能可以对局部网格进行编辑外,通过显示和隐藏局部网格也可以对局部进行控制.选择网格的控制都是手动操作,在软件中并没有相应的命令进行操作.选择局部网格的工作原理也很简单,即被选 ...
ZBrush中如何实现智能对称
ZBrush软件智能化和人性化的工作流程让用户在创作中提高工作效率,体验创作乐趣,说起智能化不得不提的就是ZBrush 4R8®给我们提供的智能对称功能,所谓的智能对称就是当您在编辑其中一半的物体模型 ...
密信(MeSince)，将取代传统电子邮件
电子邮件发展至今已经有几十年的历史,但仍然是最重要的现代互联网应用之一.在全球范围内,每小时发送的非垃圾邮件数量超过30亿封,从工作场景的使用到个人生活,电子邮件都扮演着不可或缺的角色.但是由于明文电 ...
HDU 5289 Assignment [优先队列贪心]
HDU 5289 - Assignment http://acm.hdu.edu.cn/showproblem.php?pid=5289 Tom owns a company and he is th ...

utf8 string

Idiot's UTF-8 Library

Usage

Features

Requirements

Notes

Thanks

utf8 string的更多相关文章

随机推荐

热门专题