基于COCA词频表的文本词汇分布测试工具v0.2
update:
- 简单整理了一下代码的组织。
- 处理的单词封装成类,单词的修正、信息的显示都作为类内的方法。
写得还比较糙:工具本身可以封装,对单词的变形也基本没什么处理,以后有时间再改。
项目托管到github上了。https://github.com/MorpheusDong/TextVocabularyAnalyzer
TypeDefine.h
#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <iostream>
#include <fstream>
#include <string>
#include <array>
#include <vector>
#include <iterator>
#include <map>

using namespace std;

//number of entries (lines) read from the COCA frequency list file
#define COCA_WORDS_NUM 20201U

//one word bucket per initial letter 'a'..'z'
#define WORDS_HEAD_NUM 26U

//bucket indices for each initial letter
#define WORDS_HEAD_A 0U
#define WORDS_HEAD_B 1U
#define WORDS_HEAD_C 2U
#define WORDS_HEAD_D 3U
#define WORDS_HEAD_E 4U
#define WORDS_HEAD_F 5U
#define WORDS_HEAD_G 6U
#define WORDS_HEAD_H 7U
#define WORDS_HEAD_I 8U
#define WORDS_HEAD_J 9U
#define WORDS_HEAD_K 10U
#define WORDS_HEAD_L 11U
#define WORDS_HEAD_M 12U
#define WORDS_HEAD_N 13U
#define WORDS_HEAD_O 14U
#define WORDS_HEAD_P 15U
#define WORDS_HEAD_Q 16U
#define WORDS_HEAD_R 17U
#define WORDS_HEAD_S 18U
#define WORDS_HEAD_T 19U
#define WORDS_HEAD_U 20U
#define WORDS_HEAD_V 21U
#define WORDS_HEAD_W 22U
#define WORDS_HEAD_X 23U
#define WORDS_HEAD_Y 24U
#define WORDS_HEAD_Z 25U

//size of the usual_w_out_of_COCA_str table below
#define USUAL_WORD_NUM 17U

//frequency-rank buckets used for classification and the final report
typedef enum WordFrequencyType
{
    WORD_UNDER_4000 = 0,
    WORD_4000_6000,
    WORD_6000_8000,
    WORD_8000_10000,
    WORD_10000_12000,
    WORD_12000_14000,
    WORD_14000_16000,
    WORD_OVER_16000,
    WORD_NOT_FOUND_COCA,
    WORD_LEVEL_NUM          //bucket count, not a real level
}TagWordFrequencyType;

const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

//report labels, indexed by TagWordFrequencyType
const string report_str[WORD_LEVEL_NUM] = {
    "UNDER 4000: ",
    "4000-6000: ",
    "6000-8000: ",
    "8000-10000: ",
    "10000-12000: ",
    "12000-14000: ",
    "14000-16000: ",
    "16000-20000+: ",
    "\nNot found in COCA:"
};

//for usual words not included in COCA
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
{
    "s","is","are","re","was","were",
    "an","won","t","has","had","been",
    "did","does","cannot","got","men"
};

#endif
TextVocabularyAnalyzer.h
#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_ #include "TypeDefine.h" extern TagWordFrequencyType frequency_classify(const int wfrq);
extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
extern bool isaletter(const char& c); class CLetters
{
private:
string m_word; public:
CLetters();
~CLetters();
void fill(vector<char>& vw);
const string word();
const char firstletter();
void processing();
bool usual_recheck();
bool form_recheck();
}; #endif // !_TEXT_VOCABULARY_ANALYZER_H_
TextVocabularyAnalyzer.cpp
/* TextVocabularyAnalyzer.cpp */

#include <algorithm>
#include <cctype>
#include "TextVocabularyAnalyzer.h"

//Maps a COCA frequency rank to its report bucket.
//A rank of 0 means "word not present in the COCA list".
TagWordFrequencyType frequency_classify(const int wfrq)
{
    if (wfrq == 0)
    {
        return WORD_NOT_FOUND_COCA;
    }
    if (wfrq < 0)
    {
        //negative ranks never come from the list; keep the original
        //catch-all behaviour for them
        return WORD_OVER_16000;
    }
    if (wfrq <= 4000)  { return WORD_UNDER_4000; }
    if (wfrq <= 6000)  { return WORD_4000_6000; }
    if (wfrq <= 8000)  { return WORD_6000_8000; }
    if (wfrq <= 10000) { return WORD_8000_10000; }
    if (wfrq <= 12000) { return WORD_10000_12000; }
    if (wfrq <= 14000) { return WORD_12000_14000; }
    if (wfrq <= 16000) { return WORD_14000_16000; }
    return WORD_OVER_16000;
}

//Adds one word to the counter of its bucket.
//Each valid bucket tag is also its own index in the counter array;
//anything out of range is counted as "not found" (the original switch
//did exactly this via its default case).
void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
    if (wfrq_tag >= WORD_UNDER_4000 && wfrq_tag <= WORD_OVER_16000)
    {
        wfrq_array[wfrq_tag] += 1;
    }
    else
    {
        wfrq_array[WORD_NOT_FOUND_COCA] += 1;
    }
}

//True for ASCII letters only; no locale handling.
bool isaletter(const char& c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

//Class CLetters realization

CLetters::CLetters()
{
    m_word = "";
}

CLetters::~CLetters()
{
    //do nothing
}

//Builds the word from raw characters; stored lower-cased so lookups
//against the (lower-cased) COCA map are case-insensitive.
void CLetters::fill(vector<char>& vw)
{
    m_word.assign(vw.begin(), vw.end());
    //lambda + unsigned char cast: passing tolower directly to transform is
    //ambiguous under "using namespace std" and UB for negative char values
    transform(m_word.begin(), m_word.end(), m_word.begin(),
              [](unsigned char c) { return static_cast<char>(tolower(c)); });
}

const string CLetters::word()
{
    return m_word;
}

const char CLetters::firstletter()
{
    return m_word[0];
}

//Progress trace printed before each lookup.
void CLetters::processing()
{
    cout << "Finding word \"" << m_word << "\"...\t";
}

//True if the word is in the hand-made list of common words that the
//COCA table does not contain (e.g. "is", "was", "been").
bool CLetters::usual_recheck()
{
    for (unsigned int i = 0; i < USUAL_WORD_NUM; i++)
    {
        if (m_word == usual_w_out_of_COCA_str[i])
        {
            return true;    //early exit: one hit is enough
        }
    }
    return false;
}

//Crude inflection handling: strips one trailing -s, -ed or -ing so the
//base form can be looked up again.  Mutates m_word; returns true when a
//suffix was removed.  Words of length <= 3 are left untouched.
bool CLetters::form_recheck()
{
    bool RetVal = false;
    if (m_word.length() > 3)
    {
        const char e3 = m_word[m_word.length() - 3]; //last but two letter
        const char e2 = m_word[m_word.length() - 2]; //last but one letter
        const char e1 = m_word[m_word.length() - 1]; //last letter

        if (e1 == 's')
        {
            m_word.erase(m_word.length() - 1);  //erase to end: drop "s"
            RetVal = true;
        }
        else if (e2 == 'e' && e1 == 'd')
        {
            m_word.erase(m_word.length() - 2);  //drop "ed"
            RetVal = true;
        }
        else if (e3 == 'i' && e2 == 'n' && e1 == 'g')
        {
            m_word.erase(m_word.length() - 3);  //drop "ing"
            RetVal = true;
        }
    }
    return RetVal;
}
main.cpp
/* main .cpp */

#include <numeric>
#include <iomanip>
#include <ctime>
#include <algorithm>    //transform/find: previously relied on transitive includes
#include <cctype>
#include "TextVocabularyAnalyzer.h"

int main()
{
    //file init
    ifstream COCA_txt("D:\\COCA.txt");
    ifstream USER_txt("D:\\JobsSpeech.txt");

    //time init
    clock_t startTime, endTime;
    double build_map_time = 0;
    double process_time = 0;

    startTime = clock();    //build time start

    //build COCA words map: one map per initial letter
    map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
    int readlines = 0;
    while (readlines < COCA_WORDS_NUM)
    {
        int frequency = 0;
        string word = "";
        COCA_txt >> frequency;
        COCA_txt >> word;

        //guard against a short/missing list file: never index an empty word
        if (word.empty())
        {
            readlines++;
            continue;
        }

        //transform to lower uniformly (unsigned char cast avoids UB and the
        //tolower overload ambiguity under "using namespace std")
        transform(word.begin(), word.end(), word.begin(),
                  [](unsigned char c) { return static_cast<char>(tolower(c)); });

        //import the word into the bucket of its first letter
        for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
        {
            if (word[0] == alphabet_str[whead])
            {
                //if a word already exists, only keep its lower (better) rank
                if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end())
                {
                    COCA_WordsList[whead].insert(make_pair(word, frequency));
                }
                else
                {
                    COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
                }
            }
        }
        readlines++;
    }

    endTime = clock();      //build time stop
    build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //user prompt
    cout << "COCA words list imported.\nPress any key to start frequency analysis...\n";
    cin.get();

    startTime = clock();    //process time start

    //find text words
    vector<char> content_read;
    CLetters word_readed;
    vector<int> frequecy_processed = { 0 };
    array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
    char char_read = ' ';

    //get text char one by one
    //NOTE(review): a word that ends exactly at EOF (no trailing separator)
    //is never flushed and therefore not counted -- TODO confirm/fix upstream
    while (USER_txt.get(char_read))
    {
        //only letters and '-' between letters will be received
        if (isaletter(char_read) || char_read == '-')
        {
            content_read.push_back(char_read);
        }
        else
        {
            //char which is not a letter marks the end of a word
            if (!content_read.empty())
            {
                int current_word_frequency = 0;

                //assign letters to make the word (lower-cased by fill)
                word_readed.fill(content_read);
                word_readed.processing();
                cout << "Frequency:";

                //check the word's head and find its frequency in COCA list
                for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
                {
                    if (word_readed.firstletter() == alphabet_str[whead])
                    {
                        //NOTE: map::operator[] default-inserts rank 0 for
                        //unknown words; 0 is treated as "not found" below
                        cout << COCA_WordsList[whead][word_readed.word()];
                        current_word_frequency = COCA_WordsList[whead][word_readed.word()];

                        if (current_word_frequency == 0)
                        {
                            //not found: re-check common words, then stripped forms
                            if (word_readed.usual_recheck())
                            {
                                word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
                            }
                            else if (word_readed.form_recheck())
                            {
                                //try again with the -s/-ed/-ing suffix removed
                                current_word_frequency = COCA_WordsList[whead][word_readed.word()];
                                if (current_word_frequency > 0)
                                {
                                    frequecy_processed.push_back(current_word_frequency);
                                    word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                                }
                            }
                            else
                            {
                                word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                            }
                        }
                        else if (find(frequecy_processed.begin(), frequecy_processed.end(), current_word_frequency)
                                 == frequecy_processed.end())
                        {
                            //first time this rank is seen: classify and count
                            //NOTE(review): de-duplication is by rank, so two
                            //different words sharing one COCA rank are only
                            //counted once -- TODO confirm intended
                            frequecy_processed.push_back(current_word_frequency);
                            word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                        }
                    }
                }
                cout << endl;
                content_read.clear();
            }
        }
    }

    endTime = clock();      //process time stop
    process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //calc whole words processed
    int whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

    //report result
    cout << "\n////////// Report ////////// \n";
    for (int i = 0; i < (int)words_analysis_array.size(); i++)
    {
        cout << report_str[i] << "\t" << words_analysis_array[i] << " (";
        cout << fixed << setprecision(2) << (float)words_analysis_array[i] * 100 / whole_words_analyzed << "%)" << endl;
    }
    cout << "\nWords totally analyzed: " << whole_words_analyzed << endl;

    //show run time
    cout << "Map build time: " << build_map_time * 1000 << "ms.\n";
    cout << "Process time: " << process_time * 1000 << "ms.\n";
    cout << "////////////////////////////" << endl;

    //close file
    COCA_txt.close();
    USER_txt.close();

    return 0;
}
基于COCA词频表的文本词汇分布测试工具v0.2的更多相关文章
- 基于COCA词频表的文本词汇分布测试工具v0.1
美国语言协会对美国人日常使用的英语单词做了一份详细的统计,按照日常使用的频率做成了一张表,称为COCA词频表.排名越低的单词使用频率越高,该表可以用来统计词汇量. 如果你的词汇量约为6000,那么这张 ...
- 基于Text-CNN模型的中文文本分类实战 流川枫 发表于AI星球订阅
Text-CNN 1.文本分类 转眼学生生涯就结束了,在家待就业期间正好有一段空闲期,可以对曾经感兴趣的一些知识点进行总结. 本文介绍NLP中文本分类任务中核心流程进行了系统的介绍,文末给出一个基于T ...
- 基于Text-CNN模型的中文文本分类实战
Text-CNN 1.文本分类 转眼学生生涯就结束了,在家待就业期间正好有一段空闲期,可以对曾经感兴趣的一些知识点进行总结. 本文介绍NLP中文本分类任务中核心流程进行了系统的介绍,文末给出一个基于T ...
- 基于jquery的bootstrap在线文本编辑器插件Summernote
Summernote是一个基于jquery的bootstrap超级简单WYSIWYG在线编辑器.Summernote非常的轻量级,大小只有30KB,支持Safari,Chrome,Firefox.Op ...
- Chinese-Text-Classification,用卷积神经网络基于 Tensorflow 实现的中文文本分类。
用卷积神经网络基于 Tensorflow 实现的中文文本分类 项目地址: https://github.com/fendouai/Chinese-Text-Classification 欢迎提问:ht ...
- Android版数据结构与算法(四):基于哈希表实现HashMap核心源码彻底分析
版权声明:本文出自汪磊的博客,未经作者允许禁止转载. 存储键值对我们首先想到HashMap,它的底层基于哈希表,采用数组存储数据,使用链表来解决哈希碰撞,它是线程不安全的,并且存储的key只能有一个为 ...
- HDFS的快照原理和Hbase基于快照的表修复
前一篇文章<HDFS和Hbase误删数据恢复>主要讲了hdfs的回收站机制和Hbase的删除策略.根据hbase的删除策略进行hbase的数据表恢复.本文主要介绍了hdfs的快照原理和根据 ...
- js语言评价--js 基于哈希表、原型链、作用域、属性类型可配置的多范式编程语言
js 基于哈希表.原型链.作用域.属性类型可配置的多范式编程语言 值类型.引用类型.直接赋值: 原型是以对象形式存在的类型信息. ECMA-262把对象定义为:无序属性的集合,其属性可以包含基本值,对 ...
- mysql中【update/Delete】update中无法用基于被更新表的子查询,You can't specify target table 'test1' for update in FROM clause.
关键词:mysql update,mysql delete update中无法用基于被更新表的子查询,You can't specify target table 'test1' for update ...
随机推荐
- Unity3D中可重载虚函数的总结
重载虚函数:Unity3D中所有控制脚本的基类MonoBehaviour有一些虚函数用于绘制中事件的回调,也可以直接理解为事件函数,例如大家都很清楚的Start,Update等函数,以下做个总结. A ...
- 面试【JAVA基础】多线程
本次整理的内容如下: 1.进程与线程的区别 进程是一个可执行的程序,是系统资源分配的基本单位:线程是进程内相对独立的可执行单元,是操作系统进行任务调度的基本单位. 2.进程间的通信方式 2.1.操作系 ...
- Prometheus监控神器-服务发现篇(二)
本章节讲解服务发现与Relabelling的机制与范例. 通过服务发现的方式,我们可以在不重启Prometheus服务的情况下动态的发现需要监控的Target实例信息. 如上图所示,对于线上环境我们可 ...
- about blog
前言 今天无意中发现了一个小姐姐自己设计的的博客,感觉非常的nice,就随手copy一下,完了感觉效果还蛮好的 end 附上小姐姐的博客以及教程
- ARM函数调用总结
ARM架构寄存器介绍 ARM架构下处理器有7种工作模式: 1. USR模式:正常用户模式,在USR模式下进程正常执行 2. FIQ模式(Fast Interrupt Request):处理快速中断模式 ...
- CSS居中的多种方法
1.水平居中:text-align 与 inline-block 的配合 <div id = "div_center_align"> <div id = &quo ...
- 常用Linux Shell命令,了解一下!
目录 1 前言 2 正文 2.1 关机/重启 2.2 echo 2.3 vim文本编辑器 2.3.1 最基本用法 2.3.2 常用快捷键 2.3.3 查找/替换 2.4 拷贝/删除/移动/重命名 2. ...
- Go Http包解析:为什么需要response.Body.Close()
简单来讲就是:为了提高效率,http.Get 等请求的 TCP 连接是不会关闭的(再次向同一个域名请求时,复用连接),所以必须要手动关闭. 2019-01-24 10:43:32 更新 不管是否使用 ...
- List和Dictionary的使用技巧总结
List和Dictionary想必是我们平常用到最多的C#容器了,他们使用起来都很简单,所以很多人就可能就没去深究,其实在使用过程中有很多的小技巧能让我们写的代码变得更高效也更安全. 1·合理的指定初 ...
- oracle之dblink
当用户要跨本地Oracle数据库,访问另外一个数据库表中的数据时,本地数据库中必须创建了远程数据库的dblink,通过dblink本地数据库可以像访问本地数据库一样访问远程数据库表中的数据.下面讲介绍 ...