update:

  • 简单整理了一下代码的组织。
  • 处理的单词封装成类,单词的修正、信息的显示都作为类内的方法。

写得还比较糙,工具本身可以封装,还有对于单词的变形基本没什么处理,以后有时间再改。

项目托管到github上了。https://github.com/MorpheusDong/TextVocabularyAnalyzer

TypeDefine.h

#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <iostream>
#include <fstream>
#include <string>
#include <array>
#include <vector>
#include <iterator>
#include <map>

using namespace std;

// Number of entries read from the COCA word-frequency list file.
#define COCA_WORDS_NUM 20201U

// One lookup bucket per initial letter 'a'..'z'.
#define WORDS_HEAD_NUM 26U

// Bucket indices for each initial letter.
#define WORDS_HEAD_A 0U
#define WORDS_HEAD_B 1U
#define WORDS_HEAD_C 2U
#define WORDS_HEAD_D 3U
#define WORDS_HEAD_E 4U
#define WORDS_HEAD_F 5U
#define WORDS_HEAD_G 6U
#define WORDS_HEAD_H 7U
#define WORDS_HEAD_I 8U
#define WORDS_HEAD_J 9U
#define WORDS_HEAD_K 10U
#define WORDS_HEAD_L 11U
#define WORDS_HEAD_M 12U
#define WORDS_HEAD_N 13U
#define WORDS_HEAD_O 14U
#define WORDS_HEAD_P 15U
#define WORDS_HEAD_Q 16U
#define WORDS_HEAD_R 17U
#define WORDS_HEAD_S 18U
#define WORDS_HEAD_T 19U
#define WORDS_HEAD_U 20U
#define WORDS_HEAD_V 21U
#define WORDS_HEAD_W 22U
#define WORDS_HEAD_X 23U
#define WORDS_HEAD_Y 24U
#define WORDS_HEAD_Z 25U

// Size of the usual_w_out_of_COCA_str table below.
#define USUAL_WORD_NUM 17U

// Frequency-rank bands a word can fall into.  The enumerators double
// as indices into the per-band counter array, so their order matters;
// WORD_LEVEL_NUM is the array size, not a real band.
typedef enum WordFrequencyType
{
    WORD_UNDER_4000 = 0,
    WORD_4000_6000,
    WORD_6000_8000,
    WORD_8000_10000,
    WORD_10000_12000,
    WORD_12000_14000,
    WORD_14000_16000,
    WORD_OVER_16000,
    WORD_NOT_FOUND_COCA,
    WORD_LEVEL_NUM
}TagWordFrequencyType;

// All lower-case initials, used to map a word's first letter to a bucket.
const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

// One report label per band, indexed by WordFrequencyType.
const string report_str[WORD_LEVEL_NUM] = {
    "UNDER 4000: ",
    "4000-6000: ",
    "6000-8000: ",
    "8000-10000: ",
    "10000-12000: ",
    "12000-14000: ",
    "14000-16000: ",
    "16000-20000+: ",
    "\nNot found in COCA:"
};

// Usual words (mostly inflected/contracted forms) not listed in COCA.
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
{
    "s","is","are","re","was","were",
    "an","won","t","has","had","been",
    "did","does","cannot","got","men"
};

#endif

TextVocabularyAnalyzer.h

#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_ #include "TypeDefine.h" extern TagWordFrequencyType frequency_classify(const int wfrq);
extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
extern bool isaletter(const char& c); class CLetters
{
private:
string m_word; public:
CLetters();
~CLetters();
void fill(vector<char>& vw);
const string word();
const char firstletter();
void processing();
bool usual_recheck();
bool form_recheck();
}; #endif // !_TEXT_VOCABULARY_ANALYZER_H_

TextVocabularyAnalyzer.cpp

/* TextVocabularyAnalyzer.cpp */

#include <algorithm>
#include <cctype>
#include "TextVocabularyAnalyzer.h"

// Maps a COCA frequency rank to its band.
// Rank 0 means "not in the list"; any non-positive value is treated the
// same way (the original let negative values fall through to
// WORD_OVER_16000, which misclassified bad input).
TagWordFrequencyType frequency_classify(const int wfrq)
{
    if (wfrq <= 0)
    {
        return WORD_NOT_FOUND_COCA;
    }
    else if (wfrq <= 4000)
    {
        return WORD_UNDER_4000;
    }
    else if (wfrq <= 6000)
    {
        return WORD_4000_6000;
    }
    else if (wfrq <= 8000)
    {
        return WORD_6000_8000;
    }
    else if (wfrq <= 10000)
    {
        return WORD_8000_10000;
    }
    else if (wfrq <= 12000)
    {
        return WORD_10000_12000;
    }
    else if (wfrq <= 14000)
    {
        return WORD_12000_14000;
    }
    else if (wfrq <= 16000)
    {
        return WORD_14000_16000;
    }
    else
    {
        return WORD_OVER_16000;
    }
}

// Adds one hit for wfrq_tag to the counter array.  The enum value is the
// index, so the original 40-line switch (every case doing the same
// increment) collapses to a bounds-checked increment; anything outside
// the real bands counts as "not found in COCA", as before.
void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
    if (wfrq_tag >= WORD_UNDER_4000 && wfrq_tag <= WORD_OVER_16000)
    {
        wfrq_array[wfrq_tag] += 1;
    }
    else
    {
        wfrq_array[WORD_NOT_FOUND_COCA] += 1;
    }
}

// True when c is an ASCII letter of either case.
bool isaletter(const char& c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

// Class CLetters realization

CLetters::CLetters()
{
    m_word = "";
}

CLetters::~CLetters()
{
    // do nothing
}

// Builds the word from the raw characters collected by the caller and
// normalizes it to lower case.  The unsigned char cast keeps tolower()
// well-defined when a char value is negative (non-ASCII bytes).
void CLetters::fill(vector<char>& vw)
{
    m_word.assign(vw.begin(), vw.end());
    transform(m_word.begin(), m_word.end(), m_word.begin(),
              [](unsigned char ch) { return static_cast<char>(tolower(ch)); });
}

// Returns the current (possibly suffix-stripped) spelling.
const string CLetters::word()
{
    return m_word;
}

// Returns the first character, used to pick the lookup bucket.
const char CLetters::firstletter()
{
    return m_word[0];
}

// Prints the progress banner for the word being looked up.
void CLetters::processing()
{
    cout << "Finding word \"" << m_word << "\"...\t";
}

// True when the word is one of the common forms known to be absent
// from the COCA list (see usual_w_out_of_COCA_str).
bool CLetters::usual_recheck()
{
    for (int i = 0; i < USUAL_WORD_NUM; i++)
    {
        if (m_word == usual_w_out_of_COCA_str[i])
        {
            return true;
        }
    }
    return false;
}

// Strips one simple inflection suffix (-s, -ed, -ing) in place so the
// base form can be looked up again; returns true when a suffix was cut.
// NOTE(review): deliberately crude, as in the original — e.g. "hoped"
// becomes "hop", not "hope".
bool CLetters::form_recheck()
{
    bool RetVal = false;
    const size_t len = m_word.length();
    if (len > 3)
    {
        if (m_word[len - 1] == 's')
        {
            m_word.erase(len - 1);    // drop trailing "s"
            RetVal = true;
        }
        else if (m_word.compare(len - 2, 2, "ed") == 0)
        {
            m_word.erase(len - 2);    // drop trailing "ed"
            RetVal = true;
        }
        else if (m_word.compare(len - 3, 3, "ing") == 0)
        {
            m_word.erase(len - 3);    // drop trailing "ing"
            RetVal = true;
        }
    }
    return RetVal;
}

main.cpp

/* main.cpp */

#include <algorithm>
#include <cctype>
#include <numeric>
#include <iomanip>
#include <ctime>
#include "TextVocabularyAnalyzer.h"

int main()
{
    // file init
    ifstream COCA_txt("D:\\COCA.txt");
    ifstream USER_txt("D:\\JobsSpeech.txt");
    if (!COCA_txt.is_open() || !USER_txt.is_open())
    {
        // BUG FIX: the streams were never checked, so a missing file
        // silently produced an empty (and wrong) report
        cout << "Failed to open COCA list or input text file." << endl;
        return 1;
    }

    // time init
    clock_t startTime, endTime;
    double build_map_time = 0;
    double process_time = 0;

    startTime = clock();    // build time start

    // build COCA words map, one bucket per initial letter
    map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
    for (int readlines = 0; readlines < COCA_WORDS_NUM; readlines++)
    {
        int frequency = 0;
        string word = "";
        COCA_txt >> frequency;
        COCA_txt >> word;

        // transform to lower uniformly; the unsigned char cast keeps
        // tolower() well-defined for negative char values
        transform(word.begin(), word.end(), word.begin(),
                  [](unsigned char ch) { return static_cast<char>(tolower(ch)); });

        const size_t whead = word.empty() ? string::npos : alphabet_str.find(word[0]);
        if (whead != string::npos)
        {
            // if a word already exists, only keep its lower frequency
            auto it = COCA_WordsList[whead].find(word);
            if (it == COCA_WordsList[whead].end())
            {
                COCA_WordsList[whead].insert(make_pair(word, frequency));
            }
            else if (frequency < it->second)
            {
                it->second = frequency;
            }
        }
    }

    endTime = clock();    // build time stop
    build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    // user prompt
    cout << "COCA words list imported.\nPress any key to start frequency analysis...\n";
    cin.get();

    startTime = clock();    // process time start

    // find text words
    vector<char> content_read;
    CLetters word_read;
    vector<int> frequency_processed = { 0 };    // COCA ranks already counted (a rank is unique per word)
    array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };

    // Classifies the word currently buffered in content_read, updates
    // the statistics, and clears the buffer.  Hoisted into a lambda so
    // the end-of-file case can reuse it.
    auto process_word = [&]() {
        if (content_read.empty())
        {
            return;
        }
        word_read.fill(content_read);
        word_read.processing();
        cout << "Frequency:";

        const size_t whead = alphabet_str.find(word_read.firstletter());
        if (whead != string::npos)
        {
            // BUG FIX: the original looked words up with operator[],
            // which inserts a bogus zero-frequency entry into the COCA
            // map for every unknown word; find() leaves the map intact
            auto lookup = [&](const string& w) -> int {
                auto it = COCA_WordsList[whead].find(w);
                return (it != COCA_WordsList[whead].end()) ? it->second : 0;
            };

            int current_word_frequency = lookup(word_read.word());
            cout << current_word_frequency;

            if (current_word_frequency == 0)
            {
                // additional checks for words missing from COCA
                if (word_read.usual_recheck())
                {
                    word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
                }
                else if (word_read.form_recheck())
                {
                    current_word_frequency = lookup(word_read.word());    // try again with the suffix stripped
                    if (current_word_frequency > 0)
                    {
                        frequency_processed.push_back(current_word_frequency);
                        word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                    }
                    else
                    {
                        // BUG FIX: such a word was silently dropped from
                        // every counter before, under-reporting the total
                        word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                    }
                }
                else
                {
                    word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                }
            }
            else if (find(frequency_processed.begin(), frequency_processed.end(), current_word_frequency)
                     == frequency_processed.end())
            {
                // first time this rank appears: classify and record it
                frequency_processed.push_back(current_word_frequency);
                word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
            }
        }
        cout << endl;
        content_read.clear();
    };

    // get text chars one by one
    char char_read = ' ';
    while (USER_txt.get(char_read))
    {
        // only letters and '-' inside a word are collected;
        // any other char marks the end of the current word
        if (isaletter(char_read) || char_read == '-')
        {
            content_read.push_back(char_read);
        }
        else
        {
            process_word();
        }
    }
    process_word();    // BUG FIX: a word ending exactly at EOF was lost before

    endTime = clock();    // process time stop
    process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    // calc whole words processed
    const int whole_words_analyzed =
        accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

    // report result (guard the percentage against division by zero)
    cout << "\n////////// Report ////////// \n";
    for (size_t i = 0; i < words_analysis_array.size(); i++)
    {
        const float percent = (whole_words_analyzed > 0)
            ? (float)words_analysis_array[i] * 100 / whole_words_analyzed
            : 0.0f;
        cout << report_str[i] << "\t" << words_analysis_array[i] << " (";
        cout << fixed << setprecision(2) << percent << "%)" << endl;
    }
    cout << "\nWords totally analyzed: " << whole_words_analyzed << endl;

    // show run time
    cout << "Map build time: " << build_map_time * 1000 << "ms.\n";
    cout << "Process time: " << process_time * 1000 << "ms.\n";
    cout << "////////////////////////////" << endl;

    // close file
    COCA_txt.close();
    USER_txt.close();
    return 0;
}

基于COCA词频表的文本词汇分布测试工具v0.2的更多相关文章

  1. 基于COCA词频表的文本词汇分布测试工具v0.1

    美国语言协会对美国人日常使用的英语单词做了一份详细的统计,按照日常使用的频率做成了一张表,称为COCA词频表.排名越低的单词使用频率越高,该表可以用来统计词汇量. 如果你的词汇量约为6000,那么这张 ...

  2. 基于Text-CNN模型的中文文本分类实战 流川枫 发表于AI星球订阅

    Text-CNN 1.文本分类 转眼学生生涯就结束了,在家待就业期间正好有一段空闲期,可以对曾经感兴趣的一些知识点进行总结. 本文介绍NLP中文本分类任务中核心流程进行了系统的介绍,文末给出一个基于T ...

  3. 基于Text-CNN模型的中文文本分类实战

    Text-CNN 1.文本分类 转眼学生生涯就结束了,在家待就业期间正好有一段空闲期,可以对曾经感兴趣的一些知识点进行总结. 本文介绍NLP中文本分类任务中核心流程进行了系统的介绍,文末给出一个基于T ...

  4. 基于jquery的bootstrap在线文本编辑器插件Summernote

    Summernote是一个基于jquery的bootstrap超级简单WYSIWYG在线编辑器.Summernote非常的轻量级,大小只有30KB,支持Safari,Chrome,Firefox.Op ...

  5. Chinese-Text-Classification,用卷积神经网络基于 Tensorflow 实现的中文文本分类。

    用卷积神经网络基于 Tensorflow 实现的中文文本分类 项目地址: https://github.com/fendouai/Chinese-Text-Classification 欢迎提问:ht ...

  6. Android版数据结构与算法(四):基于哈希表实现HashMap核心源码彻底分析

    版权声明:本文出自汪磊的博客,未经作者允许禁止转载. 存储键值对我们首先想到HashMap,它的底层基于哈希表,采用数组存储数据,使用链表来解决哈希碰撞,它是线程不安全的,并且存储的key只能有一个为 ...

  7. HDFS的快照原理和Hbase基于快照的表修复

    前一篇文章<HDFS和Hbase误删数据恢复>主要讲了hdfs的回收站机制和Hbase的删除策略.根据hbase的删除策略进行hbase的数据表恢复.本文主要介绍了hdfs的快照原理和根据 ...

  8. js语言评价--js 基于哈希表、原型链、作用域、属性类型可配置的多范式编程语言

    js 基于哈希表.原型链.作用域.属性类型可配置的多范式编程语言 值类型.引用类型.直接赋值: 原型是以对象形式存在的类型信息. ECMA-262把对象定义为:无序属性的集合,其属性可以包含基本值,对 ...

  9. mysql中【update/Delete】update中无法用基于被更新表的子查询,You can't specify target table 'test1' for update in FROM clause.

    关键词:mysql update,mysql delete update中无法用基于被更新表的子查询,You can't specify target table 'test1' for update ...

随机推荐

  1. JDBC的架构设计

    本文探讨JDBC需要解决的问题及如何解决和设计的,包括: JDBC要解决的问题 数据库事务 JDBC的架构设计 JDBC代码注意点 Spring是如何处理事务 什么是事务的传播特性 Redis事务与数 ...

  2. 解决 SQL 注入和 XSS 攻击(Node.js 项目中)

    1.SQL 注入 SQL 注入,一般是通过把 SQL 命令插入到 Web 表单提交或输入域名或页面请求的查询字符串,最终达到欺骗服务器执行恶意的 SQL 命令. SQL 注入示例 在登录界面,后端会根 ...

  3. SpringBoot系列之从入门到精通系列教程

    对应SpringBoot系列博客专栏,例子代码,本博客不定时更新 Spring框架:作为JavaEE框架领域的一款重要的开源框架,在企业应用开发中有着很重要的作用,同时Spring框架及其子框架很多, ...

  4. Macos 编译运行调试Mysql源代码

    准备编译工具Clion 下载地址 工具是macos用的系统 百度云盘下载地址(密码: 7dus) 下载mysql源码 Mysql源码下载地址 下载boost boost下载地址 前期准备工作 MySQ ...

  5. js map对象处理if

    onButtonClick只有一个参数时候,map和object对象都可以 // onButtonClick1(3) onButtonClick只有一个参数时候,map和object对象都可以 con ...

  6. Agumaster添加股票日交易爬虫画面

  7. JAVA 去除实体中类型为string的属性值中的空格

    前端传入的参数实体中,有时候会出现传入了一空格,导致操作失败,这时就可以利用java反射机制去除实体中类型为sting的属性值中的空格. java代码示例: package com.spyang.ut ...

  8. Spring IoC 到底是什么

    前言 「上一篇文章」我们对 Spring 有了初步的认识,而 Spring 全家桶中几乎所有组件都是依赖于 IoC 的. 刚开始听到 IoC,会觉得特别高大上,但其实掰开了很简单. 跟着我的脚步,一文 ...

  9. jsop之---实现过程

    JSONP(JSONP - JSON with Padding是JSON的一种“使用模式”),利用script标签的src属性(浏览器允许script标签跨域) 跨域访问,非同源访问 <!DOC ...

  10. 工具类-Fastjson入门使用

    简介 什么是Fastjson? fastjson是阿里巴巴的开源JSON解析库,它可以解析JSON格式的字符串,支持将Java Bean序列化为JSON字符串,也可以从JSON字符串反序列化到Java ...