Since carrot2's handling of Chinese is quite unreliable, I drew on some material from around the web and am now contributing all of the code here.

The idea behind the code is to measure how often each character or word occurs, score the terms, and then pick out the important vocabulary by occurrence count and importance. The usable code is posted below.
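
To make the idea concrete before the full listings: each topic word (tag) is mapped to a bit set of the documents it appears in, the overlap between two bit sets measures how related two tags are, and strongly related tags are merged into one cluster. Here is a minimal, self-contained sketch of that core idea; it uses java.util.BitSet and made-up sample data, while the real code below packs the bits into long[] arrays instead.

import java.util.BitSet;

public class TagOverlapSketch {
    public static void main(String[] args) {
        // Tag "A" occurs in documents 0, 1, 2; tag "B" in documents 1, 2, 3.
        BitSet docsA = new BitSet();
        docsA.set(0); docsA.set(1); docsA.set(2);
        BitSet docsB = new BitSet();
        docsB.set(1); docsB.set(2); docsB.set(3);

        // Shared documents (intersection) and combined documents (union).
        BitSet same = (BitSet) docsA.clone();
        same.and(docsB);
        BitSet merged = (BitSet) docsA.clone();
        merged.or(docsB);

        // The more documents two tags share relative to their union,
        // the more likely they belong in the same cluster.
        int samePercent = Math.round(100f * same.cardinality() / merged.cardinality());
        System.out.println("shared=" + same.cardinality() + ", samePercent=" + samePercent);
    }
}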

ClusterBuilder.java

/** 
*
* @author
* @version Created: 2011-03-08 02:02:36 PM
* Cluster builder
*/
public class ClusterBuilder {
private static final Log LOG;
private List<DocCluster> clusters;
private ICTHit[] docs;
private int maxLevels;
private ClusteringOptions[] options;
private boolean useTagsAsTitle;
private String wordsExcluded;
private static short[] bit1Table;

static {
LOG = LogFactory.getLog(ClusterBuilder.class.getName());
bit1Table = new short[65536];
for (int n = 0; n < bit1Table.length; n++) {
String s = Integer.toBinaryString(n);
short m = 0;
for (int k = 0; k < s.length(); k++) {
if (s.charAt(k) == '1') {
m = (short) (m + 1);
}
}
bit1Table[n] = m;
}
}

private static int getValidBitCount(long n) {
int i3 = (int) (n % 65536L);
n /= 65536L;
int i2 = (int) (n % 65536L);
n /= 65536L;
int i1 = (int) (n % 65536L);
n /= 65536L;
int i0 = (int) (n % 65536L);
return bit1Table[i0] + bit1Table[i1] + bit1Table[i2] + bit1Table[i3];
}

private static int getDocHitCount(long[] hits) {
assert (hits != null);
if (hits == null)
return 0;
int n0 = 0;
for (int i = 0; i < hits.length; i++) {
n0 += getValidBitCount(hits[i]);
}
return n0;
}

public ClusterBuilder() {
// Rebuilds bit1Table; this duplicates the static initializer above.
for (int n = 0; n < bit1Table.length; n++) {
String s = Integer.toBinaryString(n);
short m = 0;
for (int k = 0; k < s.length(); k++) {
if (s.charAt(k) == '1') {
m = (short) (m + 1);
}
}
bit1Table[n] = m;
}
}
/**
*
* @param docsToCluster the list of records to cluster
* @param exWords topic words to exclude, separated by commas; these words will never be used as topic words
* @param maxLevels the maximum number of clustering levels
* @param useTagsAsTitle whether to use topic words as the cluster title; if false, the title is generated from the document titles
*/
public ClusterBuilder(ICTHit[] docsToCluster, String exWords, int maxLevels, boolean useTagsAsTitle) {
this.useTagsAsTitle = useTagsAsTitle;
this.wordsExcluded = exWords;
this.maxLevels = maxLevels;
this.docs = docsToCluster;
this.options = new ClusteringOptions[3];
this.options[0] = new ClusteringOptions();
this.options[0].setDocMaxTagCount(10);
this.options[0].setMinTagRelevance(60);
this.options[0].setMinSameDocPercent(80);
this.options[1] = new ClusteringOptions();
this.options[1].setDocMaxTagCount(8);
this.options[1].setMinTagRelevance(85);
this.options[1].setMinSameDocPercent(70);
this.options[1].setTagMinDocCount(2);
this.options[1].setMinSameDocs(2);
this.options[2] = new ClusteringOptions();
this.options[2].setDocMaxTagCount(8);
this.options[2].setMinTagRelevance(50);
this.options[2].setMinSameDocPercent(70);
this.options[2].setTagMinDocCount(2);
this.options[2].setMinSameDocs(2);
}
/**
* Clusters the records in Docs; the results are stored in Clusters.
*/
public void cluster() {
this.clusters = createLevelClusters(docs, 0, options[0]);
List<DocCluster> subs = null;
if (this.maxLevels <= 1) {
return;
}
for (DocCluster dc : this.clusters) {
// Skip clusters too small to subdivide and the catch-all "其他" (other) cluster.
if ((dc.getDocList().length < options[0].getMinDocsToCluster()) || ("其他".equals(dc.getTags())))
continue;
subs = createLevelClusters(dc.getDocList(), 1, options[1]);
if (subs.size() > 1)
dc.setSubclusters(subs);
}
}
/**
* Builds the clusters for one level.
* @param docs the document list
* @param level the level number
* @param levelOpt the clustering options for this level
* @return
*/
private List<DocCluster> createLevelClusters(ICTHit[] docs, int level, ClusteringOptions levelOpt) {
TagHitMatrix matrix = new TagHitMatrix(docs.length, levelOpt.getDocMaxTagCount());
List clusters = new ArrayList();
int i, ValidTagCount;
int DocCount = 0;
// Scan the document list and initialize the tag-to-document matrix from each document's tag list.
for (i = 0; i < docs.length; i++) {
ICTHit d = docs[i];
int validTagCount = 0;
if (d.getTagList() != null) {
String[] tagList = d.getTagList();
for (int tagIdx = 0; (tagIdx < tagList.length) && (validTagCount < levelOpt.getDocMaxTagCount()); tagIdx++) {
String tag = tagList[tagIdx].trim();
// Discard empty tags, tags longer than 20 characters, and tags on the excluded-word list.
if ((tag.length() <= 0)
|| (tag.length() > 20)
|| ((this.wordsExcluded.length() != 0) && ((tag.contains(this.wordsExcluded)) || (this.wordsExcluded
.contains(tag)))))
continue;
matrix.AddDocHit(tag, i);
validTagCount++;
}
}
}
int maxKwDocCount = 0;
List entryListToRemove = new ArrayList();
String kwWithMaxDocCount = "";
LOG.debug("有效关键词:");
for (Map.Entry entry : matrix.entrySet()) {
// Count the documents hit by the current tag; tags hitting fewer documents than the threshold are dropped.
int n = getDocHitCount((long[]) entry.getValue());
if (n < levelOpt.getTagMinDocCount()) {
entryListToRemove.add((String) entry.getKey());
} else {
LOG.debug((String) entry.getKey() + "(" + n + "), ");
DocCount += n;
}
if (n > maxKwDocCount) {
maxKwDocCount = n;
kwWithMaxDocCount = (String) entry.getKey();
}
}
LOG.debug("");
LOG.debug("被忽略的关键词:");
for (i = 0; i < entryListToRemove.size(); i++) {
LOG.debug((String) entryListToRemove.get(i) + ", ");
matrix.remove(entryListToRemove.get(i));
}
LOG.debug("");
LOG.debug(entryListToRemove.size() + "个关键词被忽略。剩余" + matrix.size() + "个关键词。");
LOG.debug("最大文档数的关键词:" + kwWithMaxDocCount + ",文档数:" + maxKwDocCount + "。");
// Average number of documents per tag, used to derive the shared-document threshold for this level.
double docCountPerTag = matrix.size() > 0 ? (double) DocCount / matrix.size() : 0.0D;
LOG.debug("关键词平均文档数:" + docCountPerTag);
levelOpt.setMinSameDocs((int) (docCountPerTag / (2.0D + level)));
if (levelOpt.getMinSameDocs() < 1) {
levelOpt.setMinSameDocs(1);
}
while (mergeClusters(matrix, levelOpt) > 0) {
}
return createResult(matrix, docs, level, levelOpt);
}

private int mergeClusters(TagHitMatrix matrix, ClusteringOptions opt) {
if (matrix.size() == 0)
return 0;
long[] docHitsMerged = null;
long[] maxDocHitsMerged = null;
String word1 = "";
String word2 = "";
String word1ToMerge = "";
String word2ToMerge = "";
int i,j;
int sameDocs = 0;
// Initialize a relevance bucket list with 101 entries, one for each score from 0 to 100.
List rankMatrix = new ArrayList();
for (i = 0; i < 101; i++) {
rankMatrix.add(new ArrayList());
}
List matrix2List = new ArrayList();
matrix2List.addAll(matrix.entrySet());
// Compare every pair of tags in the tag-to-document matrix.
for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
Map.Entry hits1 = (Map.Entry) matrix2List.get(i1);
word1 = (String) hits1.getKey();
for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
Map.Entry hits2 = (Map.Entry) matrix2List.get(i2);
word2 = (String) hits2.getKey();
Object[] re = getWordsRelevance(mapEntry2TagHitEntry(hits1), mapEntry2TagHitEntry(hits2),
docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
// Relevance of the two tags, their merged document set, and the number of documents they share.
int nRank = ((Integer) re[0]).intValue();
docHitsMerged = (long[]) re[1];
sameDocs = ((Integer) re[2]).intValue();
// Ignore pairs whose relevance is below the configured threshold.
if (nRank >= opt.getMinTagRelevance()) {
((List) rankMatrix.get(nRank)).add(new IdPair(i1, i2));
}
}
}
List tagListToRemove = new ArrayList();
List entryListMerged = new ArrayList();
entryListMerged.add(new TagHitEntry("", null));
HashSet idPairTable = new HashSet();
TagHitEntry entryToMerge1;
while (true) {
// Find the two tags with the highest relevance.
for (i = 100; (i >= opt.getMinTagRelevance()) && (((List) rankMatrix.get(i)).size() == 0); i--){};
if (i < opt.getMinTagRelevance()) {
break;
}
IdPair ip = (IdPair) ((List) rankMatrix.get(i)).get(0);
// Merge the two clusters.
((List) rankMatrix.get(i)).remove(0);
entryToMerge1 = ip.Id1 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id1))
: (TagHitEntry) entryListMerged.get(-ip.Id1);
TagHitEntry entryToMerge2 = ip.Id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id2))
: (TagHitEntry) entryListMerged.get(-ip.Id2);
word1ToMerge = entryToMerge1.key;
word2ToMerge = entryToMerge2.key;
assert ((word1ToMerge.length() > 0) && (word2ToMerge.length() > 0));
String wordsMerged = word1ToMerge + "," + word2ToMerge;
long[] lDocs0 = entryToMerge1.value;
long[] lDocs1 = entryToMerge2.value;
maxDocHitsMerged = new long[matrix.hitsItemCount];
for (i = 0; i < lDocs0.length; i++) {
maxDocHitsMerged[i] = lDocs0[i] | lDocs1[i]; // merged document set of the two tags
}
if (ip.Id1 >= 0)
tagListToRemove.add(word1ToMerge);
else
entryListMerged.set(-ip.Id1, new TagHitEntry("", null));
if (ip.Id2 >= 0)
tagListToRemove.add(word2ToMerge);
else {
entryListMerged.set(-ip.Id2, new TagHitEntry("", null));
}
entryListMerged.add(new TagHitEntry(wordsMerged, maxDocHitsMerged));
// Re-score the other tag pairs that referenced either of the merged tags.
int idMerged = -(entryListMerged.size() - 1);
int id2 = 0;
boolean CanDelete = false;
for (i = 0; i <= 100; i++) {
int ListCount = ((List) rankMatrix.get(i)).size();
if (ListCount == 0) {
continue;
}
for (j = 0; j < ListCount; j++) {
IdPair p = (IdPair) ((List) rankMatrix.get(i)).get(j);
CanDelete = false;
if ((ip.Id1 == p.Id1) || (ip.Id2 == p.Id1)) {
id2 = p.Id2;
CanDelete = true;
} else if ((ip.Id1 == p.Id2) || (ip.Id2 == p.Id2)) {
id2 = p.Id1;
CanDelete = true;
}
if (!CanDelete)
continue;
if (idMerged == id2) {
continue;
}
((List) rankMatrix.get(i)).remove(j);
j--;
ListCount--;
IdPair pairMerged = new IdPair(idMerged, id2);
if (idPairTable.contains(pairMerged)) {
continue;
}
TagHitEntry e2 = id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(id2))
: (TagHitEntry) entryListMerged.get(-id2);
assert ((e2.key.length() != 0) && (!e2.key.equals(wordsMerged)));
Object[] re = getWordsRelevance(new TagHitEntry(wordsMerged, maxDocHitsMerged), e2, docHitsMerged,
sameDocs, opt, matrix.hitsItemCount);
int rank = ((Integer) re[0]).intValue();
docHitsMerged = (long[]) re[1];
sameDocs = ((Integer) re[2]).intValue();
if (rank <= opt.getMinTagRelevance())
continue;
((List) rankMatrix.get(rank)).add(pairMerged);
idPairTable.add(pairMerged);
} } }
// Remove the tags that were merged away.
for (int m = 0; m < tagListToRemove.size(); m++) {
matrix.remove(tagListToRemove.get(m));
}
// Add the newly merged tags, skipping the empty placeholder entries.
for (int n = 0; n < entryListMerged.size(); n++) {
TagHitEntry e = (TagHitEntry) entryListMerged.get(n);
if (e.getKey().length() > 0)
matrix.put(e.getKey(), e.getValue());
}
return 0;
}

private int mergeClusters1(TagHitMatrix matrix, ClusteringOptions opt) {
if (matrix.size() == 0)
return 0;
long[] docHitsMerged = null;
long[] maxDocHitsMerged = null;
int nMaxRank = 0;
String word1 = "";
String word2 = "";
String word1ToMerge = "";
String word2ToMerge = "";
int sameDocs = 0;
List matrix2List = new ArrayList();
matrix2List.addAll(matrix.entrySet());
for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
TagHitEntry hits1 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i1));
word1 = hits1.getKey();
for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
TagHitEntry hits2 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i2));
word2 = hits2.getKey();
Object[] re = getWordsRelevance(hits1, hits2, docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
int nRank = ((Integer) re[0]).intValue();
docHitsMerged = (long[]) re[1];
sameDocs = ((Integer) re[2]).intValue();
if ((nRank <= nMaxRank) || (nRank <= opt.getMinTagRelevance()))
continue;
nMaxRank = nRank;
maxDocHitsMerged = docHitsMerged;
word1ToMerge = word1;
word2ToMerge = word2;
}
}
if ((word1ToMerge.length() == 0) || (word2ToMerge.length() == 0)) {
return 0;
}
String wordsMerged = word1ToMerge + "," + word2ToMerge;
if (nMaxRank > opt.getMinTagRelevance()) {
matrix.remove(word1ToMerge);
matrix.remove(word2ToMerge);
matrix.put(wordsMerged, maxDocHitsMerged);
LOG.debug("(" + word1ToMerge + ") - (" + word2ToMerge + ")");
return 1;
}
return 0;
}

private Object[] getWordsRelevance(TagHitEntry entry1, TagHitEntry entry2, long[] docHitsMerged, int sameDocCount,
ClusteringOptions opt, int hitsItemCount) {
Object[] re = new Object[3];
docHitsMerged = new long[hitsItemCount];
sameDocCount = 0;
String tag1 = entry1.getKey();
String tag2 = entry2.getKey();
assert (!tag2.equals(tag1));
long[] lDocs0 = entry1.getValue();
long[] lDocs1 = entry2.getValue();
int n0 = 0;
int n1 = 0;
n0 = getDocHitCount(lDocs0);
n1 = getDocHitCount(lDocs1);
int docCountMin = Math.min(n0, n1);
int docCountMax = Math.max(n0, n1);
int docCountMerged = 0;
long sameDocBits = 0L;
long diffDocBits = 0L;
int diffDocCount = 0;
for (int i = 0; i < lDocs0.length; i++) {
docHitsMerged[i] = lDocs0[i] | lDocs1[i]; // union: documents hit by either tag
docCountMerged += getValidBitCount(docHitsMerged[i]);
diffDocBits = lDocs0[i] ^ lDocs1[i]; // symmetric difference: documents hit by only one of the tags
diffDocCount += getValidBitCount(diffDocBits);
sameDocBits = lDocs0[i] & lDocs1[i]; // intersection: documents hit by both tags
sameDocCount += getValidBitCount(sameDocBits);
}
boolean IsSubstring = false;
// If one tag is a substring of the other, the pair gets a higher score.
if ((tag2.contains(tag1)) || (tag1.contains(tag2))) {
IsSubstring = true;
docCountMin += opt.getTagMinDocCount();
}
if ((sameDocCount == 0) && (!IsSubstring)) {
re[0] = Integer.valueOf(0);
re[1] = docHitsMerged;
re[2] = Integer.valueOf(sameDocCount);
return re;
}
if (docCountMin < opt.getTagMinDocCount()) {
re[0] = Integer.valueOf(0);
re[1] = docHitsMerged;
re[2] = Integer.valueOf(sameDocCount);
return re;
}
int samePercent = (int) Math.round(sameDocCount * 100.0D / docCountMerged);
int samePercentMin = (int) Math.round(sameDocCount * 100.0D / docCountMin);
int diffPercent = (int) Math.round(diffDocCount * 100.0D / docCountMerged);
LOG.debug("相关性:" + tag1 + "(" + n0 + ")-(" + n1 + ")" + tag2);
LOG.debug(", SamePercent=" + samePercent);
LOG.debug(", SamePercentMin=" + samePercentMin);
LOG.debug(", DiffPercent=" + diffPercent);
int nRank;
if ((sameDocCount >= opt.getMinSameDocs())
&& ((docCountMin < 10) || (samePercentMin >= opt.getMinSameDocPercent()))) {
nRank = (int) Math.round((samePercentMin + samePercent) * 0.85D - diffPercent * 0.2D);
} else {
nRank = 0;
}
if (IsSubstring)
nRank += 80;
LOG.debug(", Rank=" + nRank);
re[0] = Integer.valueOf(Math.min(nRank, 100));
re[1] = docHitsMerged;
re[2] = Integer.valueOf(sameDocCount);
return re;
}

private TagHitEntry mapEntry2TagHitEntry(Map.Entry<String, long[]> e) {
return new TagHitEntry((String) e.getKey(), (long[]) e.getValue());
}

@SuppressWarnings("unchecked")
private List<DocCluster> createResult(TagHitMatrix matrix, ICTHit[] docs, int level, ClusteringOptions opt) {
int i,j;
Map<String,DocValue> clsIdList = new HashMap();
List ClassTitleList = new ArrayList();
for (Map.Entry de : matrix.entrySet()) {
DocValue dv = new DocValue();
clsIdList.put((String) de.getKey(), dv);
}
List<Integer> otherIdList = new ArrayList();
TagHitEntry maxTagHitEntry = new TagHitEntry();
int clsCount;
String tag;
// Determine which cluster each document belongs to.
for (i = 0; i < docs.length; i++) {
ICTHit d = docs[i];
TagHitMatrix.ClusterDocInfo di = matrix.docs[i];
assert (docs[i] != null);
int maxTagHit = 0;
clsCount = 0;
for (Map.Entry hits : matrix.entrySet()) {
int tagHitCount = 0;
int score = 0;
String clsWordListStr = "," + (String) hits.getKey() + ",";
// The document is assigned to the cluster whose tag set covers the most of the document's own tags.
for (j = 0; j < di.TagCount; j++) {
tag = di.TagList[j];
score = j < 3 ? 2 : 1;
assert (tag.length() > 0);
if (!clsWordListStr.contains("," + tag + ","))
continue;
tagHitCount += score;
clsCount++;
}
if (maxTagHit >= tagHitCount)
continue;
maxTagHit = tagHitCount;
maxTagHitEntry = mapEntry2TagHitEntry(hits);
}
if (maxTagHit > 0) {
DocValue dv = (DocValue) clsIdList.get(maxTagHitEntry.getKey());
dv.idList.add(Integer.valueOf(i));
} else {
otherIdList.add(Integer.valueOf(i));
} }
// Build the cluster list.
List<DocCluster> clusterList = new ArrayList();
String[] TagList;
Object dc;
for (Map.Entry<String,DocValue> kv : clsIdList.entrySet()) {
DocValue dv = (DocValue) kv.getValue();
if (dv.idList.size() <= 0)
continue;
if (dv.idList.size() == 1) {
otherIdList.add((Integer) dv.idList.get(0));
} else {
dc = new DocCluster();
((DocCluster) dc).setDocIdList(new String[dv.idList.size()]);
((DocCluster) dc).setDocList(new ICTHit[dv.idList.size()]);
for (i = 0; i < dv.idList.size(); i++) {
((DocCluster) dc).getDocIdList()[i] = docs[((Integer) dv.idList.get(i)).intValue()].getDocId();
((DocCluster) dc).getDocList()[i] = docs[((Integer) dv.idList.get(i)).intValue()];
}
((DocCluster) dc).setLevel(level);
((DocCluster) dc).setTags((String) kv.getKey());
// Insert the cluster so that the list stays sorted by descending document count.
for (i = 0; (i < clusterList.size())
&& (((DocCluster) dc).getDocIdList().length <= ((DocCluster) clusterList.get(i)).getDocIdList().length);) {
i++;
}
clusterList.add(i, (DocCluster) dc);
}
}
// Clusters beyond the maximum count are folded into the "other" group.
for (i = opt.getMaxClusterCount(); i < clusterList.size();) {
DocCluster c = (DocCluster) clusterList.get(i);
List idList = ((DocValue) clsIdList.get(c.getTags())).idList;
for (dc = idList.iterator(); ((Iterator) dc).hasNext();) {
int idx = ((Integer) ((Iterator) dc).next()).intValue();
otherIdList.add(Integer.valueOf(idx));
}
clusterList.remove(i);
}
int i1;
for (i = 0; i < clusterList.size(); i++) {
DocCluster dc1 = (DocCluster) clusterList.get(i);
String[] tagList = dc1.getTags().split(",");
String newTags = "";
// Keep only tags that appear exactly once in the merged tag string.
for (j = 0; j < tagList.length; j++) {
i1 = dc1.getTags().indexOf(tagList[j]);
int i2 = dc1.getTags().lastIndexOf(tagList[j]);
if (i1 == i2)
newTags = newTags + tagList[j] + ",";
}
if ((newTags.trim().length() > 0) && (newTags.endsWith(","))) {
newTags = newTags.substring(0, newTags.length() - 1);
}
dc1.setTags(newTags);
dc1.setTitle("");
if (this.useTagsAsTitle) {
tagList = dc1.getTags().split(",");
for (j = 0; (tagList != null) && (j < tagList.length); j++) {
if ((dc1.getTitle() + tagList[j]).length() > 16)
break;
boolean isSubstr = false;
for (DocCluster c : clusterList) {
if ((c.getTitle().length() <= 0)
|| ((!c.getTitle().contains(tagList[j])) && (!tagList[j].contains(c.getTitle()))))
continue;
isSubstr = true;
break;
}
if (!isSubstr)
dc1.setTitle(dc1.getTitle() + tagList[j] + ",");
}
if ((dc1.getTitle().trim().length() > 0) && (dc1.getTitle().endsWith(","))) {
dc1.setTitle(dc1.getTitle().substring(0, dc1.getTitle().length() - 1));
}
}
if (!dc1.getTitle().isEmpty())
continue;
dc1.setTitle(dc1.getTags());
if (dc1.getTitle().length() <= 16)
continue;
String s = dc1.getTitle().substring(0, 16);
int li = s.lastIndexOf(',');
if (li > 0) {
dc1.setTitle(s.substring(0, li));
}
}
// Collect all unassigned documents into a catch-all "其他" (other) cluster.
if (otherIdList.size() > 0) {
DocCluster clusterOther = new DocCluster();
clusterOther.setDocIdList(new String[otherIdList.size()]);
clusterOther.setDocList(new ICTHit[otherIdList.size()]);
clusterOther.setLevel(level);
clusterOther.setTitle("其他");
clusterOther.setTags("其他");
i = 0;
for (int k=0;k<otherIdList.size();k++) {
int idx = otherIdList.get(k);
clusterOther.getDocIdList()[i] = docs[idx].getDocId();
clusterOther.getDocList()[i] = docs[idx];
i++;
}
clusterList.add(clusterOther);
}
return (List<DocCluster>) clusterList;
}

public List<DocCluster> getClusters() {
return this.clusters;
} public void setClusters(List<DocCluster> clusters) {
this.clusters = clusters;
} public ICTHit[] getDocs() {
return this.docs;
} public void setDocs(ICTHit[] docs) {
this.docs = docs;
} public int getMaxLevels() {
return this.maxLevels;
} public void setMaxLevels(int maxLevels) {
this.maxLevels = maxLevels;
} public ClusteringOptions[] getOptions() {
return this.options;
} public void setOptions(ClusteringOptions[] options) {
this.options = options;
} public boolean isUseTagsAsTitle() {
return this.useTagsAsTitle;
} public void setUseTagsAsTitle(boolean useTagsAsTitle) {
this.useTagsAsTitle = useTagsAsTitle;
} public String getWordsExcluded() {
return this.wordsExcluded;
} public void setWordsExcluded(String wordsExcluded) {
this.wordsExcluded = wordsExcluded;
} private class DocValue {
public List<Integer> idList = new ArrayList();
public String titleListStr = ""; private DocValue() {
}
}
/**
* A pair of topic-word IDs; a topic-word ID is the word's key position in the tag-to-document matrix.
* @author
* @version Created: 2011-03-09 02:52:44 PM
*/
private class IdPair {
public int Id1;
public int Id2; public IdPair(int id1, int id2) {
assert (id1 != id2);
if (id1 < id2) {
this.Id1 = id1;
this.Id2 = id2;
} else {
this.Id1 = id2;
this.Id2 = id1;
}
}

public int hashCode() {
return this.Id1 * 31 + this.Id2;
}

public boolean equals(Object o) {
if (!(o instanceof IdPair))
return false;
return (((IdPair) o).Id1 == this.Id1) && (((IdPair) o).Id2 == this.Id2);
}
} public static class TagHitEntry {
public String key;
public long[] value; public TagHitEntry() {
} public TagHitEntry(String k, long[] v) {
this.key = k;
this.value = v;
} public String getKey() {
return this.key;
} public long[] getValue() {
return this.value;
}
}
}
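
The heart of ClusterBuilder is getWordsRelevance: it compares the document bit masks of two tags, measures how many documents they share relative to their union and to the smaller of the two tags, and subtracts a small penalty for documents they do not share. Below is a standalone sketch of just that scoring step; it is illustrative only, uses the JDK's Long.bitCount instead of the bit1Table lookup, and omits the substring bonus and the minimum-document checks that the real method applies.

public class RelevanceSketch {
    // Rough re-statement of the scoring step in getWordsRelevance, assuming both
    // tags already pass the shared-document and minimum-document checks.
    static int relevance(long[] hitsA, long[] hitsB) {
        int same = 0, merged = 0, diff = 0, countA = 0, countB = 0;
        for (int i = 0; i < hitsA.length; i++) {
            same += Long.bitCount(hitsA[i] & hitsB[i]);
            merged += Long.bitCount(hitsA[i] | hitsB[i]);
            diff += Long.bitCount(hitsA[i] ^ hitsB[i]);
            countA += Long.bitCount(hitsA[i]);
            countB += Long.bitCount(hitsB[i]);
        }
        int docCountMin = Math.min(countA, countB);
        int samePercent = (int) Math.round(same * 100.0 / merged);
        int samePercentMin = (int) Math.round(same * 100.0 / docCountMin);
        int diffPercent = (int) Math.round(diff * 100.0 / merged);
        // Same weighting as the original: favor overlap, lightly penalize difference.
        int rank = (int) Math.round((samePercentMin + samePercent) * 0.85 - diffPercent * 0.2);
        return Math.min(Math.max(rank, 0), 100);
    }

    public static void main(String[] args) {
        // Tag A hits documents 0-4, tag B hits documents 2-6 (bits 0-6 of one long).
        long[] a = { 0b0011111L };
        long[] b = { 0b1111100L };
        System.out.println(relevance(a, b)); // prints 76 for these sample masks
    }
}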

ClusteringOptions.java

/**
*
* @author
* @version Created: 2011-03-08 10:23:27 AM
*/
public class ClusteringOptions {
public static int DefMaxClusterCount = 20;
public static int DefMaxKeywordCount = 6;
public static int DefMinWordsRelevance = 10;
public static int DefTagMinDocCount = 3;
public static int DefIgnoreSameDocs = 2;
public static int DefSameDocPercent = 50;
public static int DefMinDocsToCluster = 8;
private int docMaxTagCount;
private int maxClusterCount;
private int minDocsToCluster;
private int minSameDocPercent;
private int minSameDocs;
private int minTagRelevance;
private int tagMinDocCount; public ClusteringOptions() {
this.maxClusterCount = DefMaxClusterCount;
this.minTagRelevance = DefMinWordsRelevance;
this.tagMinDocCount = DefTagMinDocCount;
this.minSameDocs = DefIgnoreSameDocs;
this.minSameDocPercent = DefSameDocPercent;
this.docMaxTagCount = DefMaxKeywordCount;
this.minDocsToCluster = DefMinDocsToCluster;
} public int getDocMaxTagCount() {
return this.docMaxTagCount;
} public void setDocMaxTagCount(int docMaxTagCount) {
this.docMaxTagCount = docMaxTagCount;
} public int getMaxClusterCount() {
return this.maxClusterCount;
} public void setMaxClusterCount(int maxClusterCount) {
this.maxClusterCount = maxClusterCount;
} public int getMinDocsToCluster() {
return this.minDocsToCluster;
} public void setMinDocsToCluster(int minDocsToCluster) {
this.minDocsToCluster = minDocsToCluster;
} public int getMinSameDocPercent() {
return this.minSameDocPercent;
} public void setMinSameDocPercent(int minSameDocPercent) {
this.minSameDocPercent = minSameDocPercent;
} public int getMinSameDocs() {
return this.minSameDocs;
} public void setMinSameDocs(int minSameDocs) {
this.minSameDocs = minSameDocs;
} public int getMinTagRelevance() {
return this.minTagRelevance;
} public void setMinTagRelevance(int minTagRelevance) {
this.minTagRelevance = minTagRelevance;
} public int getTagMinDocCount() {
return this.tagMinDocCount;
} public void setTagMinDocCount(int tagMinDocCount) {
this.tagMinDocCount = tagMinDocCount;
}
}

DocCluster.java

/**
*
* @author
* @version Created: 2011-03-08 10:23:35 AM
*/
public class DocCluster {
private String[] docIdList;
private ICTHit[] docList;
private int level;
private List<DocCluster> subclusters;
private String tags;
private String title; public String[] getDocIdList() {
return this.docIdList;
} public void setDocIdList(String[] docIdList) {
this.docIdList = docIdList;
} public ICTHit[] getDocList() {
return this.docList;
} public void setDocList(ICTHit[] docList) {
this.docList = docList;
} public int getLevel() {
return level;
} public void setLevel(int level) {
this.level = level;
} public List<DocCluster> getSubclusters() {
return this.subclusters;
} public void setSubclusters(List<DocCluster> subclusters) {
this.subclusters = subclusters;
} public String getTags() {
return this.tags;
} public void setTags(String tags) {
this.tags = tags;
} public String getTitle() {
if (title == null)
title = "";
return this.title;
} public void setTitle(String title) {
this.title = title;
}
}

ICTHit.java

public class ICTHit implements Serializable {
/*
* Keyword (tag) array
*/
private String[] TagList;
private String docId;
private String title; public String[] getTagList() {
return TagList;
} public void setTagList(String[] tagList) {
TagList = tagList;
} public String getDocId() {
return docId;
} public void setDocId(String docId) {
this.docId = docId;
} public String getTitle() {
return title;
} public void setTitle(String title) {
this.title = title;
} }
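
ICTHit is the only input the cluster builder needs: a document ID, a title, and a list of pre-extracted keywords. Where those keywords come from is up to the caller (any Chinese segmenter or keyword extractor will do); a minimal hand-filled example, with made-up sample values, looks like this:

public class BuildInputExample {
    public static void main(String[] args) {
        // Sample values only; in practice the tags would come from a keyword extractor.
        ICTHit doc = new ICTHit();
        doc.setDocId("1001");
        doc.setTitle("示例新闻标题");
        doc.setTagList(new String[] { "聚类", "文本挖掘", "关键词" });

        ICTHit[] docsToCluster = new ICTHit[] { doc };
        System.out.println(docsToCluster.length + " document(s) ready for clustering");
    }
}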

TagHitMatrix.java

public class TagHitMatrix extends LinkedHashMap<String, long[]> {
/**
*
*/
private static final long serialVersionUID = -7511464445378974433L;
public static int ii = 0;
public ClusterDocInfo[] docs;
public int hitsItemCount;

public TagHitMatrix(int DocCount, int MaxTagCount) {
// Each long stores hit bits for 62 documents, so a tag needs ceil(DocCount / 62) longs.
this.hitsItemCount = (int) (DocCount / 62.0D + 0.984375D);
this.docs = new ClusterDocInfo[DocCount];
for (int i = 0; i < this.docs.length; i++)
this.docs[i] = new ClusterDocInfo(MaxTagCount);
}

public void AddDocHit(String TagStr, int Position) {
TagStr = TagStr.trim();
int n = Position / 62;
int m = Position % 62;
long[] DocHits = (long[]) get(TagStr);
if (DocHits == null) {
DocHits = new long[this.hitsItemCount];
put(TagStr, DocHits);
}
DocHits[n] |= 1L << m; // mark document Position as hit by this tag
ClusterDocInfo di = this.docs[Position];
di.TagList[(di.TagCount++)] = TagStr;
}

class ClusterDocInfo {
public String[] TagList;
public int TagCount;

public ClusterDocInfo(int MaxTagCount) {
this.TagList = new String[MaxTagCount];
this.TagCount = 0;
}
}
}
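
TagHitMatrix packs each tag's document hits into an array of long bit masks: document Position sets bit Position % 62 of the long at index Position / 62, and hitsItemCount = (int) (DocCount / 62.0 + 0.984375) works out to the ceiling of DocCount / 62. A quick standalone check of that arithmetic (illustrative only, with made-up counts):

public class BitPackingCheck {
    public static void main(String[] args) {
        int docCount = 125;
        // 125 documents at 62 per long -> 3 longs (the +0.984375 rounds up).
        int hitsItemCount = (int) (docCount / 62.0 + 0.984375);
        System.out.println(hitsItemCount); // 3

        int position = 100;             // the 101st document
        int n = position / 62;          // long index 1
        int m = position % 62;          // bit 38 within that long
        long[] docHits = new long[hitsItemCount];
        docHits[n] |= 1L << m;          // set the hit bit for this document
        System.out.println(Long.toBinaryString(docHits[1])); // a single 1 followed by 38 zeros
    }
}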

Test method:

public void test(ICTHit[] icthits) throws IOException {
ClusterBuilder clusterBuilder = new ClusterBuilder();
// Set the data set to be clustered.
clusterBuilder.setDocs(icthits);
// Set the maximum number of clustering levels.
clusterBuilder.setMaxLevels(10);
clusterBuilder.setUseTagsAsTitle(true);
// The search query terms are usually what you set as wordsExcluded.
clusterBuilder.setWordsExcluded("万美元,日本,公司,视频,北京时间,图文,新华网,新浪,消息,通讯,互联网,美国,中国");
clusterBuilder.setOptions(new ClusteringOptions[] { new ClusteringOptions(), new ClusteringOptions() });
// Start clustering
clusterBuilder.cluster();
FileWriter fw1 = new FileWriter("c:/today-20110509-cluster.txt", true);
BufferedWriter bw1 = new BufferedWriter(fw1);
// Print the results
if (clusterBuilder.getClusters() != null) {
int i = 0;
for (DocCluster docCluster : clusterBuilder.getClusters()) {
i++;
System.out.println("tag:" + docCluster.getTags() + "("
+ docCluster.getDocIdList().length + ")");
bw1.write(docCluster.getTags() + "(" + docCluster.getDocIdList().length + ")" + "\r\n ");
if (docCluster.getDocList() != null
&& docCluster.getDocList().length > 0) {
for (ICTHit co : docCluster.getDocList()) {
System.out.println(" DocID: " + co.getDocId());
bw1.write("标题为: " + co.getTitle()+",ID为"+co.getDocId()+"\r\n ");
for (int m = 0; m < co.getTagList().length; m++) {
bw1.write("关键词为: " + co.getTagList()[m] + "\r\n ");
System.out.println(" Key Word: "
+ co.getTagList()[m]);
}
System.out.println("");
}
System.out.println("");
} else {
bw1.write(" 该分类下无数据!"+"\r\n ");
}
bw1.write("-------------------------------------------------------------------------------\r\n");
}
}
bw1.close();
fw1.close();
}

The approach above works, but it is only an example and has not been used in production. The core methods are all here, and you can pull them into your own projects. For Chinese text the results are much better than carrot2's standard approach.
