Since carrot2's handling of Chinese is quite unreliable, I drew on some material from around the web and am now contributing all of the code here.

The idea behind the code is to measure how often each character or word occurs, score the terms, and then pick out the important vocabulary by occurrence count and importance. The usable code is posted below.
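
To make the idea concrete before the full listings: each topic word (tag) is mapped to a bit set of the documents it appears in, the overlap between two bit sets measures how related two tags are, and strongly related tags are merged into one cluster. Here is a minimal, self-contained sketch of that core idea; it uses java.util.BitSet and made-up sample data, while the real code below packs the bits into long[] arrays instead.

import java.util.BitSet;

public class TagOverlapSketch {
    public static void main(String[] args) {
        // Tag "A" occurs in documents 0, 1, 2; tag "B" in documents 1, 2, 3.
        BitSet docsA = new BitSet();
        docsA.set(0); docsA.set(1); docsA.set(2);
        BitSet docsB = new BitSet();
        docsB.set(1); docsB.set(2); docsB.set(3);

        // Shared documents (intersection) and combined documents (union).
        BitSet same = (BitSet) docsA.clone();
        same.and(docsB);
        BitSet merged = (BitSet) docsA.clone();
        merged.or(docsB);

        // The more documents two tags share relative to their union,
        // the more likely they belong in the same cluster.
        int samePercent = Math.round(100f * same.cardinality() / merged.cardinality());
        System.out.println("shared=" + same.cardinality() + ", samePercent=" + samePercent);
    }
}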

ClusterBuilder.java

/** 
*
* @author
* @version Created: 2011-03-08 02:02:36 PM
* Cluster builder
*/
public class ClusterBuilder {
private static final Log LOG;
private List<DocCluster> clusters;
private ICTHit[] docs;
private int maxLevels;
private ClusteringOptions[] options;
private boolean useTagsAsTitle;
private String wordsExcluded;
private static short[] bit1Table;

static {
LOG = LogFactory.getLog(ClusterBuilder.class.getName());
bit1Table = new short[65536];
for (int n = 0; n < bit1Table.length; n++) {
String s = Integer.toBinaryString(n);
short m = 0;
for (int k = 0; k < s.length(); k++) {
if (s.charAt(k) == '1') {
m = (short) (m + 1);
}
}
bit1Table[n] = m;
}
}

private static int getValidBitCount(long n) {
int i3 = (int) (n % 65536L);
n /= 65536L;
int i2 = (int) (n % 65536L);
n /= 65536L;
int i1 = (int) (n % 65536L);
n /= 65536L;
int i0 = (int) (n % 65536L);
return bit1Table[i0] + bit1Table[i1] + bit1Table[i2] + bit1Table[i3];
}

private static int getDocHitCount(long[] hits) {
assert (hits != null);
if (hits == null)
return 0;
int n0 = 0;
for (int i = 0; i < hits.length; i++) {
n0 += getValidBitCount(hits[i]);
}
return n0;
}

public ClusterBuilder() {
// Rebuilds bit1Table; this duplicates the static initializer above.
for (int n = 0; n < bit1Table.length; n++) {
String s = Integer.toBinaryString(n);
short m = 0;
for (int k = 0; k < s.length(); k++) {
if (s.charAt(k) == '1') {
m = (short) (m + 1);
}
}
bit1Table[n] = m;
}
}
/**
*
* @param docsToCluster the list of records to cluster
* @param exWords topic words to exclude, separated by commas; these words will never be used as topic words
* @param maxLevels the maximum number of clustering levels
* @param useTagsAsTitle whether to use topic words as the cluster title; if false, the title is generated from the document titles
*/
public ClusterBuilder(ICTHit[] docsToCluster, String exWords, int maxLevels, boolean useTagsAsTitle) {
this.useTagsAsTitle = useTagsAsTitle;
this.wordsExcluded = exWords;
this.maxLevels = maxLevels;
this.docs = docsToCluster;
this.options = new ClusteringOptions[3];
this.options[0] = new ClusteringOptions();
this.options[0].setDocMaxTagCount(10);
this.options[0].setMinTagRelevance(60);
this.options[0].setMinSameDocPercent(80);
this.options[1] = new ClusteringOptions();
this.options[1].setDocMaxTagCount(8);
this.options[1].setMinTagRelevance(85);
this.options[1].setMinSameDocPercent(70);
this.options[1].setTagMinDocCount(2);
this.options[1].setMinSameDocs(2);
this.options[2] = new ClusteringOptions();
this.options[2].setDocMaxTagCount(8);
this.options[2].setMinTagRelevance(50);
this.options[2].setMinSameDocPercent(70);
this.options[2].setTagMinDocCount(2);
this.options[2].setMinSameDocs(2);
}
/**
* Clusters the records in Docs; the results are stored in Clusters.
*/
public void cluster() {
this.clusters = createLevelClusters(docs, 0, options[0]);
List<DocCluster> subs = null;
if (this.maxLevels <= 1) {
return;
}
for (DocCluster dc : this.clusters) {
// Skip clusters too small to subdivide and the catch-all "其他" (other) cluster.
if ((dc.getDocList().length < options[0].getMinDocsToCluster()) || ("其他".equals(dc.getTags())))
continue;
subs = createLevelClusters(dc.getDocList(), 1, options[1]);
if (subs.size() > 1)
dc.setSubclusters(subs);
}
}
/**
* Builds the clusters for one level.
* @param docs the document list
* @param level the level number
* @param levelOpt the clustering options for this level
* @return
*/
private List<DocCluster> createLevelClusters(ICTHit[] docs, int level, ClusteringOptions levelOpt) {
TagHitMatrix matrix = new TagHitMatrix(docs.length, levelOpt.getDocMaxTagCount());
List clusters = new ArrayList();
int i, ValidTagCount;
int DocCount = 0;
// Scan the document list and initialize the tag-to-document matrix from each document's tag list.
for (i = 0; i < docs.length; i++) {
ICTHit d = docs[i];
int validTagCount = 0;
if (d.getTagList() != null) {
String[] tagList = d.getTagList();
for (int tagIdx = 0; (tagIdx < tagList.length) && (validTagCount < levelOpt.getDocMaxTagCount()); tagIdx++) {
String tag = tagList[tagIdx].trim();
// Discard empty tags, tags longer than 20 characters, and tags on the excluded-word list.
if ((tag.length() <= 0)
|| (tag.length() > 20)
|| ((this.wordsExcluded.length() != 0) && ((tag.contains(this.wordsExcluded)) || (this.wordsExcluded
.contains(tag)))))
continue;
matrix.AddDocHit(tag, i);
validTagCount++;
}
}
}
int maxKwDocCount = 0;
List entryListToRemove = new ArrayList();
String kwWithMaxDocCount = "";
LOG.debug("有效关键词:");
for (Map.Entry entry : matrix.entrySet()) {
// Count the documents hit by the current tag; tags hitting fewer documents than the threshold are dropped.
int n = getDocHitCount((long[]) entry.getValue());
if (n < levelOpt.getTagMinDocCount()) {
entryListToRemove.add((String) entry.getKey());
} else {
LOG.debug((String) entry.getKey() + "(" + n + "), ");
DocCount += n;
}
if (n > maxKwDocCount) {
maxKwDocCount = n;
kwWithMaxDocCount = (String) entry.getKey();
}
}
LOG.debug("");
LOG.debug("被忽略的关键词:");
for (i = 0; i < entryListToRemove.size(); i++) {
LOG.debug((String) entryListToRemove.get(i) + ", ");
matrix.remove(entryListToRemove.get(i));
}
LOG.debug("");
LOG.debug(entryListToRemove.size() + "个关键词被忽略。剩余" + matrix.size() + "个关键词。");
LOG.debug("最大文档数的关键词:" + kwWithMaxDocCount + ",文档数:" + maxKwDocCount + "。");
// Average number of documents per tag, used to derive the shared-document threshold for this level.
double docCountPerTag = matrix.size() > 0 ? (double) DocCount / matrix.size() : 0.0D;
LOG.debug("关键词平均文档数:" + docCountPerTag);
levelOpt.setMinSameDocs((int) (docCountPerTag / (2.0D + level)));
if (levelOpt.getMinSameDocs() < 1) {
levelOpt.setMinSameDocs(1);
}
while (mergeClusters(matrix, levelOpt) > 0) {
}
return createResult(matrix, docs, level, levelOpt);
}

private int mergeClusters(TagHitMatrix matrix, ClusteringOptions opt) {
if (matrix.size() == 0)
return 0;
long[] docHitsMerged = null;
long[] maxDocHitsMerged = null;
String word1 = "";
String word2 = "";
String word1ToMerge = "";
String word2ToMerge = "";
int i,j;
int sameDocs = 0;
// Initialize a relevance bucket list with 101 entries, one for each score from 0 to 100.
List rankMatrix = new ArrayList();
for (i = 0; i < 101; i++) {
rankMatrix.add(new ArrayList());
}
List matrix2List = new ArrayList();
matrix2List.addAll(matrix.entrySet());
// Compare every pair of tags in the tag-to-document matrix.
for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
Map.Entry hits1 = (Map.Entry) matrix2List.get(i1);
word1 = (String) hits1.getKey();
for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
Map.Entry hits2 = (Map.Entry) matrix2List.get(i2);
word2 = (String) hits2.getKey();
Object[] re = getWordsRelevance(mapEntry2TagHitEntry(hits1), mapEntry2TagHitEntry(hits2),
docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
// Relevance of the two tags, their merged document set, and the number of documents they share.
int nRank = ((Integer) re[0]).intValue();
docHitsMerged = (long[]) re[1];
sameDocs = ((Integer) re[2]).intValue();
// Ignore pairs whose relevance is below the configured threshold.
if (nRank >= opt.getMinTagRelevance()) {
((List) rankMatrix.get(nRank)).add(new IdPair(i1, i2));
}
}
}
List tagListToRemove = new ArrayList();
List entryListMerged = new ArrayList();
entryListMerged.add(new TagHitEntry("", null));
HashSet idPairTable = new HashSet();
TagHitEntry entryToMerge1;
while (true) {
// Find the two tags with the highest relevance.
for (i = 100; (i >= opt.getMinTagRelevance()) && (((List) rankMatrix.get(i)).size() == 0); i--){};
if (i < opt.getMinTagRelevance()) {
break;
}
IdPair ip = (IdPair) ((List) rankMatrix.get(i)).get(0);
// Merge the two clusters.
((List) rankMatrix.get(i)).remove(0);
entryToMerge1 = ip.Id1 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id1))
: (TagHitEntry) entryListMerged.get(-ip.Id1);
TagHitEntry entryToMerge2 = ip.Id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id2))
: (TagHitEntry) entryListMerged.get(-ip.Id2);
word1ToMerge = entryToMerge1.key;
word2ToMerge = entryToMerge2.key;
assert ((word1ToMerge.length() > 0) && (word2ToMerge.length() > 0));
String wordsMerged = word1ToMerge + "," + word2ToMerge;
long[] lDocs0 = entryToMerge1.value;
long[] lDocs1 = entryToMerge2.value;
maxDocHitsMerged = new long[matrix.hitsItemCount];
for (i = 0; i < lDocs0.length; i++) {
maxDocHitsMerged[i] = lDocs0[i] | lDocs1[i]; // merged document set of the two tags
}
if (ip.Id1 >= 0)
tagListToRemove.add(word1ToMerge);
else
entryListMerged.set(-ip.Id1, new TagHitEntry("", null));
if (ip.Id2 >= 0)
tagListToRemove.add(word2ToMerge);
else {
entryListMerged.set(-ip.Id2, new TagHitEntry("", null));
}
entryListMerged.add(new TagHitEntry(wordsMerged, maxDocHitsMerged));
// Re-score the other tag pairs that referenced either of the merged tags.
int idMerged = -(entryListMerged.size() - 1);
int id2 = 0;
boolean CanDelete = false;
for (i = 0; i <= 100; i++) {
int ListCount = ((List) rankMatrix.get(i)).size();
if (ListCount == 0) {
continue;
}
for (j = 0; j < ListCount; j++) {
IdPair p = (IdPair) ((List) rankMatrix.get(i)).get(j);
CanDelete = false;
if ((ip.Id1 == p.Id1) || (ip.Id2 == p.Id1)) {
id2 = p.Id2;
CanDelete = true;
} else if ((ip.Id1 == p.Id2) || (ip.Id2 == p.Id2)) {
id2 = p.Id1;
CanDelete = true;
}
if (!CanDelete)
continue;
if (idMerged == id2) {
continue;
}
((List) rankMatrix.get(i)).remove(j);
j--;
ListCount--;
IdPair pairMerged = new IdPair(idMerged, id2);
if (idPairTable.contains(pairMerged)) {
continue;
}
TagHitEntry e2 = id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(id2))
: (TagHitEntry) entryListMerged.get(-id2);
assert ((e2.key.length() != 0) && (!e2.key.equals(wordsMerged)));
Object[] re = getWordsRelevance(new TagHitEntry(wordsMerged, maxDocHitsMerged), e2, docHitsMerged,
sameDocs, opt, matrix.hitsItemCount);
int rank = ((Integer) re[0]).intValue();
docHitsMerged = (long[]) re[1];
sameDocs = ((Integer) re[2]).intValue();
if (rank <= opt.getMinTagRelevance())
continue;
((List) rankMatrix.get(rank)).add(pairMerged);
idPairTable.add(pairMerged);
} } }
// Remove the tags that were merged away.
for (int m = 0; m < tagListToRemove.size(); m++) {
matrix.remove(tagListToRemove.get(m));
}
// Add the newly merged tags, skipping the empty placeholder entries.
for (int n = 0; n < entryListMerged.size(); n++) {
TagHitEntry e = (TagHitEntry) entryListMerged.get(n);
if (e.getKey().length() > 0)
matrix.put(e.getKey(), e.getValue());
}
return 0;
}

private int mergeClusters1(TagHitMatrix matrix, ClusteringOptions opt) {
if (matrix.size() == 0)
return 0;
long[] docHitsMerged = null;
long[] maxDocHitsMerged = null;
int nMaxRank = 0;
String word1 = "";
String word2 = "";
String word1ToMerge = "";
String word2ToMerge = "";
int sameDocs = 0;
List matrix2List = new ArrayList();
matrix2List.addAll(matrix.entrySet());
for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
TagHitEntry hits1 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i1));
word1 = hits1.getKey();
for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
TagHitEntry hits2 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i2));
word2 = hits2.getKey();
Object[] re = getWordsRelevance(hits1, hits2, docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
int nRank = ((Integer) re[0]).intValue();
docHitsMerged = (long[]) re[1];
sameDocs = ((Integer) re[2]).intValue();
if ((nRank <= nMaxRank) || (nRank <= opt.getMinTagRelevance()))
continue;
nMaxRank = nRank;
maxDocHitsMerged = docHitsMerged;
word1ToMerge = word1;
word2ToMerge = word2;
}
}
if ((word1ToMerge.length() == 0) || (word2ToMerge.length() == 0)) {
return 0;
}
String wordsMerged = word1ToMerge + "," + word2ToMerge;
if (nMaxRank > opt.getMinTagRelevance()) {
matrix.remove(word1ToMerge);
matrix.remove(word2ToMerge);
matrix.put(wordsMerged, maxDocHitsMerged);
LOG.debug("(" + word1ToMerge + ") - (" + word2ToMerge + ")");
return 1;
}
return 0;
}

private Object[] getWordsRelevance(TagHitEntry entry1, TagHitEntry entry2, long[] docHitsMerged, int sameDocCount,
ClusteringOptions opt, int hitsItemCount) {
Object[] re = new Object[3];
docHitsMerged = new long[hitsItemCount];
sameDocCount = 0;
String tag1 = entry1.getKey();
String tag2 = entry2.getKey();
assert (!tag2.equals(tag1));
long[] lDocs0 = entry1.getValue();
long[] lDocs1 = entry2.getValue();
int n0 = 0;
int n1 = 0;
n0 = getDocHitCount(lDocs0);
n1 = getDocHitCount(lDocs1);
int docCountMin = Math.min(n0, n1);
int docCountMax = Math.max(n0, n1);
int docCountMerged = 0;
long sameDocBits = 0L;
long diffDocBits = 0L;
int diffDocCount = 0;
for (int i = 0; i < lDocs0.length; i++) {
docHitsMerged[i] = lDocs0[i] | lDocs1[i]; // union: documents hit by either tag
docCountMerged += getValidBitCount(docHitsMerged[i]);
diffDocBits = lDocs0[i] ^ lDocs1[i]; // symmetric difference: documents hit by only one of the tags
diffDocCount += getValidBitCount(diffDocBits);
sameDocBits = lDocs0[i] & lDocs1[i]; // intersection: documents hit by both tags
sameDocCount += getValidBitCount(sameDocBits);
}
boolean IsSubstring = false;
// If one tag is a substring of the other, the pair gets a higher score.
if ((tag2.contains(tag1)) || (tag1.contains(tag2))) {
IsSubstring = true;
docCountMin += opt.getTagMinDocCount();
}
if ((sameDocCount == 0) && (!IsSubstring)) {
re[0] = Integer.valueOf(0);
re[1] = docHitsMerged;
re[2] = Integer.valueOf(sameDocCount);
return re;
}
if (docCountMin < opt.getTagMinDocCount()) {
re[0] = Integer.valueOf(0);
re[1] = docHitsMerged;
re[2] = Integer.valueOf(sameDocCount);
return re;
}
int samePercent = (int) Math.round(sameDocCount * 100.0D / docCountMerged);
int samePercentMin = (int) Math.round(sameDocCount * 100.0D / docCountMin);
int diffPercent = (int) Math.round(diffDocCount * 100.0D / docCountMerged);
LOG.debug("相关性:" + tag1 + "(" + n0 + ")-(" + n1 + ")" + tag2);
LOG.debug(", SamePercent=" + samePercent);
LOG.debug(", SamePercentMin=" + samePercentMin);
LOG.debug(", DiffPercent=" + diffPercent);
int nRank;
if ((sameDocCount >= opt.getMinSameDocs())
&& ((docCountMin < 10) || (samePercentMin >= opt.getMinSameDocPercent()))) {
nRank = (int) Math.round((samePercentMin + samePercent) * 0.85D - diffPercent * 0.2D);
} else {
nRank = 0;
}
if (IsSubstring)
nRank += 80;
LOG.debug(", Rank=" + nRank);
re[0] = Integer.valueOf(Math.min(nRank, 100));
re[1] = docHitsMerged;
re[2] = Integer.valueOf(sameDocCount);
return re;
}

private TagHitEntry mapEntry2TagHitEntry(Map.Entry<String, long[]> e) {
return new TagHitEntry((String) e.getKey(), (long[]) e.getValue());
}

@SuppressWarnings("unchecked")
private List<DocCluster> createResult(TagHitMatrix matrix, ICTHit[] docs, int level, ClusteringOptions opt) {
int i,j;
Map<String,DocValue> clsIdList = new HashMap();
List ClassTitleList = new ArrayList();
for (Map.Entry de : matrix.entrySet()) {
DocValue dv = new DocValue();
clsIdList.put((String) de.getKey(), dv);
}
List<Integer> otherIdList = new ArrayList();
TagHitEntry maxTagHitEntry = new TagHitEntry();
int clsCount;
String tag;
// Determine which cluster each document belongs to.
for (i = 0; i < docs.length; i++) {
ICTHit d = docs[i];
TagHitMatrix.ClusterDocInfo di = matrix.docs[i];
assert (docs[i] != null);
int maxTagHit = 0;
clsCount = 0;
for (Map.Entry hits : matrix.entrySet()) {
int tagHitCount = 0;
int score = 0;
String clsWordListStr = "," + (String) hits.getKey() + ",";
// The document is assigned to the cluster whose tag set covers the most of the document's own tags.
for (j = 0; j < di.TagCount; j++) {
tag = di.TagList[j];
score = j < 3 ? 2 : 1;
assert (tag.length() > 0);
if (!clsWordListStr.contains("," + tag + ","))
continue;
tagHitCount += score;
clsCount++;
}
if (maxTagHit >= tagHitCount)
continue;
maxTagHit = tagHitCount;
maxTagHitEntry = mapEntry2TagHitEntry(hits);
}
if (maxTagHit > 0) {
DocValue dv = (DocValue) clsIdList.get(maxTagHitEntry.getKey());
dv.idList.add(Integer.valueOf(i));
} else {
otherIdList.add(Integer.valueOf(i));
} }
// Build the cluster list.
List<DocCluster> clusterList = new ArrayList();
String[] TagList;
Object dc;
for (Map.Entry<String,DocValue> kv : clsIdList.entrySet()) {
DocValue dv = (DocValue) kv.getValue();
if (dv.idList.size() <= 0)
continue;
if (dv.idList.size() == 1) {
otherIdList.add((Integer) dv.idList.get(0));
} else {
dc = new DocCluster();
((DocCluster) dc).setDocIdList(new String[dv.idList.size()]);
((DocCluster) dc).setDocList(new ICTHit[dv.idList.size()]);
for (i = 0; i < dv.idList.size(); i++) {
((DocCluster) dc).getDocIdList()[i] = docs[((Integer) dv.idList.get(i)).intValue()].getDocId();
((DocCluster) dc).getDocList()[i] = docs[((Integer) dv.idList.get(i)).intValue()];
}
((DocCluster) dc).setLevel(level);
((DocCluster) dc).setTags((String) kv.getKey());
// Insert the cluster so that the list stays sorted by descending document count.
for (i = 0; (i < clusterList.size())
&& (((DocCluster) dc).getDocIdList().length <= ((DocCluster) clusterList.get(i)).getDocIdList().length);) {
i++;
}
clusterList.add(i, (DocCluster) dc);
}
}
// Clusters beyond the maximum count are folded into the "other" group.
for (i = opt.getMaxClusterCount(); i < clusterList.size();) {
DocCluster c = (DocCluster) clusterList.get(i);
List idList = ((DocValue) clsIdList.get(c.getTags())).idList;
for (dc = idList.iterator(); ((Iterator) dc).hasNext();) {
int idx = ((Integer) ((Iterator) dc).next()).intValue();
otherIdList.add(Integer.valueOf(idx));
}
clusterList.remove(i);
}
int i1;
for (i = 0; i < clusterList.size(); i++) {
DocCluster dc1 = (DocCluster) clusterList.get(i);
String[] tagList = dc1.getTags().split(",");
String newTags = "";
// Keep only tags that appear exactly once in the merged tag string.
for (j = 0; j < tagList.length; j++) {
i1 = dc1.getTags().indexOf(tagList[j]);
int i2 = dc1.getTags().lastIndexOf(tagList[j]);
if (i1 == i2)
newTags = newTags + tagList[j] + ",";
}
if ((newTags.trim().length() > 0) && (newTags.endsWith(","))) {
newTags = newTags.substring(0, newTags.length() - 1);
}
dc1.setTags(newTags);
dc1.setTitle("");
if (this.useTagsAsTitle) {
tagList = dc1.getTags().split(",");
for (j = 0; (tagList != null) && (j < tagList.length); j++) {
if ((dc1.getTitle() + tagList[j]).length() > 16)
break;
boolean isSubstr = false;
for (DocCluster c : clusterList) {
if ((c.getTitle().length() <= 0)
|| ((!c.getTitle().contains(tagList[j])) && (!tagList[j].contains(c.getTitle()))))
continue;
isSubstr = true;
break;
}
if (!isSubstr)
dc1.setTitle(dc1.getTitle() + tagList[j] + ",");
}
if ((dc1.getTitle().trim().length() > 0) && (dc1.getTitle().endsWith(","))) {
dc1.setTitle(dc1.getTitle().substring(0, dc1.getTitle().length() - 1));
}
}
if (!dc1.getTitle().isEmpty())
continue;
dc1.setTitle(dc1.getTags());
if (dc1.getTitle().length() <= 16)
continue;
String s = dc1.getTitle().substring(0, 16);
int li = s.lastIndexOf(',');
if (li > 0) {
dc1.setTitle(s.substring(0, li));
}
}
// Collect all unassigned documents into a catch-all "其他" (other) cluster.
if (otherIdList.size() > 0) {
DocCluster clusterOther = new DocCluster();
clusterOther.setDocIdList(new String[otherIdList.size()]);
clusterOther.setDocList(new ICTHit[otherIdList.size()]);
clusterOther.setLevel(level);
clusterOther.setTitle("其他");
clusterOther.setTags("其他");
i = 0;
for (int k=0;k<otherIdList.size();k++) {
int idx = otherIdList.get(k);
clusterOther.getDocIdList()[i] = docs[idx].getDocId();
clusterOther.getDocList()[i] = docs[idx];
i++;
}
clusterList.add(clusterOther);
}
return (List<DocCluster>) clusterList;
}

public List<DocCluster> getClusters() {
return this.clusters;
} public void setClusters(List<DocCluster> clusters) {
this.clusters = clusters;
} public ICTHit[] getDocs() {
return this.docs;
} public void setDocs(ICTHit[] docs) {
this.docs = docs;
} public int getMaxLevels() {
return this.maxLevels;
} public void setMaxLevels(int maxLevels) {
this.maxLevels = maxLevels;
} public ClusteringOptions[] getOptions() {
return this.options;
} public void setOptions(ClusteringOptions[] options) {
this.options = options;
} public boolean isUseTagsAsTitle() {
return this.useTagsAsTitle;
} public void setUseTagsAsTitle(boolean useTagsAsTitle) {
this.useTagsAsTitle = useTagsAsTitle;
} public String getWordsExcluded() {
return this.wordsExcluded;
} public void setWordsExcluded(String wordsExcluded) {
this.wordsExcluded = wordsExcluded;
} private class DocValue {
public List<Integer> idList = new ArrayList();
public String titleListStr = ""; private DocValue() {
}
}
/**
* A pair of topic-word IDs; a topic-word ID is the word's key position in the tag-to-document matrix.
* @author
* @version Created: 2011-03-09 02:52:44 PM
*/
private class IdPair {
public int Id1;
public int Id2; public IdPair(int id1, int id2) {
assert (id1 != id2);
if (id1 < id2) {
this.Id1 = id1;
this.Id2 = id2;
} else {
this.Id1 = id2;
this.Id2 = id1;
}
}

public int hashCode() {
return this.Id1 * 31 + this.Id2;
}

public boolean equals(Object o) {
if (!(o instanceof IdPair))
return false;
return (((IdPair) o).Id1 == this.Id1) && (((IdPair) o).Id2 == this.Id2);
}
} public static class TagHitEntry {
public String key;
public long[] value; public TagHitEntry() {
} public TagHitEntry(String k, long[] v) {
this.key = k;
this.value = v;
} public String getKey() {
return this.key;
} public long[] getValue() {
return this.value;
}
}
}
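
The heart of ClusterBuilder is getWordsRelevance: it compares the document bit masks of two tags, measures how many documents they share relative to their union and to the smaller of the two tags, and subtracts a small penalty for documents they do not share. Below is a standalone sketch of just that scoring step; it is illustrative only, uses the JDK's Long.bitCount instead of the bit1Table lookup, and omits the substring bonus and the minimum-document checks that the real method applies.

public class RelevanceSketch {
    // Rough re-statement of the scoring step in getWordsRelevance, assuming both
    // tags already pass the shared-document and minimum-document checks.
    static int relevance(long[] hitsA, long[] hitsB) {
        int same = 0, merged = 0, diff = 0, countA = 0, countB = 0;
        for (int i = 0; i < hitsA.length; i++) {
            same += Long.bitCount(hitsA[i] & hitsB[i]);
            merged += Long.bitCount(hitsA[i] | hitsB[i]);
            diff += Long.bitCount(hitsA[i] ^ hitsB[i]);
            countA += Long.bitCount(hitsA[i]);
            countB += Long.bitCount(hitsB[i]);
        }
        int docCountMin = Math.min(countA, countB);
        int samePercent = (int) Math.round(same * 100.0 / merged);
        int samePercentMin = (int) Math.round(same * 100.0 / docCountMin);
        int diffPercent = (int) Math.round(diff * 100.0 / merged);
        // Same weighting as the original: favor overlap, lightly penalize difference.
        int rank = (int) Math.round((samePercentMin + samePercent) * 0.85 - diffPercent * 0.2);
        return Math.min(Math.max(rank, 0), 100);
    }

    public static void main(String[] args) {
        // Tag A hits documents 0-4, tag B hits documents 2-6 (bits 0-6 of one long).
        long[] a = { 0b0011111L };
        long[] b = { 0b1111100L };
        System.out.println(relevance(a, b)); // prints 76 for these sample masks
    }
}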

ClusteringOptions.java

/**
*
* @author
* @version Created: 2011-03-08 10:23:27 AM
*/
public class ClusteringOptions {
public static int DefMaxClusterCount = 20;
public static int DefMaxKeywordCount = 6;
public static int DefMinWordsRelevance = 10;
public static int DefTagMinDocCount = 3;
public static int DefIgnoreSameDocs = 2;
public static int DefSameDocPercent = 50;
public static int DefMinDocsToCluster = 8;
private int docMaxTagCount;
private int maxClusterCount;
private int minDocsToCluster;
private int minSameDocPercent;
private int minSameDocs;
private int minTagRelevance;
private int tagMinDocCount; public ClusteringOptions() {
this.maxClusterCount = DefMaxClusterCount;
this.minTagRelevance = DefMinWordsRelevance;
this.tagMinDocCount = DefTagMinDocCount;
this.minSameDocs = DefIgnoreSameDocs;
this.minSameDocPercent = DefSameDocPercent;
this.docMaxTagCount = DefMaxKeywordCount;
this.minDocsToCluster = DefMinDocsToCluster;
} public int getDocMaxTagCount() {
return this.docMaxTagCount;
} public void setDocMaxTagCount(int docMaxTagCount) {
this.docMaxTagCount = docMaxTagCount;
} public int getMaxClusterCount() {
return this.maxClusterCount;
} public void setMaxClusterCount(int maxClusterCount) {
this.maxClusterCount = maxClusterCount;
} public int getMinDocsToCluster() {
return this.minDocsToCluster;
} public void setMinDocsToCluster(int minDocsToCluster) {
this.minDocsToCluster = minDocsToCluster;
} public int getMinSameDocPercent() {
return this.minSameDocPercent;
} public void setMinSameDocPercent(int minSameDocPercent) {
this.minSameDocPercent = minSameDocPercent;
} public int getMinSameDocs() {
return this.minSameDocs;
} public void setMinSameDocs(int minSameDocs) {
this.minSameDocs = minSameDocs;
} public int getMinTagRelevance() {
return this.minTagRelevance;
} public void setMinTagRelevance(int minTagRelevance) {
this.minTagRelevance = minTagRelevance;
} public int getTagMinDocCount() {
return this.tagMinDocCount;
} public void setTagMinDocCount(int tagMinDocCount) {
this.tagMinDocCount = tagMinDocCount;
}
}

DocCluster.java

/**
*
* @author
* @version Created: 2011-03-08 10:23:35 AM
*/
public class DocCluster {
private String[] docIdList;
private ICTHit[] docList;
private int level;
private List<DocCluster> subclusters;
private String tags;
private String title; public String[] getDocIdList() {
return this.docIdList;
} public void setDocIdList(String[] docIdList) {
this.docIdList = docIdList;
} public ICTHit[] getDocList() {
return this.docList;
} public void setDocList(ICTHit[] docList) {
this.docList = docList;
} public int getLevel() {
return level;
} public void setLevel(int level) {
this.level = level;
} public List<DocCluster> getSubclusters() {
return this.subclusters;
} public void setSubclusters(List<DocCluster> subclusters) {
this.subclusters = subclusters;
} public String getTags() {
return this.tags;
} public void setTags(String tags) {
this.tags = tags;
} public String getTitle() {
if (title == null)
title = "";
return this.title;
} public void setTitle(String title) {
this.title = title;
}
}

ICTHit.java

public class ICTHit implements Serializable {
/*
* Keyword (tag) array
*/
private String[] TagList;
private String docId;
private String title; public String[] getTagList() {
return TagList;
} public void setTagList(String[] tagList) {
TagList = tagList;
} public String getDocId() {
return docId;
} public void setDocId(String docId) {
this.docId = docId;
} public String getTitle() {
return title;
} public void setTitle(String title) {
this.title = title;
} }
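
ICTHit is the only input the cluster builder needs: a document ID, a title, and a list of pre-extracted keywords. Where those keywords come from is up to the caller (any Chinese segmenter or keyword extractor will do); a minimal hand-filled example, with made-up sample values, looks like this:

public class BuildInputExample {
    public static void main(String[] args) {
        // Sample values only; in practice the tags would come from a keyword extractor.
        ICTHit doc = new ICTHit();
        doc.setDocId("1001");
        doc.setTitle("示例新闻标题");
        doc.setTagList(new String[] { "聚类", "文本挖掘", "关键词" });

        ICTHit[] docsToCluster = new ICTHit[] { doc };
        System.out.println(docsToCluster.length + " document(s) ready for clustering");
    }
}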

TagHitMatrix.java

public class TagHitMatrix extends LinkedHashMap<String, long[]> {
/**
*
*/
private static final long serialVersionUID = -7511464445378974433L;
public static int ii = 0;
public ClusterDocInfo[] docs;
public int hitsItemCount;

public TagHitMatrix(int DocCount, int MaxTagCount) {
// Each long stores hit bits for 62 documents, so a tag needs ceil(DocCount / 62) longs.
this.hitsItemCount = (int) (DocCount / 62.0D + 0.984375D);
this.docs = new ClusterDocInfo[DocCount];
for (int i = 0; i < this.docs.length; i++)
this.docs[i] = new ClusterDocInfo(MaxTagCount);
}

public void AddDocHit(String TagStr, int Position) {
TagStr = TagStr.trim();
int n = Position / 62;
int m = Position % 62;
long[] DocHits = (long[]) get(TagStr);
if (DocHits == null) {
DocHits = new long[this.hitsItemCount];
put(TagStr, DocHits);
}
DocHits[n] |= 1L << m; // mark document Position as hit by this tag
ClusterDocInfo di = this.docs[Position];
di.TagList[(di.TagCount++)] = TagStr;
}

class ClusterDocInfo {
public String[] TagList;
public int TagCount;

public ClusterDocInfo(int MaxTagCount) {
this.TagList = new String[MaxTagCount];
this.TagCount = 0;
}
}
}
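
TagHitMatrix packs each tag's document hits into an array of long bit masks: document Position sets bit Position % 62 of the long at index Position / 62, and hitsItemCount = (int) (DocCount / 62.0 + 0.984375) works out to the ceiling of DocCount / 62. A quick standalone check of that arithmetic (illustrative only, with made-up counts):

public class BitPackingCheck {
    public static void main(String[] args) {
        int docCount = 125;
        // 125 documents at 62 per long -> 3 longs (the +0.984375 rounds up).
        int hitsItemCount = (int) (docCount / 62.0 + 0.984375);
        System.out.println(hitsItemCount); // 3

        int position = 100;             // the 101st document
        int n = position / 62;          // long index 1
        int m = position % 62;          // bit 38 within that long
        long[] docHits = new long[hitsItemCount];
        docHits[n] |= 1L << m;          // set the hit bit for this document
        System.out.println(Long.toBinaryString(docHits[1])); // a single 1 followed by 38 zeros
    }
}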

Test method:

public void test(ICTHit[] icthits) throws IOException {
ClusterBuilder clusterBuilder = new ClusterBuilder();
// Set the data set to be clustered.
clusterBuilder.setDocs(icthits);
// Set the maximum number of clustering levels.
clusterBuilder.setMaxLevels(10);
clusterBuilder.setUseTagsAsTitle(true);
// The search query terms are usually what you set as wordsExcluded.
clusterBuilder.setWordsExcluded("万美元,日本,公司,视频,北京时间,图文,新华网,新浪,消息,通讯,互联网,美国,中国");
clusterBuilder.setOptions(new ClusteringOptions[] { new ClusteringOptions(), new ClusteringOptions() });
// Start clustering
clusterBuilder.cluster();
FileWriter fw1 = new FileWriter("c:/today-20110509-cluster.txt", true);
BufferedWriter bw1 = new BufferedWriter(fw1);
// Print the results
if (clusterBuilder.getClusters() != null) {
int i = 0;
for (DocCluster docCluster : clusterBuilder.getClusters()) {
i++;
System.out.println("tag:" + docCluster.getTags() + "("
+ docCluster.getDocIdList().length + ")");
bw1.write(docCluster.getTags() + "(" + docCluster.getDocIdList().length + ")" + "\r\n ");
if (docCluster.getDocList() != null
&& docCluster.getDocList().length > 0) {
for (ICTHit co : docCluster.getDocList()) {
System.out.println(" DocID: " + co.getDocId());
bw1.write("标题为: " + co.getTitle()+",ID为"+co.getDocId()+"\r\n ");
for (int m = 0; m < co.getTagList().length; m++) {
bw1.write("关键词为: " + co.getTagList()[m] + "\r\n ");
System.out.println(" Key Word: "
+ co.getTagList()[m]);
}
System.out.println("");
}
System.out.println("");
} else {
bw1.write(" 该分类下无数据!"+"\r\n ");
}
bw1.write("-------------------------------------------------------------------------------\r\n");
}
}
bw1.close();
fw1.close();
}

The approach above works, but it is only an example and has not been used in production. The core methods are all here, and you can pull them into your own projects. For Chinese text the results are much better than carrot2's standard approach.
