Wordvec_句子相似度

import jieba
from jieba import analyse
import numpy
import gensim
import codecs
import pandas as pd
import jieba.posseg as pog
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# Build the raw training text.
def data_handle(data):
    """Concatenate every value of the ``comment`` column into one string.

    Args:
        data: pandas DataFrame that has a ``comment`` column.

    Returns:
        str: each comment converted with ``str()`` and joined in row order
        with no separator; the empty string when there are no rows.
    """
    # Iterating the column positionally replaces the ``DataFrame.ix`` indexer
    # (removed from pandas) and does not depend on the index being 0..n-1.
    # ``str.join`` avoids the quadratic cost of repeated ``+=``.
    return ''.join(str(comment) for comment in data['comment'])
def fenci(data_str, stop_property, stopfile):
    """Segment ``data_str`` and write the kept words to ``weibo.txt``.

    The text is cut with ``jieba.posseg`` (imported as ``pog``). A word is
    written, one per line, when its part-of-speech flag is not in
    ``stop_property`` and the word is not listed in the stop-word file.

    Args:
        data_str: text to segment.
        stop_property: collection of part-of-speech flags to drop.
        stopfile: path of a UTF-8 stop-word file, one word per line.

    Returns:
        None. The result is written to ``weibo.txt`` (UTF-8, overwritten).
    """
    # Stop words: read inside a context manager so the handle is closed, and
    # keep them in a set so the per-token membership test is O(1).
    with open(stopfile, encoding='utf-8') as stop_fh:
        stop_words = {line.strip() for line in stop_fh}

    # NOTE(review): every word goes on its own line. If the consumer treats
    # each line as one sentence, each sentence will hold a single token --
    # confirm this layout is intended.
    with open('weibo.txt', 'w', encoding='utf-8') as f:
        for word, flag in pog.cut(data_str):
            if flag not in stop_property and word not in stop_words:
                f.write(word + '\n')

# The raw training corpus is turned into a sentence iterator; each iteration
# yields one sentence as a list of (UTF-8) words.
def vctor_word():
    """Train a Word2Vec model on ``weibo.txt`` and save it as ``weibo.word2vec``.

    The corpus is wrapped in ``LineSentence`` and passed to ``Word2Vec`` with
    ``sg=0, size=100, window=5, min_count=5, workers=9``.

    Returns:
        None. The trained model is written to ``weibo.word2vec``.
    """
    # The corpus handle is only needed while the model is being built, so it
    # is scoped to a ``with`` block instead of being left open.
    with open('weibo.txt', 'r', encoding='utf-8') as corpus:
        sentences = LineSentence(corpus)
        model = Word2Vec(sentences, sg=0, size=100, window=5, min_count=5, workers=9)
    model.save('weibo.word2vec')

# For any string, find the positions of a given character (the number of
# occurrences is the length of the returned list).
def get_char_pos(string, char):
    """Return the positions at which ``char`` occurs in ``string``.

    Args:
        string: iterable to scan, normally a ``str``.
        char: value compared with ``==`` against each element.

    Returns:
        list of ``(index, char)`` tuples in ascending index order; an empty
        list when there is no match or ``string`` cannot be iterated
        (e.g. ``None``).
    """
    try:
        return [(pos, char) for pos, val in enumerate(string) if val == char]
    except TypeError:
        # Non-iterable input is deliberately treated as "no occurrences";
        # any other error is a real bug and is allowed to propagate.
        return []

# 利用训练好的词向量获取关键词的词向量

def cut_data(data, stopfile):
    """Write the keywords of every comment to ``seg_word.txt``.

    Each row's ``comment`` is cut with ``jieba.posseg``; words whose flag is
    in the kept part-of-speech set and that are not stop words are written
    on that row's line, each followed by ``'/'``.

    Args:
        data: pandas DataFrame with a ``comment`` column. Missing values are
            replaced by 0 **in place** (the caller's frame is modified).
        stopfile: path of a UTF-8 stop-word file, one word per line.

    Returns:
        None. Output goes to ``seg_word.txt`` (UTF-8, overwritten).
    """
    data.fillna(0, inplace=True)
    with open(stopfile, encoding='utf-8') as stop_fh:
        stop_words = {line.strip() for line in stop_fh}
    keep_flags = {'a', 'nr', 'ns', 'nt', 'ng', 'vn', 'vi', 'l', 'n', 'v'}

    with open('seg_word.txt', 'w', encoding='utf-8') as f:
        # Iterate the column directly: ``DataFrame.ix`` no longer exists in
        # pandas, and this does not depend on the index being 0..n-1.
        for comment in data['comment']:
            # 0 marks a missing comment (see fillna above): emit an empty line.
            if comment != 0:
                # str() guards against numeric cells, which jieba cannot cut.
                segs = jieba.posseg.cut(str(comment))
                f.write(''.join(
                    word + '/'
                    for word, flag in segs
                    if flag in keep_flags and word not in stop_words
                ))
            # The terminator is a newline followed by one space. It is kept
            # exactly as-is because the reader of this file in this module
            # drops the final (whitespace-only) line it produces.
            f.write('\n ')

def get_vector(data, model):  # data: str
    """Sum the word vectors of the ``'/'``-separated words in ``data``.

    Args:
        data: string of words separated by ``'/'``. Surrounding whitespace on
            each token (leading space, trailing newline) and empty tokens
            (e.g. after a trailing ``'/'``) are ignored.
        model: object supporting ``word in model`` and ``model[word]``, where
            the lookup returns a 1-D numeric vector.

    Returns:
        numpy.ndarray: element-wise sum of the vectors of all words found in
        ``model``; a zero vector when none is found. Its length is
        ``model.vector_size`` if the model has that attribute, otherwise 100.
    """
    dim = getattr(model, 'vector_size', 100)
    total = numpy.zeros(dim)
    # Splitting on '/' yields every word exactly once, including the last
    # one, and never includes the separator in the token. It also handles
    # input with no '/' at all.
    for token in data.split('/'):
        word = token.strip()
        if word and word in model:
            total = total + model[word]
    return total

def word2vec(file_name, model, str):
    """Score every line of ``file_name`` against the query string ``str``.

    Args:
        file_name: path of a UTF-8 text file with one keyword line per record.
        model: word-vector model passed through to ``get_vector``.
        str: query keywords. The name shadows the builtin but is kept so
            existing callers are unaffected.

    Returns:
        list: one score per line read, in file order. Whitespace-only lines
        score 0; other lines get ``simlarityCalu`` of the line vector and the
        query vector. The list is also printed.
    """
    # Read inside a context manager so the handle is closed. The final line
    # of the file is dropped on purpose ([:-1]), as in the original code.
    with codecs.open(file_name, "r", encoding='utf-8') as data_file:
        lines = data_file.readlines()[:-1]

    query_vector = get_vector(str, model)

    score_list = []
    for line in lines:
        if line.strip() != '':
            score = simlarityCalu(get_vector(line, model), query_vector)
        else:
            score = 0
        score_list.append(score)
    print('score_list', score_list)
    return score_list

# Word-vector similarity: cosine.
def simlarityCalu(vector1, vector2):
    """Return the cosine similarity of ``vector1`` and ``vector2``.

    Both arguments must provide ``.dot``. When either vector has zero
    length the result is the integer 0 instead of a division by zero.
    """
    norm_a = numpy.sqrt(vector1.dot(vector1))
    norm_b = numpy.sqrt(vector2.dot(vector2))
    # Guard clause: cosine is undefined for a zero-length vector.
    if norm_b == 0 or norm_a == 0:
        return 0
    return (vector1.dot(vector2)) / (norm_a * norm_b)

if __name__ == '__main__':

    # Stop-word file and the part-of-speech flags excluded from the corpus.
    stop_file = 'stop.txt'
    stop_property = [
        'b', 'c', 'd', 'e', 'f', 'm', 'o', 'p', 'q', 'r',
        't', 'u', 'x', 'y', 'z', 'uj', 'nrt', 'eng', 'zg', 'ul',
    ]

    # Load the spreadsheet and give its columns ASCII names.
    column_names = {
        '粉丝ID': 'fans_id',
        '粉丝': 'fans_name',
        '微博账户id': 'weibo_user_id',
        '微博名': 'weibo_name',
        '微博id': 'weibo_id',
        '评论id': 'comment_id',
        '评论': 'comment',
    }
    data = pd.read_excel('C:/E/weibo.xlsx')
    data.rename(columns=column_names, inplace=True)

    # Join all comments into one string, then build the corpus from it.
    comment_str = data_handle(data)
    fenci(comment_str, stop_property, stop_file)

    # Train the model.
    vctor_word()

    # Extract the keywords of every comment.
    cut_data(data, stop_file)

    # Score each keyword line against the query. The model loaded here is
    # 'zhiwiki_news.word2vec'; 'weibo.word2vec' is the file written by the
    # training step above and can be loaded instead.
    str1 = '农农/陈利农/宝贝'
    p1_keywords = 'seg_word.txt'
    model = gensim.models.Word2Vec.load('zhiwiki_news.word2vec')
    p1_vec = word2vec(p1_keywords, model, str1)

    # Second sample query; it is not used by the visible code.
    str2 = '舒蔻 尤妮佳 买'

Wordvec_句子相似度的更多相关文章

NLP入门（一）词袋模型及句子相似度
本文作为笔者NLP入门系列文章第一篇,以后我们就要步入NLP时代. 本文将会介绍NLP中常见的词袋模型(Bag of Words)以及如何利用词袋模型来计算句子间的相似度(余弦相似度,cosi ...
[LeetCode] 737. Sentence Similarity II 句子相似度 II
Given two sentences words1, words2 (each represented as an array of strings), and a list of similar ...
[LeetCode] 734. Sentence Similarity 句子相似度
Given two sentences words1, words2 (each represented as an array of strings), and a list of similar ...
使用 TF-IDF 加权的空间向量模型实现句子相似度计算
使用 TF-IDF 加权的空间向量模型实现句子相似度计算字符匹配层次计算句子相似度计算两个句子相似度的算法有很多种,但是对于从未了解过这方面算法的人来说,可能最容易想到的就是使用字符串匹配相关的算 ...
LSTM 句子相似度分析
使用句子中出现单词的Vector加权平均进行文本相似度分析虽然简单,但也有比较明显的缺点:没有考虑词序且词向量区别不明确.如下面两个句子: "北京的首都是中国"与"中国的 ...
[LeetCode] Sentence Similarity 句子相似度
Given two sentences words1, words2 (each represented as an array of strings), and a list of similar ...
[LeetCode] Sentence Similarity II 句子相似度之二
Given two sentences words1, words2 (each represented as an array of strings), and a list of similar ...
句子相似度_tf/idf
import mathfrom math import isnanimport pandas as pd#结巴分词,切开之后,有分隔符def jieba_function(sent): import ...
[LeetCode] 737. Sentence Similarity II 句子相似度之二
Given two sentences words1, words2 (each represented as an array of strings), and a list of similar ...

随机推荐

SVN 不显示状态图标--解决方法
[SVN 不显示状态图标--解决方法] 在名字前面加空格, 三个六个空格随意: 参考:https://www.cnblogs.com/lzpong/p/6187366.html
python websocket网页实时显示远程服务器日志信息
功能:用websocket技术,在运维工具的浏览器上实时显示远程服务器上的日志信息一般我们在运维工具部署环境的时候,需要实时展现部署过程中的信息,或者在浏览器中实时显示程序日志给开发人员看.你还在用 ...
CUDA error 100 & Decoder not initialized
项目中用cuda解码时候遇到该错误,这是调用cuda相关库中一些so库版本错误造成的.
Mybatis抛出 Closing non transactional SqlSession [org.apache.ibatis.session.defaults.DefaultSqlSession@f54509]异常
今天在整合 Spring MVC 和 Spring 时,MyBatis 抛出异常 Closing non transactional SqlSession [org.apache.ibatis.session ...
Docker基础入门
Docker 是一个开源的应用容器引擎,让开发者可以打包他们的应用以及依赖包到一个可移植的容器中,然后发布到任何流行的 Linux 机器上,也可以实现虚拟化.容器是完全使用沙箱机制,相互之间不会有任何 ...
ES6之对象的简洁表示法
ES6 允许直接写入变量和函数,作为对象的属性和方法.这样的书写更加简洁. let name = 'Pirates of the Caribbean', index = 5, captain = { ...
伪异步IO
针对传统的BIO编程,当客户端数量一直增加的情况下,可能会导致服务器直接崩溃掉,进而出现了一种伪异步IO的线程方式. 先看一下代码: 看一下server端的代码: 其中使用了自定义的一个线程池Hand ...
SSM提交了事务但数据库不执行
从图中可以看到,spring已经给出事务提交成功,但数据库并未插入数据,找了老半天发现,数据库表上我加了个触发器,触发器执行失败造成数据库没有commit.但程序为什么不报异常呢?
Jedis cluster集群初始化源码剖析
Jedis cluster集群初始化源码剖析环境 jar版本: spring-data-redis-1.8.4-RELEASE.jar.jedis-2.9.0.jar 测试环境: Redis 3.2 ...
Properties 使用
Properties 属于Map 下HashTable的小弟属于持久的属性集,他可以保存在流中或者在流中加载. 键和值都是字符串类型. 通常用于配置文件方法介绍: 存放键值对:setPropert ...

Wordvec_句子相似度

Wordvec_句子相似度的更多相关文章

随机推荐

热门专题