Implementing Word2vec with TensorFlow
# coding: utf-8
'''
Note: Step 3 is missing in the original numbering, and it has been left that way here.
'''
from __future__ import absolute_import
from __future__ import print_function

import collections
import math
import numpy as np
import os
import random
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import zipfile

# Step 1: Download the data.
# Download the data if it is not already present, then check that it was received correctly (the file size matches).
# Return the filename after the download.
print("Step 1: Download the data.")
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
"""Download a file if not present, and make sure it's the right size."""
if not os.path.exists(filename):
filename, _ = urllib.request.urlretrieve(url + filename, filename)
statinfo = os.stat(filename)
if statinfo.st_size == expected_bytes:
print('Found and verified', filename)
else:
print(statinfo.st_size)
raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
return filename filename = maybe_download('text8.zip', 31344016) # Read the data into a string.
# Read the single file contained in the zip archive.
# text8.zip contains only one file; its contents are words separated by spaces.
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        return f.read(f.namelist()[0]).split()

words = read_data(filename)
print('Data size', len(words))
print('Sample words: ', words[:10])

# Step 2: Build the dictionary and replace rare words with UNK token.
print("\nStep 2: Build the dictionary and replace rare words with UNK token.")
vocabulary_size = 50000


def build_dataset(words):
    # vocabulary_size is the number of frequent words to keep;
    # every word outside the top 50000 (vocabulary_size) is mapped to UNK.
    # :param words: a list of words.
    # :return data: the list of word indices (rare words mapped to the UNK index 0).
    # :return count: (word, frequency) pairs counted with collections.Counter.
    # :return dictionary: {"word": index}
    # :return reverse_dictionary: {index: "word"}, e.g. {0: 'UNK', 1: 'the', ...}
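    #
    # A small hypothetical illustration: if words were ['the', 'cat', 'sat', 'on', 'the', 'mat']
    # and 'mat' happened to fall outside the vocabulary, the result might look like
    #     data = [1, 2, 3, 4, 1, 0]   # 0 is the UNK index; 'the' gets index 1 as the most frequent word
    #     dictionary = {'UNK': 0, 'the': 1, 'cat': 2, ...}
    #     reverse_dictionary = {0: 'UNK', 1: 'the', 2: 'cat', ...}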
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        # len(dictionary) grows by one per insertion, so this effectively assigns indices.
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data: ', data[:10])
print('Sample count: ', count[:10])
print('Sample dict: ', list(dictionary.items())[:10])
print('Sample reverse dict: ', list(reverse_dictionary.items())[:10])

data_index = 0

# Step 4: Function to generate a training batch for the skip-gram model.
print("\nStep 4: Function to generate a training batch for the skip-gram model.")
# Function to generate a minibatch.
def generate_batch(batch_size, num_skips, skip_window):
    # data_index is declared global, so it acts like a static variable here: its value
    # is retained across repeated calls to this function.
    # :param batch_size: size of the minibatch.
    # :param num_skips: how many (target, context) pairs to generate per context window.
    # :param skip_window: context window radius. The skip-gram model predicts the surrounding
    #     words from the target word, and skip_window defines how far "surrounding" reaches.
    # :return batch: minibatch of target word indices.
    # :return labels: labels of the minibatch, a 2-D array of shape [batch_size, 1].
    global data_index
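    # Hypothetical example: with skip_window=1 and num_skips=2, the window around the word
    # "quick" in "the quick brown" produces the pairs (quick -> the) and (quick -> brown);
    # batch stores the repeated target indices and labels stores the context indices.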
    assert batch_size % num_skips == 0  # the batch is built as a multiple of num_skips pairs
    assert num_skips <= 2 * skip_window  # when num_skips == 2*skip_window, a pair is generated
    # for every word in the context window, so num_skips cannot usefully be larger than that.
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    # Deques are a generalization of stacks and queues.
    # The name is pronounced "deck" and is short for "double-ended queue".
    # You can push (append) and pop on both ends.
    # buffer = data[data_index:data_index+span], wrapping around the end of data.
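    # For instance, collections.deque([1, 2, 3], maxlen=3) becomes deque([2, 3, 4]) after
    # append(4): appending to a full deque silently drops the oldest element, which is
    # exactly how the sliding context window advances through data below.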
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)  # the modulo wraps data_index back to the start of the data
    # Skip-gram is a model that predicts the surrounding context words from the target word.
    # Before training the skip-gram model, the words must be converted into (target, context) pairs.
    # The code below does exactly that, filling a batch of batch_size pairs.
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # The context word is picked from the context window at random.
            # If skip_window * 2 == num_skips, every context word gets used anyway,
            # so the randomness only affects the order of the pairs.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

# To see how the batch is configured, print it once:
print("Generating batch ... ")
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
print("Sample batches: ", batch[:10])
print("Sample labels: ", labels[:10])
for i in range(8):
    print(batch[i], '->', labels[i, 0])
    print(reverse_dictionary[batch[i]], '->', reverse_dictionary[labels[i, 0]])

# Step 5: Build and train a skip-gram model.
print("\nStep 5: Build and train a skip-gram model.")
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2        # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(np.arange(valid_window), valid_size))
# valid_size samples are drawn from the range [0, valid_window).
# In other words, valid_examples is a random selection of 16 indices from 0 to 99.
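# Note: if random.sample rejects the NumPy array on newer Python versions, an equivalent
# alternative is valid_examples = np.random.choice(valid_window, valid_size, replace=False).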
num_sampled = 64    # Number of negative examples to sample.

print("valid_examples: ", valid_examples)

graph = tf.Graph()

with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation.
    # tf.nn.embedding_lookup has no GPU implementation here, so it must run on the CPU.
    # Since the default device may be the GPU, the CPU is specified explicitly.
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        # The embedding matrix: one embedding_size vector per vocabulary word.
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # From the full embedding matrix, pull out only the vectors indexed by train_inputs
        # (the minibatch of word indices).
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss.
        # The NCE loss is defined in terms of a logistic regression model, so a weight
        # vector and a bias are needed for every word in the vocabulary.
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
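    # Conceptually (a simplified view, not the exact library implementation): for each
    # (target, context) pair, nce_loss draws num_sampled random "noise" words and trains a
    # binary logistic classifier, parameterized by nce_weights and nce_biases, to score the
    # true context word above the noise words, avoiding a full softmax over the vocabulary.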
    loss = tf.reduce_mean(
        tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                       num_sampled, vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    # That is, the cosine similarity between the validation embeddings and all embeddings.
    # This is used to show which words are closest to each validation example as training
    # progresses (i.e. to watch the learning process).
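    # Cosine similarity of vectors a and b is dot(a, b) / (||a|| * ||b||); once each row of
    # embeddings has been divided by its L2 norm below, a single matrix product yields all
    # pairwise cosine similarities at once.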
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Step 6: Begin training.
print("\nStep 6: Begin training")
num_steps = 100001

with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    tf.initialize_all_variables().run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()).
        # feed_dict pushes the minibatch into the placeholders for this step.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps).
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

# Step 7: Visualize the embeddings.
print("\nStep 7: Visualize the embeddings.")
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)

try:
    # If you get an error here, update scikit-learn and matplotlib to the latest versions.
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
print("Please install sklearn and matplotlib to visualize embeddings.")