# coding: utf-8
'''
Note: Step 3 is missing. That's why I left it.
''' from __future__ import absolute_import
from __future__ import print_function import collections
import math
import numpy as np
import os
import random
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import zipfile # Step 1: Download the data.
# Downloading data. If the file already exists, check that it was received correctly (the file size is the same).
# Return filename after download.
print("Step 1: Download the data.")
url = 'http://mattmahoney.net/dc/' def maybe_download(filename, expected_bytes):
"""Download a file if not present, and make sure it's the right size."""
if not os.path.exists(filename):
filename, _ = urllib.request.urlretrieve(url + filename, filename)
statinfo = os.stat(filename)
if statinfo.st_size == expected_bytes:
print('Found and verified', filename)
else:
print(statinfo.st_size)
raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
return filename filename = maybe_download('text8.zip', 31344016) # Read the data into a string.
# file (zipfile) 을 읽어옴
# text8.zip contains only one file. Looking at the code, it seems to be words separated by ''.
def read_data(filename):
f = zipfile.ZipFile(filename)
for name in f.namelist():
return f.read(name).split()
f.close() words = read_data(filename)
print('Data size', len(words))
print('Sample words: ', words[:10]) # Step 2: Build the dictionary and replace rare words with UNK token.
print("\nStep 2: Build the dictionary and replace rare words with UNK token.")
vocabulary_size = 50000 def build_dataset(words):
#vocabulary_size is the number of frequent words to use.
#all words that do not fit within the top 50000 (vocabulary_size) are treated as UNK.
#Param words: literally a list of words
#Return data: indices of words including UNK. That is, words index list.
#Return count: collections.Counter which counts the frequency of occurrence of each word
#:return dictionary: {"word": "index"}
#:return reverse_dictionary: {"index": "word"}. e.g.) {0: 'UNK', 1: 'the', ...}
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary) # insert index to dictionary (len이 계속 증가하므로 결과적으로 index의 효과)
data = list()
unk_count = 0
for word in words:
if word in dictionary:
index = dictionary[word]
else:
index = 0 # dictionary['UNK']
unk_count = unk_count + 1
data.append(index]
count[0][1] = unk_count
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return data, count, dictionary, reverse_dictionary data, count, dictionary, reverse_dictionary = build_dataset(words)
del words # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data: ', data[:10])
print('Sample count: ', count[:10])
print('Sample dict: ', dictionary.items()[:10])
print('Sample reverse dict: ', reverse_dictionary.items()[:10]) data_index = 0 # Step 4: Function to generate a training batch for the skip-gram model.
print("\nStep 4: Function to generate a training batch for the skip-gram model.")
def generate_batch(batch_size, num_skips, skip_window): #Function to generate minibatch.
#Data_index is declared as global, which acts as static here. That is, the value of data_index is retained even if this function is continually recalled.
#Param batch_size: batch_size.
#Param num_skips: how many (target, context) pairs to generate in the context window.
#Param skip_window: context window size. The skip-gram model predicts the surrounding words from the target word, and skip_window defines the range of the surrounding words.
#Return batch: mini-batch of data.
#Return labels: labels of mini-batch. 2d array of [batch_size] [1]. global data_index
assert batch_size % num_skips == 0 # num_skips의 배수로 batch가 생성되므로.
assert num_skips <= 2 * skip_window # num_skips == 2*skip_window 이면 모든 context window의 context에 대해 pair가 생성된다.
# That is, it should not be larger than that.
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
buffer = collections.deque(maxlen=span)
# Deques are a generalization of stacks and queues.
# The name is pronounced "deck" and is short for "double-ended queue".
# You can both push (append) & pop both sides.
# buffer = data[data_index:data_index+span] with circling
for _ in range(span):
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data) # operator to discard the remainder or decimal point
#Skip-gram is a model for predicting surrounding context words from target words.
#Before we learn skip-gram model, we need to convert words to (target, context) type.
#The code below will do the job with batch_size size. for i in range(batch_size // num_skips):
target = skip_window # target label at the center of the buffer
targets_to_avoid = [ skip_window ]
for j in range(num_skips):
while target in targets_to_avoid:
# Extracting the context from the context window is done randomly.
# However, if skip_window * 2 == num_skips, all contexts are taken out, so random is not meaningful. The order is random only.
target = random.randint(0, span - 1) targets_to_avoid.append(target)
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[target] buffer.append(data[data_index])
data_index = (data_index + 1) % len(data) return batch, labels # To see how the batch is configured, output it once:
print("Generating batch ... ")
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
print("Sample batches: ", batch[:10])
print("Sample labels: ", labels[:10])
for i in range(8):
print(batch[i], '->', labels[i, 0])
print(reverse_dictionary[batch[i]], '->', reverse_dictionary[labels[i, 0]]) # Step 5: Build and train a skip-gram model.
print("\nStep 5: Build and train a skip-gram model.")
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label. # We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(np.arange(valid_window), valid_size))
# [0 ~ valid_window] And then sampled by valid_size.
# In other words, valid_examples is a random selection of 16 from 0 to 99.
num_sampled = 64 # Number of negative examples to sample. print("valid_examples: ", valid_examples) graph = tf.Graph() with graph.as_default(): # Input data.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32) # Ops and variables pinned to the CPU because of missing GPU implementation
# Embedding_lookup This GPU implementation is not implemented, so it must be a CPU.
# Since default is GPU, it explicitly specifies CPU.
with tf.device('/cpu:0'):
# Look up embeddings for inputs.
# embedding matrix (vectors)
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# Only the embedding vectors pointed to by train_inputs (mini-batch; indices) in the entire embedding matrix are extracted
embed = tf.nn.embedding_lookup(embeddings, train_inputs) # Construct the variables for the NCE loss
# NCE loss is defined using a logistic regression model.
# That is, for logistic regression, weight and bias are needed for each word of vocabulary.
nce_weights = tf.Variable(
tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size])) # Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
num_sampled, vocabulary_size)) # Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss) # Compute the cosine similarity between minibatch examples and all embeddings.
# Calculate the cosine similarity between minibatch (valid_embeddings) and all embeddings.
# This process is to show which words are closest to each valid_example as the learning progresses (ie to show the learning process).
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True) # Step 6: Begin training
print("\nStep 6: Begin training")
num_steps = 100001 with tf.Session(graph=graph) as session:
# We must initialize all variables before we use them.
tf.initialize_all_variables().run()
print("Initialized") average_loss = 0
for step in xrange(num_steps):
batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels} # We perform one update step by evaluating the optimizer op (including it
# in the list of returned values for session.run()
# Use feed_dict to put data into the placeholder and learn it.
_, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
average_loss += loss_val if step % 2000 == 0:
if step > 0:
average_loss = average_loss / 2000
# The average loss is an estimate of the loss over the last 2000 batches.
print("Average loss at step ", step, ": ", average_loss)
average_loss = 0 # note that this is expensive (~20% slowdown if computed every 500 steps)
if step % 10000 == 0:
sim = similarity.eval()
for i in xrange(valid_size):
valid_word = reverse_dictionary[valid_examples[i]]
top_k = 8 # number of nearest neighbors
nearest = (-sim[i, :]).argsort()[1:top_k+1]
log_str = "Nearest to %s:" % valid_word
for k in xrange(top_k):
close_word = reverse_dictionary[nearest[k]]
log_str = "%s %s," % (log_str, close_word)
print(log_str)
final_embeddings = normalized_embeddings.eval() # Step 7: Visualize the embeddings.
print("\nStep 7: Visualize the embeddings.")
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
plt.figure(figsize=(18, 18)) #in inches
for i, label in enumerate(labels):
x, y = low_dim_embs[i,:]
plt.scatter(x, y)
plt.annotate(label,
xy=(x, y),
xytext=(5, 2),
textcoords='offset points',
ha='right',
va='bottom') plt.savefig(filename) try:
# If you get an error here, update scikit-learn and matplotlib to the latest version.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500 low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
plot_with_labels(low_dim_embs, labels) except ImportError:
print("Please install sklearn and matplotlib to visualize embeddings.")
  

tensorflow实现Word2vec的更多相关文章

  1. 文本分布式表示(二):用tensorflow和word2vec训练词向量

    看了几天word2vec的理论,终于是懂了一些.理论部分我推荐以下几篇教程,有博客也有视频: 1.<word2vec中的数学原理>:http://www.cnblogs.com/pegho ...

  2. 利用 TensorFlow 入门 Word2Vec

    利用 TensorFlow 入门 Word2Vec 原创 2017-10-14 chen_h coderpai 博客地址:http://www.jianshu.com/p/4e16ae0aad25 或 ...

  3. Tensorflow 的Word2vec demo解析

    简单demo的代码路径在tensorflow\tensorflow\g3doc\tutorials\word2vec\word2vec_basic.py Sikp gram方式的model思路 htt ...

  4. 使用tensorflow训练word2vec

    from http://blog.csdn.net/wangyangzhizhou/article/details/77530479?locationNum=1&fps=1 使用了tensor ...

  5. Tensorflow做阅读理解与完形填空

    catalogue . 前言 . 使用的数据集 . 数据预处理 . 训练 . 测试模型运行结果: 进行实际完形填空 0. 前言 开始写这篇文章的时候是晚上12点,突然想到几点新的理解,赶紧记下来.我们 ...

  6. 利用python中的gensim模块训练和测试word2vec

    word2vec的基础知识介绍参考上一篇博客和列举的参考资料. 首先利用安装gensim模块,相关依赖如下,注意版本要一致: Python >= 2.7 (tested with version ...

  7. 对word2vec的理解及资料整理

    对word2vec的理解及资料整理 无他,在网上看到好多对word2vec的介绍,当然也有写的比较认真的,但是自己学习过程中还是看了好多才明白,这里按照自己整理梳理一下资料,形成提纲以便学习. 介绍较 ...

  8. 《TensorFlow实战》读书笔记(完结)

    1 TensorFlow基础 ---1.1TensorFlow概要 TensorFlow使用数据流图进行计算,一次编写,各处运行. ---1.2 TensorFlow编程模型简介 TensorFlow ...

  9. DeepLearning入门笔记(一),准备工作与注意事项

    本文记录了安装theano.keras.tensorflow以及运行tutorial程序时遇到的一些问题,供后人参考. 实验机器:联想笔记本,i7-6700HQ,GTX960M,16G内存,SSD硬盘 ...

随机推荐

  1. SLAM的数学基础(3):几种常见的概率分布的实现及验证。

    分布,在计算机学科里一般是指概率分布,是概率论的基本概念之一.分布反映的是随机或某个系统中的某个变量,它的取值的范围和规律. 常见的分布有:二项分布.泊松分布.正态分布.指数分布等,下面对它们进行一一 ...

  2. K8s基本概念资料

    https://www.cnblogs.com/menkeyi/p/7134460.html

  3. react应用(基于react脚手架)

    使用create-react-app创建react应用 react脚手架 1) xxx脚手架: 用来帮助程序员快速创建一个基于xxx库的模板项目 a. 包含了所有需要的配置 b. 指定好了所有的依赖 ...

  4. 微信小程序云开发-数据库-查询满足条件的数据

    一.查询价格大于10的商品 1.wxml文件 2.js文件 where条件语句:.where({price:db.command.gt(10)}) 3.查询结果 二.查询价格大于等于10的商品 js文 ...

  5. Centos7 firewall开放3306端口 笔记

    1. 开启端口 // zone -- 作用域 // add-port=80/tcp -- 添加端口,格式为:端口/通讯协议 // permanent -- 永久生效,没有此参数重启后失效 firewa ...

  6. informix 数据库锁表分析和解决方法

    一.前言 在联机事务处理(OLTP)的数据库应用系统中,多用户.多任务的并发性是系统最重要的技术指标之一.为了提高并发性,目前大部分RDBMS都采用加锁技术.然而由于现实环境的复杂性,使用加锁技术又不 ...

  7. 第四篇 -- CSS基础

    表单.单选.下拉框.文本域.多选框.提交.重置.按钮 <!DOCTYPE html> <html lang="en"> <head> <m ...

  8. 如何在windows 11中安装WSLG(WSL2)

    什么是 WSL WSL(Windows Subsystem for Linux):Windows 系统中的一个子系统,在这个子系统上可以运行 Linux 操作系统. 可以让开发人员直接在 Window ...

  9. etcd学习(5)-etcd的Raft一致性算法原理

    ETCD的Raft一致性算法原理 前言 Raft原理了解 raft选举 raft中的几种状态 任期 leader选举 日志复制 安全性 leader宕机,新的leader未同步前任committed的 ...

  10. 点云上的深度学习及其在三维场景理解中的应用(PPT内容整理PointNet)

      这篇博客主要是整理了PointNet提出者祁芮中台介绍PointNet.PointNet++.Frustum PointNets的PPT内容,内容包括如何将点云进行深度学习,如何设计新型的网络架构 ...