Many of the TensorFlow FM implementations I have seen online do not handle the fact that real-world FM input samples are sparse.

In my implementation I use embedding_lookup_sparse to deal with the sparsity (a minimal example follows).
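
As a quick illustration, here is a minimal sketch with hypothetical ids and values (not taken from the model below): each sample only stores the feature ids that are actually present plus their values, and embedding_lookup_sparse computes the weighted sum over exactly those rows of the parameter matrix, which is what the linear term sum_i w_i * x_i needs.

import tensorflow as tf

w = tf.ones([925, 1])  # stand-in for the first-order weight column
# two samples: sample 0 holds features 3 and 17, sample 1 holds feature 42
sp_ids = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                         values=tf.constant([3, 17, 42], dtype=tf.int64),
                         dense_shape=[2, 925])
sp_vals = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                          values=[1.0, 0.5, 2.0],
                          dense_shape=[2, 925])
# per-sample weighted sum over only the present features -> shape [2, 1]
linear = tf.nn.embedding_lookup_sparse(w, sp_ids=sp_ids, sp_weights=sp_vals, combiner='sum')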

For the second-order part, embedding_lookup_sparse cannot compute the square of the sum and the sum of the squares, so I wrote that piece myself, following how the sum and mean combiners are implemented inside embedding_lookup_sparse (a short sketch of the math is shown below). The data-input code still needs work; switching it to a tf.data Dataset would be better.
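
For reference, this is the standard FM identity the code relies on: the pairwise interaction term equals 0.5 * sum_f [ (sum_i v_if * x_i)^2 - sum_i (v_if * x_i)^2 ]. Below is a dense NumPy sketch with hypothetical shapes, purely for illustration; the real code further down does the same computation on sparse inputs with embedding_lookup_sparse and segment_sum.

import numpy as np

batch, n_features, k = 4, 925, 20          # hypothetical sizes
X = np.random.rand(batch, n_features)      # dense inputs, for illustration only
V = np.random.rand(n_features, k) * 1e-3   # feature embedding matrix

sum_square = np.square(X.dot(V))             # "square of the sum":  (sum_i v_i * x_i)^2
square_sum = np.square(X).dot(np.square(V))  # "sum of the squares": sum_i (v_i * x_i)^2
second_order = 0.5 * (sum_square - square_sum).sum(axis=1, keepdims=True)  # [batch, 1]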

The code is as follows:

import tensorflow as tf
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
import random
import numpy as np
from sklearn import metrics


class Args():
    feature_size = 925
    field_size = 15
    embedding_size = 20
    epoch = 3
    batch_size = 2000
    learning_rate = 0.001
    l2_reg_rate = 0.001
    checkpoint_dir = "./model"
    is_training = True


class FMmodel():
    def __init__(self):
        self.feature_sizes = Args.feature_size
        self.field_size = Args.field_size
        self.embedding_size = Args.embedding_size
        self.l2_reg_rate = Args.l2_reg_rate
        self.epoch = Args.epoch
        self.learning_rate = Args.learning_rate
        self.weight = {}
        self.model_path = Args.checkpoint_dir
        self.batch_size = Args.batch_size

    def build_model(self, is_warm_up=False):
        # sparse feature ids and values for one batch
        self.x1_index = tf.sparse_placeholder(tf.int64, name="x1_index")
        self.x1_value = tf.sparse_placeholder(tf.float32, name="x1_value")
        self.labels = tf.placeholder(tf.float32, name="labels", shape=[None, 1])
        init_randomW = tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None, dtype=tf.float32)
        init_randomV = tf.random_normal_initializer(mean=0.0, stddev=0.00001, seed=None, dtype=tf.float32)
        # feature embedding vectors V
        self.weight["feature_weight"] = tf.get_variable(
            shape=[self.feature_sizes, self.embedding_size],
            name='feature_weight',
            initializer=init_randomV
        )
        # first-order weights W
        self.weight["feature_first"] = tf.get_variable(
            shape=[self.feature_sizes, 1],
            initializer=init_randomW,
            name='feature_first')
        self.weight["bais"] = tf.get_variable(shape=[1, 1], initializer=tf.constant_initializer(0.0), name="bais")

        # [batch_size, 1] linear part: sum of x_i * w_i
        self.line_part1 = tf.nn.embedding_lookup_sparse(self.weight["feature_first"],
                                                        sp_ids=self.x1_index, sp_weights=self.x1_value, combiner='sum')
        self.line_part1_shape = tf.shape(self.line_part1)

        # [batch_size, embedding_size] "square of the sum":
        # weighted sum of embedding vectors per sample, then squared
        self.embedding_part1_sum_square = tf.square(
            tf.nn.embedding_lookup_sparse(self.weight["feature_weight"],
                                          sp_ids=self.x1_index, sp_weights=self.x1_value, combiner='sum'))

        # [batch_size, embedding_size] "sum of the squares", written by hand
        # following the sum/mean combiners inside embedding_lookup_sparse
        ids_1 = self.x1_index.values
        self.ids1, self.idx1 = tf.unique(ids_1)
        self.weight_1 = self.x1_value.values
        self.weight_1 = tf.reshape(self.weight_1, [-1, 1])
        if self.weight_1.dtype != dtypes.float32:
            self.weight_1 = math_ops.cast(self.weight_1, dtypes.float32)
        # look up the unique ids, then gather back to [nonzero_count, embedding_size]
        self.embedding_1 = tf.nn.embedding_lookup(self.weight["feature_weight"], ids=self.ids1)
        self.new_embedding_1 = tf.gather(self.embedding_1, self.idx1)
        # [nonzero_count, embedding_size]: (x_i * v_i) and its square
        self.embedding_weight_part1 = tf.multiply(self.weight_1, self.new_embedding_1)
        self.embedding_weight_part1_square = tf.square(self.embedding_weight_part1)
        # row index of each nonzero entry = which sample in the batch it belongs to
        self.segment_ids_1 = self.x1_index.indices[:, 0]
        if self.segment_ids_1.dtype != dtypes.int32:
            self.segment_ids_1 = math_ops.cast(self.segment_ids_1, dtypes.int32)
        self.embeddings_square_sum1 = tf.math.segment_sum(
            self.embedding_weight_part1_square, self.segment_ids_1)
        self.ess1_shape = tf.shape(self.embeddings_square_sum1)

        # [batch_size, 1] second-order part: 0.5 * sum_f((sum_i v_if x_i)^2 - sum_i (v_if x_i)^2)
        self.y1_v = 0.5 * tf.reduce_sum(tf.subtract(self.embedding_part1_sum_square, self.embeddings_square_sum1), 1)
        self.y1_v = tf.reshape(self.y1_v, [-1, 1])
        self.y1 = tf.add(tf.add(self.line_part1, self.y1_v), self.weight["bais"])
        self.o1 = tf.sigmoid(self.y1)
        self.loss = tf.losses.log_loss(labels=self.labels, predictions=self.o1)
        self.error = tf.reduce_mean(self.loss)
        # with tf.name_scope("loss"):
        #     tf.summary.scalar("loss", self.error)
        self.opt = tf.train.AdamOptimizer(self.learning_rate).minimize(self.error)
        self.session = tf.Session()
        self.init = tf.group(tf.global_variables_initializer())
        if is_warm_up:
            # warm start: restore variables from an existing checkpoint
            self.saver = tf.train.Saver(tf.global_variables())
            self.saver.restore(self.session, self.model_path)
        else:
            self.session.run(self.init)

    def predict(self, file_name):
        result_list = []
        for x1_index, x1_value, true_labels in self.load_data(file_name, is_train=False):
            predict1 = self.session.run([self.o1], feed_dict={
                self.x1_value: x1_value,
                self.x1_index: x1_index
            })
            for i in range(len(predict1[0])):
                result_list.append((true_labels[i][0], predict1[0][i]))
        print(len(result_list))
        # write "true_label,predicted_probability" pairs for offline evaluation
        with open("./data/result.txt", 'w') as file1:
            for tp in result_list:
                file1.write(str(tp[0]) + "," + str(tp[1][0]) + "\n")

    def save(self, sess, path):
        saver = tf.train.Saver()
        saver.save(sess, save_path=path)

    def restore(self, sess, path):
        saver = tf.train.Saver()
        saver.restore(sess, save_path=path)

    def train(self, train_data_file):
        index = 0
        for x1_index, x1_value, true_labels in self.load_data(train_data_file):
            # skip degenerate batches with fewer than 2 samples
            if len(true_labels) < 2:
                continue
            my_o1, myerror, _ = self.session.run([self.o1, self.error, self.opt], feed_dict={
                self.x1_index: x1_index,
                self.x1_value: x1_value,
                self.labels: true_labels
            })
            index += 1
            # batch-level AUC as a rough training signal
            print(metrics.roc_auc_score(true_labels, my_o1))
        self.save(self.session, self.model_path)
        self.session.close()

    def load_data(self, file_name, epoch=3, is_train=True):
        def __parse_line(line):
            # strip an optional trailing '#' comment, then parse label, ids, and feat:val pairs
            tokens = line.split("#")[0].split()
            assert len(tokens) >= 2, "Ill-formatted line: {}".format(line)
            label = float(tokens[0])
            uid = tokens[1]
            mid = tokens[2]
            kv_pairs = [kv.split(":") for kv in tokens[3:]]
            features = {k: float(v) for (k, v) in kv_pairs}
            qid = uid
            return qid, features, label

        def __encoder_line(sample):
            qid = sample[0]
            features = sample[1]
            label = sample[2]
            features_arr = []
            for key in features.keys():
                features_arr.append(str(key) + ":" + str(features[key]))
            return str(label) + " " + "qid:" + str(qid) + " " + " ".join(features_arr)

        def __gen_sparse_tensor(sample_list):
            # build one batch of SparseTensorValue inputs from sample_list
            sample_index = 0
            tensor_x1_index_ids = []
            tensor_x1_index_value = []
            tensor_x1_value_ids = []
            tensor_x1_value_values = []
            label_list = []
            for sample in sample_list:
                x1_feature = sample[0]
                label_list.append([float(sample[1])])
                tmpIndex = 0
                for key in x1_feature.keys():
                    # the same sparse position holds the feature id in x1_index
                    # and the feature value in x1_value
                    tensor_x1_index_ids.append([sample_index, tmpIndex])
                    tensor_x1_index_value.append(int(key))
                    tensor_x1_value_ids.append([sample_index, tmpIndex])
                    tensor_x1_value_values.append(float(x1_feature[key]))
                    tmpIndex += 1
                sample_index += 1
            x1_index = tf.SparseTensorValue(indices=tensor_x1_index_ids, values=tensor_x1_index_value,
                                            dense_shape=[len(sample_list), self.feature_sizes])
            x1_value = tf.SparseTensorValue(indices=tensor_x1_value_ids, values=tensor_x1_value_values,
                                            dense_shape=[len(sample_list), self.feature_sizes])
            return x1_index, x1_value, np.asarray(label_list, dtype=np.float32)

        def __gen_train_data(file_name):
            new_file_name = file_name + "_train_data"
            with open(file_name, 'r') as filer:
                with open(new_file_name, 'w') as filew:
                    sample_list = []
                    now_qid = None
                    for l in filer:
                        qid, features, label = __parse_line(l)
                        if now_qid is None or now_qid == qid:
                            now_qid = qid
                            sample_list.append((qid, features, label))
                        else:
                            # a new qid starts: flush the previous group, highest labels first
                            sorted_sample_list = sorted(sample_list, key=lambda x: x[2], reverse=True)
                            for sample in sorted_sample_list:
                                sample_str = __encoder_line(sample)
                                filew.write(sample_str + "\n")
                            sample_list = []
                            now_qid = qid
                            sample_list.append((qid, features, label))
            return new_file_name

        if is_train:
            new_file_name = "./data/new_final_train_data.txt"  # __gen_train_data(file_name)
            print("process data")
            sample_list = []
            while epoch > 0:
                epoch -= 1
                with open(new_file_name, 'r') as filer:
                    for l in filer:
                        qid, features, label = __parse_line(l)
                        # buffer 10 batches worth of samples, shuffle, then emit batches
                        if len(sample_list) < self.batch_size * 10:
                            sample_list.append((features, label))
                        else:
                            random.shuffle(sample_list)
                            start = 0
                            end = len(sample_list)
                            while start < end:
                                tmpEnd = min(end, start + self.batch_size)
                                sub_list = sample_list[start:tmpEnd]
                                x1_index, x1_value, labels = __gen_sparse_tensor(sub_list)
                                # skip batches that contain no positive labels
                                if labels.sum() < 1:
                                    start += self.batch_size
                                    continue
                                yield (x1_index, x1_value, labels)
                                start += self.batch_size
                            sample_list = []
                            sample_list.append((features, label))
        else:
            with open(file_name, 'r') as filer:
                sample_list = []
                for l in filer:
                    qid, features, label = __parse_line(l)
                    if len(sample_list) < self.batch_size:
                        sample_list.append((features, label))
                    else:
                        start = 0
                        end = len(sample_list)
                        while start < end:
                            tmpEnd = min(end, start + self.batch_size)
                            sub_list = sample_list[start:tmpEnd]
                            x1_index, x1_value, labels = __gen_sparse_tensor(sub_list)
                            yield (x1_index, x1_value, labels)
                            start += self.batch_size
                        sample_list = []
                        sample_list.append((features, label))


if __name__ == "__main__":
    fm = FMmodel()
    fm.build_model(is_warm_up=True)
    # fm.train("./data/new_final_train_data.txt")
    fm.predict("./data/test.data")
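
The __main__ block above is set up for warm-starting from a saved checkpoint and running prediction; to train from scratch instead, something along these lines should work (using the paths already present above):

fm = FMmodel()
fm.build_model(is_warm_up=False)
fm.train("./data/new_final_train_data.txt")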
