Tensorflow 处理libsvm格式数据生成TFRecord (parse libsvm data to TFRecord)
#写libsvm格式数据 (write libsvm)
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
# \file gen-records.py
# \author chenghuige
# \date 2016-08-12 11:52:01.952044
# \Description
# ==============================================================================
from
__future__
import absolute_import
from
__future__
import division
#from __future__ import print_function
import
sys,os
import
tensorflow
as
tf
import
numpy
as
np
# Shorthand for TensorFlow's command-line flag module and its flag-value container.
flags = tf.app.flags
FLAGS = flags.FLAGS
def _float_feature(v):
  """Wrap the sequence of floats `v` in a tf.train.Feature backed by a FloatList."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=v))
def _int_feature(v):
  """Wrap the sequence of integers `v` in a tf.train.Feature backed by an Int64List."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=v))
#how to store global info, using sequence example?
def main(argv):
  """Convert a libsvm-format text file into a TFRecord file.

  Each non-blank input line has the form
  ``label [num_features] index:value index:value ...``.  The second token is
  treated as the feature count only when it contains no ':'; otherwise the
  count is recorded as 0.

  Every line becomes one tf.train.Example with the features 'label',
  'num_features', 'index' and 'value'.

  Args:
    argv: command-line arguments; argv[1] is the input libsvm path and
      argv[2] is the output TFRecord path.

  Raises:
    ValueError: if a label, feature count, index or value cannot be parsed,
      or an item is not exactly of the form ``index:value``.
  """
  writer = tf.python_io.TFRecordWriter(argv[2])
  try:
    with open(argv[1]) as libsvm_file:
      for line in libsvm_file:
        l = line.split()
        if not l:
          # Blank line: nothing to convert.
          continue
        label = int(l[0])
        start = 1
        num_features = 0
        # The length check keeps label-only lines from raising IndexError.
        if len(l) > 1 and ':' not in l[1]:
          num_features = int(l[1])
          start += 1
        indexes = []
        values = []
        for item in l[start:]:
          index, value = item.split(':')
          indexes.append(int(index))
          values.append(float(value))
        example = tf.train.Example(features=tf.train.Features(feature={
            'label': _int_feature([label]),
            'num_features': _int_feature([num_features]),
            'index': _int_feature(indexes),
            'value': _float_feature(values),
        }))
        writer.write(example.SerializeToString())
  finally:
    # Close explicitly so buffered records are flushed even if parsing fails.
    writer.close()


if __name__ == '__main__':
  tf.app.run()
#读libsvm格式数据 (read libsvm)
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
# \file read-records.py
# \author chenghuige
# \date 2016-07-19 17:09:07.466651
# \Description
# ==============================================================================
#@TODO treat comment as sparse input ?
from
__future__
import absolute_import
from
__future__
import division
#from __future__ import print_function
import
sys, os, time
import
tensorflow
as
tf
import
numpy
as
np
# Shorthand for TensorFlow's command-line flag module and its flag-value container.
flags = tf.app.flags
FLAGS = flags.FLAGS
# Command-line options; read_records() forwards all three to batch_inputs().
flags.DEFINE_integer('batch_size', 5, 'Batch size.')
flags.DEFINE_integer('num_epochs', 10, 'Number of epochs to run trainer.')
# Forwarded to tf.train.shuffle_batch as num_threads.
flags.DEFINE_integer('num_preprocess_threads', 12, '')
# Used by batch_inputs() as shuffle_batch's min_after_dequeue and as the base
# of the shuffle queue capacity.
MIN_AFTER_DEQUEUE = 10000
def read(filename_queue):
  """Return the serialized record that a TFRecordReader reads from `filename_queue`.

  The key returned alongside the record is discarded.
  """
  _key, record = tf.TFRecordReader().read(filename_queue)
  return record
def decode(batch_serialized_examples):
  """Parse a batch of serialized Examples into (label, index, value).

  'label' is parsed as a fixed-length scalar int64 feature; 'index' (int64)
  and 'value' (float32) are parsed as variable-length features.
  """
  feature_spec = {
      'label' : tf.FixedLenFeature([], tf.int64),
      'index' : tf.VarLenFeature(tf.int64),
      'value' : tf.VarLenFeature(tf.float32),
  }
  parsed = tf.parse_example(batch_serialized_examples, features=feature_spec)
  return parsed['label'], parsed['index'], parsed['value']
def batch_inputs(files, batch_size, num_epochs=None, num_preprocess_threads=1):
  """Build the input pipeline: filename queue -> reader -> shuffled batch -> decode.

  Args:
    files: list of TFRecord file names to read.
    batch_size: number of examples per batch.
    num_epochs: passed to string_input_producer; any falsy value
      (None or 0) is normalised to None.
    num_preprocess_threads: passed to shuffle_batch as num_threads.

  Returns:
    The (label, index, value) tuple produced by decode().
  """
  num_epochs = num_epochs or None
  queue_capacity = MIN_AFTER_DEQUEUE + (num_preprocess_threads + 1) * batch_size
  with tf.name_scope('input'):
    filename_queue = tf.train.string_input_producer(files, num_epochs=num_epochs)
    shuffled_batch = tf.train.shuffle_batch(
        [read(filename_queue)],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=queue_capacity,
        # Ensures a minimum amount of shuffling of examples.
        min_after_dequeue=MIN_AFTER_DEQUEUE)
    return decode(shuffled_batch)
def read_records():
  """Read batches from the TFRecord files matching sys.argv[1] and print them.

  Builds the input pipeline in a fresh graph using FLAGS.batch_size,
  FLAGS.num_epochs and FLAGS.num_preprocess_threads, then prints every batch
  until the input queue raises OutOfRangeError.
  """
  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    tf_record_pattern = sys.argv[1]
    data_files = tf.gfile.Glob(tf_record_pattern)
    label, index, value = batch_inputs(
        data_files,
        batch_size=FLAGS.batch_size,
        num_epochs=FLAGS.num_epochs,
        num_preprocess_threads=FLAGS.num_preprocess_threads)

    # Initialize the variables (the trained variables and the epoch counter).
    init_op = tf.group(tf.initialize_all_variables(),
                       tf.initialize_local_variables())

    # Create a session for running operations in the Graph.
    sess = tf.InteractiveSession()
    sess.run(init_op)

    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    step = 0
    try:
      while not coord.should_stop():
        label_, index_, value_ = sess.run([label, index, value])
        # Single-argument print calls behave the same on Python 2 and 3.
        print(label_)
        print(index_)
        print(value_)
        # First three components of the fetched 'index' result; presumably the
        # sparse indices, values and dense shape -- TODO confirm.
        print(index_[0])
        print(index_[1])
        print(index_[2])
        step += 1
    except tf.errors.OutOfRangeError:
      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
    finally:
      # When done, ask the threads to stop, wait for them, and release the
      # session even if an unexpected error interrupted the loop.
      coord.request_stop()
      coord.join(threads)
      sess.close()
def main(_):
  """Entry point handed to tf.app.run(); its argument is ignored."""
  read_records()


if __name__ == '__main__':
  tf.app.run()
#文本分类 text classification
https://github.com/chenghuige/tensorflow-example
Using TFRecord needs only a small modification, as shown below; I will update the code on GitHub soon.
class SparseClassificationTrainer(object):
  """General framework for sparse binary classification training.

  Sparse binary classification uses the sparse embedding lookup trick,
  see https://github.com/tensorflow/tensorflow/issues/342
  """

  def __init__(self, dataset=None, num_features=0):
    """Set up the trainer's inputs.

    Args:
      dataset: None, a TfDataSet (inputs come from its index/value/label
        attributes), or another dataset object exposing labels, features,
        num_features and num_classes (inputs are fed through placeholders).
      num_features: feature count used when `dataset` is None or a TfDataSet.
    """
    # isinstance instead of an exact type comparison, so subclasses of
    # TfDataSet also take the record-reading path; evaluated once for both
    # decisions below.
    from_records = isinstance(dataset, TfDataSet)

    if dataset is not None and not from_records:
      self.labels = dataset.labels
      self.features = dataset.features
      self.num_features = dataset.num_features
      self.num_classes = dataset.num_classes
    else:
      self.features = SparseFeatures()
      self.num_features = num_features
      self.num_classes = None

    self.index_only = False
    self.total_features = self.num_features

    if from_records:
      # Inputs are read straight from the dataset's attributes.
      self.X = (dataset.index, dataset.value)
      self.Y = dataset.label
    else:
      # Inputs are fed at run time: ids and weights share the same sparse
      # indices and shape.
      self.sp_indices = tf.placeholder(tf.int64, name='sp_indices')
      self.sp_shape = tf.placeholder(tf.int64, name='sp_shape')
      self.sp_ids_val = tf.placeholder(tf.int64, name='sp_ids_val')
      self.sp_weights_val = tf.placeholder(tf.float32, name='sp_weights_val')
      self.sp_ids = tf.SparseTensor(self.sp_indices, self.sp_ids_val, self.sp_shape)
      self.sp_weights = tf.SparseTensor(self.sp_indices, self.sp_weights_val, self.sp_shape)
      self.X = (self.sp_ids, self.sp_weights)
      self.Y = tf.placeholder(tf.int32)  # same as batch size

    self.type = 'sparse'
# Used by batch_inputs() as shuffle_batch's min_after_dequeue and as the base
# of the shuffle queue capacity.
MIN_AFTER_DEQUEUE = 10000
def read(filename_queue):
  """Return the serialized record that a TFRecordReader reads from `filename_queue`.

  The key returned alongside the record is discarded.
  """
  _key, record = tf.TFRecordReader().read(filename_queue)
  return record
def decode(batch_serialized_examples):
  """Parse a batch of serialized Examples into (label, index, value).

  'label' is parsed as a fixed-length scalar int64 feature; 'index' (int64)
  and 'value' (float32) are parsed as variable-length features.
  """
  feature_spec = {
      'label' : tf.FixedLenFeature([], tf.int64),
      'index' : tf.VarLenFeature(tf.int64),
      'value' : tf.VarLenFeature(tf.float32),
  }
  parsed = tf.parse_example(batch_serialized_examples, features=feature_spec)
  return parsed['label'], parsed['index'], parsed['value']
def batch_inputs(files, batch_size, num_epochs=None, num_preprocess_threads=12):
  """Build the input pipeline: filename queue -> reader -> shuffled batch -> decode.

  Args:
    files: list of TFRecord file names to read.
    batch_size: number of examples per batch.
    num_epochs: passed to string_input_producer; any falsy value
      (None or 0) is normalised to None.
    num_preprocess_threads: passed to shuffle_batch as num_threads.

  Returns:
    The (label, index, value) tuple produced by decode().
  """
  if not num_epochs:
    num_epochs = None
  with tf.name_scope('input'):
    filename_queue = tf.train.string_input_producer(
        files, num_epochs=num_epochs)
    serialized_example = read(filename_queue)
    batch_serialized_examples = tf.train.shuffle_batch(
        [serialized_example],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=MIN_AFTER_DEQUEUE + (num_preprocess_threads + 1) * batch_size,
        # Ensures a minimum amount of shuffling of examples.
        min_after_dequeue=MIN_AFTER_DEQUEUE)
    # The original line was missing its closing parenthesis.
    return decode(batch_serialized_examples)
class TfDataSet(object):
  """Dataset backed by TFRecord files that are read through a TensorFlow input graph.

  Call build_read_graph() once to create the label/index/value tensors, then
  next_batch() to evaluate them in a session.
  """

  def __init__(self, data_files):
    """Store the file pattern; the read graph is built later.

    Args:
      data_files: glob pattern of the TFRecord files to read.
    """
    self.data_files = data_files
    #@TODO now only deal sparse input
    self.features = SparseFeatures()
    # All three tensors stay None until build_read_graph() has run.
    self.label = None
    self.index = None
    self.value = None

  def build_read_graph(self, batch_size):
    """Create the batched label/index/value tensors for the matching files.

    Args:
      batch_size: number of examples per batch, forwarded to batch_inputs().
    """
    tf_record_pattern = self.data_files
    data_files = tf.gfile.Glob(tf_record_pattern)
    self.label, self.index, self.value = batch_inputs(data_files, batch_size)

  def next_batch(self, sess):
    """Evaluate one batch in `sess`.

    Args:
      sess: session whose run() evaluates the label/index/value tensors.

    Returns:
      A pair ((index, value), label) with the evaluated batch.

    Raises:
      RuntimeError: if build_read_graph() has not been called yet.
    """
    if self.label is None or self.index is None or self.value is None:
      raise RuntimeError(
          'TfDataSet.build_read_graph() must be called before next_batch()')
    label, index, value = sess.run([self.label, self.index, self.value])
    trX = (index, value)
    trY = label
    return trX, trY
# Excerpt of the training loop when the training set is read from TFRecords.
# NOTE(review): the original indentation was lost; the loop is assumed to sit
# inside the is_record branch because it relies on the queue coordinator --
# confirm against the full source.
trainset = melt.load_dataset(trainset_file, is_record=FLAGS.is_record)
if FLAGS.is_record:
  trainset.build_read_graph(batch_size)
  step = 0
  while not coord.should_stop():
    #self.trainer.X, self.trainer.Y = trainset.next_batch(self.session)
    _, cost_, accuracy_ = self.session.run([self.train_op, self.cost, self.accuracy])
    if step % 100 == 0:
      # Formatted so the output is the same on Python 2 and Python 3.
      print('step: %s train precision@1: %s cost: %s' % (step, accuracy_, cost_))
    if step % 1000 == 0:
      pass
    # The original statement was truncated to "step +=".
    step += 1
Tensorflow 处理libsvm格式数据生成TFRecord (parse libsvm data to TFRecord)的更多相关文章
- 记录几种有关libsvm格式数据的list和dict用法
# list元素求和 sum = reduce(lambda x,y: x+y, mylist) # 比较两个 lists 的元素是否完全一致 if all(x==y for x, y in zip( ...
- ini格式数据生成与解析具体解释
ini格式数据生成与解析具体解释 1.ini格式数据长啥样? watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQv/font/5a6L5L2T/fontsize/ ...
- 更加清晰的TFRecord格式数据生成及读取
TFRecords 格式数据文件处理流程 TFRecords 文件包含了 tf.train.Example 协议缓冲区(protocol buffer),协议缓冲区包含了特征 Features.Ten ...
- tensorflow制作tfrecord格式数据
tf.Example msg tensorflow提供了一种统一的格式.tfrecord来存储图像数据.用的是自家的google protobuf.就是把图像数据序列化成自定义格式的二进制数据. To ...
- tensorflow学习笔记(10) mnist格式数据转换为TFrecords
本程序 (1)mnist的图片转换成TFrecords格式 (2) 读取TFrecords格式 # coding:utf-8 # 将MNIST输入数据转化为TFRecord的格式 # http://b ...
- iOS开发之JSON格式数据的生成与解析
本文将从四个方面对IOS开发中JSON格式数据的生成与解析进行讲解: 一.JSON是什么? 二.我们为什么要用JSON格式的数据? 三.如何生成JSON格式的数据? 四.如何解析JSON格式的数据? ...
- Android使用DOM生成和输出XML格式数据
Android使用DOM生成和输出XML格式数据 本文主要简单解说怎样使用DOM生成和输出XML数据. 1. 生成和输出XML数据 代码及凝视例如以下: try { DocumentBuilderFa ...
- 转载 -- iOS开发之JSON格式数据的生成与解析
本文将从四个方面对IOS开发中JSON格式数据的生成与解析进行讲解: 一.JSON是什么? 二.我们为什么要用JSON格式的数据? 三.如何生成JSON格式的数据? 四.如何解析JSON格式的数据? ...
- PHP生成和获取XML格式数据
在做数据接口时,我们通常要获取第三方数据接口或者给第三方提供数据接口,而这些数据格式通常是以XML或者JSON格式传输,本文将介绍如何使用PHP生成XML格式数据供第三方调用以及如何获取第三方提供的X ...
随机推荐
- mysql 可以跨库查询
eg: SELECTcity.ID,city.`Name`,city.CountryCode,city.District,city.Population,adv_site.ADD_DATEFROMci ...
- Jquery实现特效滑动菜单栏
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...
- 【转】机器学习教程 十四-利用tensorflow做手写数字识别
模式识别领域应用机器学习的场景非常多,手写识别就是其中一种,最简单的数字识别是一个多类分类问题,我们借这个多类分类问题来介绍一下google最新开源的tensorflow框架,后面深度学习的内容都会基 ...
- Java学习笔记15
do-while循环时while循环的变体语法如下:do{ // 循环体 语句(组);}while(循环继续条件); 如果循环中的语句至少需要执行一次,那么建议使用do-while循环. for循环 ...
- SpringMVC学习记录1
起因 以前大三暑假实习的时候看到公司用SpringMVC而不是Struts2,老司机告诉我SpringMVC各种方便,各种解耦. 然后我自己试了试..好像是蛮方便的.... 基本上在Spring的基础 ...
- js实现继承的5种方式 (笔记)
js实现继承的5种方式 以下 均为 ES5 的写法: js是门灵活的语言,实现一种功能往往有多种做法,ECMAScript没有明确的继承机制,而是通过模仿实现的,根据js语言的本身的特性,js实现继承 ...
- 神经网络模型及R代码实现
神经网络基本原理 一.神经元模型 图中x1~xn是从其他神经元传来的输入信号,wij表示表示从神经元j到神经元i的连接权值,θ表示一个阈值 ( threshold ),或称为偏置( bias ).则神 ...
- 移动端开发概览【webview和touch事件】
作为一个前端,而且作为一个做移动端开发的前端,那意味着你要有三头六臂,跟iOS开发哥哥一起打酱油,跟Android开发哥哥一起修bug... Android vs Ios 我在webkit内核的chr ...
- 关于MapReduce中自定义分组类(三)
Job类 /** * Define the comparator that controls which keys are grouped together * for a single ...
- 【IDEA 2016】intellij idea tomcat jsp 热部署
刚开始用IDEA,落伍的我,只是觉得IDEA好看.可以换界面.想法如此的low. 真是不太会用啊,弄好了tomcat.程序启动竟然改动一下就要重启,JSP页面也一样. IDEA可以配置热部署,打开to ...