Reading Data from Files

When training a model in TensorFlow, of the three data-reading approaches described on the official site, the best is reading files through a queue, which takes two steps:

  1. Write the sample data into a TFRecords binary file
  2. Read it back from a queue

A TFRecords binary file makes better use of memory, is easier to move and copy, and needs no separate label files.

Below is the official code for converting the MNIST data; for the full source see: tensorflow-master\tensorflow\examples\how_tos\reading_data\convert_to_records.py


Generating the TFRecords files

Define the main function, which converts the training, validation, and test datasets:

def main(unused_argv):
    # Get the data.
    data_sets = mnist.read_data_sets(FLAGS.directory,
                                     dtype=tf.uint8,
                                     reshape=False,
                                     validation_size=FLAGS.validation_size)

    # Convert to Examples and write the result to TFRecords.
    convert_to(data_sets.train, 'train')
    convert_to(data_sets.validation, 'validation')
    convert_to(data_sets.test, 'test')

The convert_to function fills each example into a protocol buffer, serializes it to a string, and writes it to the TFRecords file.


def convert_to(data_set, name):
    """Converts a dataset to tfrecords."""
    images = data_set.images
    labels = data_set.labels
    num_examples = data_set.num_examples
    if images.shape[0] != num_examples:
        raise ValueError('Images size %d does not match label size %d.' %
                         (images.shape[0], num_examples))
    rows = images.shape[1]   # 28
    cols = images.shape[2]   # 28
    depth = images.shape[3]  # 1: MNIST images are grayscale, a single channel

    filename = os.path.join(FLAGS.directory, name + '.tfrecords')
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(filename)
    for index in range(num_examples):
        image_raw = images[index].tostring()
        # Fill the protocol buffer: height, width, depth and label are
        # encoded as int64 features; image_raw as a bytes feature.
        example = tf.train.Example(features=tf.train.Features(feature={
            'height': _int64_feature(rows),
            'width': _int64_feature(cols),
            'depth': _int64_feature(depth),
            'label': _int64_feature(int(labels[index])),
            'image_raw': _bytes_feature(image_raw)}))
        writer.write(example.SerializeToString())  # serialize to a string
    writer.close()

The feature-encoding helper functions are as follows:

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
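As a quick sanity check (my addition, not part of the official script), the helpers can be exercised on their own; the label value and the fake image bytes below are illustrative, but the feature keys are the same ones convert_to writes:

import tensorflow as tf

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

# Build a minimal Example with the same keys convert_to uses.
example = tf.train.Example(features=tf.train.Features(feature={
    'label': _int64_feature(7),
    'image_raw': _bytes_feature(b'\x00' * 784),  # fake flattened 28x28 uint8 image
}))
print(example.features.feature['label'].int64_list.value)  # [7]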

Complete code:

import tensorflow as tf
import os
import argparse
import sys

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# 1.0 Generate the TFRecords files
from tensorflow.contrib.learn.python.learn.datasets import mnist

FLAGS = None

# The encoding helpers:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def convert_to(data_set, name):
    """Converts a dataset to tfrecords."""
    images = data_set.images
    labels = data_set.labels
    num_examples = data_set.num_examples
    if images.shape[0] != num_examples:
        raise ValueError('Images size %d does not match label size %d.' %
                         (images.shape[0], num_examples))
    rows = images.shape[1]   # 28
    cols = images.shape[2]   # 28
    depth = images.shape[3]  # 1: MNIST images are grayscale, a single channel

    filename = os.path.join(FLAGS.directory, name + '.tfrecords')
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(filename)
    for index in range(num_examples):
        image_raw = images[index].tostring()
        # Fill the protocol buffer: height, width, depth and label are
        # encoded as int64 features; image_raw as a bytes feature.
        example = tf.train.Example(features=tf.train.Features(feature={
            'height': _int64_feature(rows),
            'width': _int64_feature(cols),
            'depth': _int64_feature(depth),
            'label': _int64_feature(int(labels[index])),
            'image_raw': _bytes_feature(image_raw)}))
        writer.write(example.SerializeToString())  # serialize to a string
    writer.close()

def main(unused_argv):
    # Get the data.
    data_sets = mnist.read_data_sets(FLAGS.directory,
                                     dtype=tf.uint8,
                                     reshape=False,
                                     validation_size=FLAGS.validation_size)

    # Convert to Examples and write the result to TFRecords.
    convert_to(data_sets.train, 'train')
    convert_to(data_sets.validation, 'validation')
    convert_to(data_sets.test, 'test')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--directory',
        type=str,
        default='MNIST_data/',
        help='Directory to download data files and write the converted result'
    )
    parser.add_argument(
        '--validation_size',
        type=int,
        default=5000,
        help="""\
        Number of examples to separate from the training data for the validation
        set.\
        """
    )
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
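The script can be run directly, overriding the flags as needed; a hypothetical invocation (assuming it is saved as convert_to_records.py):

python convert_to_records.py --directory MNIST_data/ --validation_size 5000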

When the script finishes, three files appear in the directory given by --directory (MNIST_data/ by default): train.tfrecords, validation.tfrecords, and test.tfrecords.
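To spot-check what was written (a minimal sketch of my own, not part of the official example), the records can be iterated and parsed back without building a graph:

import tensorflow as tf

# Assumes train.tfrecords was produced by the script above in MNIST_data/.
path = 'MNIST_data/train.tfrecords'
for i, record in enumerate(tf.python_io.tf_record_iterator(path)):
    example = tf.train.Example()
    example.ParseFromString(record)
    label = example.features.feature['label'].int64_list.value[0]
    width = example.features.feature['width'].int64_list.value[0]
    print('record %d: label=%d, width=%d' % (i, label, width))
    if i >= 2:  # only inspect the first few records
        break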

Reading from the queue

Steps for reading TFRecords data with a queue (sketched in code right after this list):

  1. Create a tensor that reads a single example from the binary file
  2. Create a tensor that reads a random mini-batch from the binary file
  3. Feed each batch tensor to the network as its input
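In API terms the three steps look roughly like this (a sketch assuming the feature names written above; read_and_decode and the mnist module are defined in the full script below):

# 1. A tensor that reads and decodes one example from the file queue.
filename_queue = tf.train.string_input_producer(['MNIST_data/train.tfrecords'])
image, label = read_and_decode(filename_queue)

# 2. A tensor that collects a shuffled mini-batch.
images, labels = tf.train.shuffle_batch(
    [image, label], batch_size=100, num_threads=2,
    capacity=1300, min_after_dequeue=1000)

# 3. The batch tensors become the network's input.
logits = mnist.inference(images, 128, 32)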

The pattern TensorFlow uses when training on TFRecords files:

  1. Set the number of epochs when creating the filename queue.
  2. To train indefinitely, set it to loop forever.
  3. While reading, terminate as soon as the end-of-data error is caught (skeleton below).
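The skeleton of that pattern (a sketch; sess and train_op stand in for the session and training op built in the full script):

coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
    while not coord.should_stop():
        sess.run(train_op)  # string_input_producer's epoch counter eventually runs out
except tf.errors.OutOfRangeError:
    print('Done: the input queue is exhausted after the requested epochs.')
finally:
    coord.request_stop()  # ask the input threads to stop
coord.join(threads)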

Source code: tensorflow-master\tensorflow\examples\how_tos\reading_data\fully_connected_reader.py (TensorFlow 1.2.1)

Reference: https://blog.csdn.net/fontthrone/article/details/76728083


import tensorflow as tf
import os
# from tensorflow.contrib.learn.python.learn.datasets import mnist
# NOTE: the contrib mnist module above is different from the examples mnist;
# this file must use the one imported below.

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import argparse
import os.path
import sys
import time

from tensorflow.examples.tutorials.mnist import mnist

# Basic model parameters as external flags.
FLAGS = None

# This part of the code is added by FontTian and comes from the source code
# of tensorflow.examples.tutorials.mnist.
# The MNIST images are always 28x28 pixels.
# IMAGE_SIZE = 28
# IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE

# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'

def read_and_decode(filename_queue):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        # The feature keys must match the names used when writing.
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
        })

    # Convert from a scalar string tensor (whose single string has
    # length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
    # [mnist.IMAGE_PIXELS], i.e. from a 0-D string Tensor to a 1-D uint8 Tensor.
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    # print(tf.shape(image))  # Tensor("input/Shape:0", shape=(1,), dtype=int32)
    image.set_shape([mnist.IMAGE_PIXELS])
    # print(tf.shape(image))  # Tensor("input/Shape_1:0", shape=(1,), dtype=int32)

    # OPTIONAL: Could reshape into a 28x28 image and apply distortions
    # here. Since we are not applying any distortions in this
    # example, and the next step expects the image to be flattened
    # into a vector, we don't bother.

    # Convert from [0, 255] -> [-0.5, 0.5] floats.
    image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
    # print(tf.shape(image))  # Tensor("input/Shape_2:0", shape=(1,), dtype=int32)

    # Convert label from a scalar uint8 tensor to an int32 scalar.
    label = tf.cast(features['label'], tf.int32)
    # print(tf.shape(label))  # Tensor("input/Shape_3:0", shape=(0,), dtype=int32)

    return image, label

# Use tf.train.shuffle_batch to shuffle the examples produced above and
# collect them into mini-batch tensors.
def inputs(train, batch_size, num_epochs):
    """Reads input data num_epochs times.

    Args:
      train: Selects between the training (True) and validation (False) data.
      batch_size: Number of examples per returned batch.
      num_epochs: Number of times to read the input data, or 0/None to
        train forever.

    Returns:
      A tuple (images, labels), where:
      * images is a float tensor with shape [batch_size, mnist.IMAGE_PIXELS]
        in the range [-0.5, 0.5].
      * labels is an int32 tensor with shape [batch_size] with the true label,
        a number in the range [0, mnist.NUM_CLASSES).

    Note that a tf.train.QueueRunner is added to the graph, which
    must be run using e.g. tf.train.start_queue_runners().
    """
    if not num_epochs:
        num_epochs = None
    filename = os.path.join(FLAGS.train_dir,
                            TRAIN_FILE if train else VALIDATION_FILE)

    with tf.name_scope('input'):
        # tf.train.string_input_producer returns a QueueRunner that owns a
        # FIFOQueue. If the dataset is large, it can be split across several
        # files and the whole filename list passed in here.
        # Even when reading in multiple threads, share the filename queue.
        filename_queue = tf.train.string_input_producer(
            [filename], num_epochs=num_epochs)

        image, label = read_and_decode(filename_queue)

        # Shuffle the examples and collect them into batch_size batches.
        # (Internally uses a RandomShuffleQueue.)
        # We run this in two threads to avoid being a bottleneck.
        images, sparse_labels = tf.train.shuffle_batch(
            [image, label], batch_size=batch_size, num_threads=2,
            capacity=1000 + 3 * batch_size,
            # Ensures a minimum amount of shuffling of examples: keep at
            # least this many elements queued so each dequeue is well mixed.
            min_after_dequeue=1000)

        return images, sparse_labels

def run_training():
    """Train MNIST for a number of steps."""
    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Input images and labels.
        images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)

        # Build a graph that computes predictions from the inference model.
        logits = mnist.inference(images,
                                 FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add the loss calculation to the graph.
        loss = mnist.loss(logits, labels)

        # Add the training op to the graph.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # The op for initializing the variables. Note: string_input_producer
        # internally creates an epoch counter as a local variable.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Create a session for running operations in the graph.
        sess = tf.Session()

        # Initialize the variables.
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Run one step of the model. The return values are
                # the activations from the `train_op` (which is
                # discarded) and the `loss` op. To inspect the values
                # of your ops or variables, you may include them in
                # the list passed to sess.run() and the value tensors
                # will be returned in the tuple from the call.
                _, loss_value = sess.run([train_op, loss])

                duration = time.time() - start_time

                # Print an overview fairly often.
                if step % 100 == 0:
                    print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
                                                               duration))
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
        finally:
            # Ask the other threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()

def main(_):
    run_training()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.01,
        help='Initial learning rate.'
    )
    parser.add_argument(
        '--num_epochs',
        type=int,
        default=2,
        help='Number of epochs to run trainer.'
    )
    parser.add_argument(
        '--hidden1',
        type=int,
        default=128,
        help='Number of units in hidden layer 1.'
    )
    parser.add_argument(
        '--hidden2',
        type=int,
        default=32,
        help='Number of units in hidden layer 2.'
    )
    parser.add_argument(
        '--batch_size',
        type=int,
        default=100,
        help='Batch size.'
    )
    parser.add_argument(
        '--train_dir',
        type=str,
        default='/tmp/data',
        help='Directory with the training data.'
    )
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
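Because the writer script above defaults to MNIST_data/ while this reader defaults to /tmp/data, point --train_dir at wherever the .tfrecords files actually are; a hypothetical invocation (assuming the file is saved as fully_connected_reader.py):

python fully_connected_reader.py --train_dir MNIST_data/ --num_epochs 2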
