问题描述
我想使用 TensorFlow 的 Dataset API 读取包含可变长度列表的 TFRecords 文件。下面是我的代码。
def _int64_feature(value):
    """Wrap a sequence of integers in an int64 ``tf.train.Feature``.

    Args:
        value: the integers to store (the original note asks for a numpy
            array); it is passed unchanged to ``tf.train.Int64List``.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds ``value``.
    """
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def main1():
    """Write variable-length int64 lists to a TFRecord file and read them back.

    The rows of ``a`` are serialized as ``tf.train.Example`` records with the
    features ``'i'`` (row index) and ``'data'`` (row values), the file is
    checked with ``tf_record_iterator``, and finally it is read through the
    Dataset API.

    NOTE: the Dataset part is the failing case this question is about and is
    kept unchanged on purpose: ``'data'`` holds 4 or 5 values per record while
    ``tf.FixedLenFeature([], tf.int64)`` declares a scalar, which produces the
    reported ``InvalidArgumentError`` (value size 5 vs. output shape ``[]``).
    """
    # a is an array which contains lists of variant length.
    # dtype=object keeps every row as a plain Python list; recent NumPy
    # releases refuse to build a ragged array unless asked explicitly.
    a = np.array([[0, 54, 91, 153, 177],
                  [0, 50, 89, 147, 196],
                  [0, 38, 79, 157],
                  [0, 49, 89, 147, 177],
                  [0, 32, 73, 145]], dtype=object)

    # Write the array to a TFRecord file. The context manager closes the
    # writer even if a record fails to serialize.
    with tf.python_io.TFRecordWriter('file') as writer:
        for i, x_train in enumerate(a):  # i = 0 ~ 4
            feature = {'i': _int64_feature(np.array([i])),
                       'data': _int64_feature(x_train)}
            # Create an example protocol buffer.
            example = tf.train.Example(
                features=tf.train.Features(feature=feature))
            # Serialize to string and write to the file.
            writer.write(example.SerializeToString())

    # Check the TFRecord file.
    record_iterator = tf.python_io.tf_record_iterator(path='file')
    for string_record in record_iterator:
        example = tf.train.Example()
        example.ParseFromString(string_record)
        i = example.features.feature['i'].int64_list.value
        data = example.features.feature['data'].int64_list.value
        print(i, data)

    # Use the Dataset API to read the TFRecord file.
    def _parse_function(example_proto):
        keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                            'data': tf.FixedLenFeature([], tf.int64)}
        parsed_features = tf.parse_single_example(example_proto,
                                                  keys_to_features)
        return parsed_features['i'], parsed_features['data']

    ds = tf.data.TFRecordDataset('file')
    iterator = ds.map(_parse_function).make_one_shot_iterator()
    i, data = iterator.get_next()
    with tf.Session() as sess:
        print(i.eval())
        print(data.eval())
检查TFRecord文件
[0] [0, 54, 91, 153, 177][1] [0, 50, 89, 147, 196][2] [0, 38, 79, 157][3] [0, 49, 89, 147, 177][4] [0, 32, 73, 145]
但是当我尝试使用 Dataset API 读取 TFRecord 文件时,它显示了以下错误.
tensorflow.python.framework.errors_impl.InvalidArgumentError: Name: <unknown>, Key: data, Index: 0.  Number of int64 values != expected.  Values size: 5 but output shape: []
谢谢.
更新:我尝试使用以下代码使用 Dataset API 读取 TFRecord,但都失败了.
def _parse_function(example_proto):
    """Parse one serialized Example: 'i' as a scalar, 'data' as variable-length.

    NOTE: ``tf.VarLenFeature`` yields a ``tf.SparseTensor`` and it is returned
    here unchanged. The question reports that this attempt failed; the
    traceback it posts shows ``Dataset.map`` being unable to convert a
    ``SparseTensor`` return value to a ``Tensor`` (TensorFlow 1.4.1).
    """
    keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']


# Read the TFRecord file through the Dataset API and print the first element.
ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
或
def _parse_function(example_proto):
    """Parse one serialized Example with both features as variable-length.

    NOTE: both values returned here are ``tf.SparseTensor`` objects. Returning
    them from a ``Dataset.map`` function is what raises the reported
    ``TypeError`` (a ``SparseTensor`` cannot be converted to a ``Tensor``) on
    TensorFlow 1.4.1.
    """
    keys_to_features = {'i': tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']


# Read the TFRecord file through the Dataset API and print the first element.
ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
和错误:
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in make_tensor_proto
    str_values = [compat.as_bytes(x) for x in proto_values]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in <listcomp>
    str_values = [compat.as_bytes(x) for x in proto_values]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 65, in as_bytes
    (bytes_or_text,))
TypeError: Expected binary or unicode string, got <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x...>
在处理上述异常的过程中,又发生了一个异常:
Traceback (most recent call last):
  File "2tfrecord.py", line 126, in <module>
    main1()
  File "2tfrecord.py", line 72, in main1
    iterator = ds.map(_parse_function).make_one_shot_iterator()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 712, in map
    return MapDataset(self, map_func)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1385, in __init__
    self._map_func.add_to_graph(ops.get_default_graph())
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 486, in add_to_graph
    self._create_definition_if_needed()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 321, in _create_definition_if_needed
    self._create_definition_if_needed_impl()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 338, in _create_definition_if_needed_impl
    outputs = self._func(*inputs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1376, in tf_map_func
    flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1376, in <listcomp>
    flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 836, in convert_to_tensor
    as_ref=False)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 926, in internal_convert_to_tensor
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 229, in _constant_tensor_conversion_function
    return constant(v, dtype=dtype, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 208, in constant
    value, dtype=dtype, shape=shape, verify_shape=verify_shape))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 472, in make_tensor_proto
    "supported type." % (type(values), values))
TypeError: Failed to convert object of type <class 'tensorflow.python.framework.sparse_tensor.SparseTensor'> to Tensor. Contents: SparseTensor(indices=Tensor("ParseSingleExample/Slice_Indices_i:0", shape=(?, 1), dtype=int64), values=Tensor("ParseSingleExample/ParseExample/ParseExample:3", shape=(?,), dtype=int64), dense_shape=Tensor("ParseSingleExample/Squeeze_Shape_i:0", shape=(1,), dtype=int64)). Consider casting elements to a supported type.
Python 版本:3.5.2
Tensorflow 版本:1.4.1
经过数小时的搜索和尝试,我想我找到了答案。下面是我的代码。
def _int64_feature(value):
    """Build an int64 ``tf.train.Feature`` from a numpy array.

    Args:
        value: numpy array of integers; it is flattened before being stored.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds the flattened values.
    """
    return tf.train.Feature(
        int64_list=tf.train.Int64List(value=value.flatten()))


# Write an array to a TFRecord file.
# a is an array which contains lists of variant length.
# dtype=object keeps every row as a plain Python list; recent NumPy releases
# refuse to build a ragged array unless asked explicitly.
a = np.array([[0, 54, 91, 153, 177],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]], dtype=object)

# The context manager closes the writer even if a record fails to serialize.
with tf.python_io.TFRecordWriter('file') as writer:
    for i, row in enumerate(a):  # i = 0 ~ 4
        x_train = np.array(row)
        feature = {'i': _int64_feature(np.array([i])),
                   'data': _int64_feature(x_train)}
        # Create an example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to string and write to the file.
        writer.write(example.SerializeToString())

# Check the TFRecord file.
record_iterator = tf.python_io.tf_record_iterator(path='file')
for string_record in record_iterator:
    example = tf.train.Example()
    example.ParseFromString(string_record)
    i = example.features.feature['i'].int64_list.value
    data = example.features.feature['data'].int64_list.value
    print(i, data)

# Use the Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    """Parse one serialized Example into dense ``(i, data)`` tensors.

    Both features are read with ``tf.VarLenFeature``, which yields
    ``tf.SparseTensor`` values; they are converted with
    ``tf.sparse_tensor_to_dense`` before being returned.
    """
    keys_to_features = {'i': tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return (tf.sparse_tensor_to_dense(parsed_features['i']),
            tf.sparse_tensor_to_dense(parsed_features['data']))


# Parse the record into tensors.
dataset = dataset.map(_parse_function)
# Shuffle the dataset.
# NOTE: buffer_size=1 leaves the records in file order; use a larger buffer
# to actually shuffle.
dataset = dataset.shuffle(buffer_size=1)
# Repeat the input indefinitely.
dataset = dataset.repeat()
# Generate batches.
dataset = dataset.batch(1)
# Create a one-shot iterator.
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    # Every run() call pulls the next batch from the iterator.
    for _ in range(3):
        print(sess.run([i, data]))
有几点需要注意.
1. 这个 SO 问题很有帮助.
2. tf.VarLenFeature
会返回 SparseTensor,因此需要使用 tf.sparse_tensor_to_dense
转换为密集张量.
3. 在我的代码中,parse_single_example()
不能替换为parse_example()
,这让我烦了一天.我不知道为什么 parse_example()
不起作用.如果有人知道原因,请赐教.
I want to use TensorFlow's Dataset API to read a TFRecords file that contains variable-length lists. Here is my code.
def _int64_feature(value):
    """Wrap a sequence of integers in an int64 ``tf.train.Feature``.

    Args:
        value: the integers to store (the original note asks for a numpy
            array); it is passed unchanged to ``tf.train.Int64List``.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds ``value``.
    """
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def main1():
    """Write variable-length int64 lists to a TFRecord file and read them back.

    The rows of ``a`` are serialized as ``tf.train.Example`` records with the
    features ``'i'`` (row index) and ``'data'`` (row values), the file is
    checked with ``tf_record_iterator``, and finally it is read through the
    Dataset API.

    NOTE: the Dataset part is the failing case this question is about and is
    kept unchanged on purpose: ``'data'`` holds 4 or 5 values per record while
    ``tf.FixedLenFeature([], tf.int64)`` declares a scalar, so parsing raises
    ``InvalidArgumentError`` (value size 5 vs. output shape ``[]``).
    """
    # a is an array which contains lists of variant length.
    # dtype=object keeps every row as a plain Python list; recent NumPy
    # releases refuse to build a ragged array unless asked explicitly.
    a = np.array([[0, 54, 91, 153, 177],
                  [0, 50, 89, 147, 196],
                  [0, 38, 79, 157],
                  [0, 49, 89, 147, 177],
                  [0, 32, 73, 145]], dtype=object)

    # Write the array to a TFRecord file. The context manager closes the
    # writer even if a record fails to serialize.
    with tf.python_io.TFRecordWriter('file') as writer:
        for i, x_train in enumerate(a):  # i = 0 ~ 4
            feature = {'i': _int64_feature(np.array([i])),
                       'data': _int64_feature(x_train)}
            # Create an example protocol buffer.
            example = tf.train.Example(
                features=tf.train.Features(feature=feature))
            # Serialize to string and write to the file.
            writer.write(example.SerializeToString())

    # Check the TFRecord file.
    record_iterator = tf.python_io.tf_record_iterator(path='file')
    for string_record in record_iterator:
        example = tf.train.Example()
        example.ParseFromString(string_record)
        i = example.features.feature['i'].int64_list.value
        data = example.features.feature['data'].int64_list.value
        print(i, data)

    # Use the Dataset API to read the TFRecord file.
    def _parse_function(example_proto):
        keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                            'data': tf.FixedLenFeature([], tf.int64)}
        parsed_features = tf.parse_single_example(example_proto,
                                                  keys_to_features)
        return parsed_features['i'], parsed_features['data']

    ds = tf.data.TFRecordDataset('file')
    iterator = ds.map(_parse_function).make_one_shot_iterator()
    i, data = iterator.get_next()
    with tf.Session() as sess:
        print(i.eval())
        print(data.eval())
Check TFRecord file
[0] [0, 54, 91, 153, 177]
[1] [0, 50, 89, 147, 196]
[2] [0, 38, 79, 157]
[3] [0, 49, 89, 147, 177]
[4] [0, 32, 73, 145]
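But when I tried to read the TFRecord file with the Dataset API, it showed the following error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Name: <unknown>, Key: data, Index: 0.  Number of int64 values != expected.  Values size: 5 but output shape: []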
Thank you.
UPDATE: I tried the following two snippets to read the TFRecord file with the Dataset API, but both of them failed.
def _parse_function(example_proto):
    """Parse one serialized Example: 'i' as a scalar, 'data' as variable-length.

    NOTE: ``tf.VarLenFeature`` yields a ``tf.SparseTensor`` and it is returned
    here unchanged. The question reports that this attempt failed; the
    traceback it posts shows ``Dataset.map`` being unable to convert a
    ``SparseTensor`` return value to a ``Tensor`` (TensorFlow 1.4.1).
    """
    keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']
# Read the TFRecord file through the Dataset API: map every serialized record
# through _parse_function, then fetch and print the first parsed element.
ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    # Fetch both tensors in one run() call so they come from the same record.
    print(sess.run([i, data]))
or
def _parse_function(example_proto):
    """Parse one serialized Example with both features as variable-length.

    NOTE: both values returned here are ``tf.SparseTensor`` objects. Returning
    them from a ``Dataset.map`` function is what raises the reported
    ``TypeError`` (a ``SparseTensor`` cannot be converted to a ``Tensor``) on
    TensorFlow 1.4.1.
    """
    keys_to_features = {'i': tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']
# Read the TFRecord file through the Dataset API: map every serialized record
# through _parse_function, then fetch and print the first parsed element.
ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    # Fetch both tensors in one run() call so they come from the same record.
    print(sess.run([i, data]))
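And the error (last line of the traceback):
TypeError: Failed to convert object of type <class 'tensorflow.python.framework.sparse_tensor.SparseTensor'> to Tensor. Contents: SparseTensor(indices=Tensor("ParseSingleExample/Slice_Indices_i:0", shape=(?, 1), dtype=int64), values=Tensor("ParseSingleExample/ParseExample/ParseExample:3", shape=(?,), dtype=int64), dense_shape=Tensor("ParseSingleExample/Squeeze_Shape_i:0", shape=(1,), dtype=int64)). Consider casting elements to a supported type.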
Python version: 3.5.2
Tensorflow version: 1.4.1
After hours of searching and trying, I believe I have found the answer. Below is my code.
def _int64_feature(value):
    """Build an int64 ``tf.train.Feature`` from a numpy array.

    Args:
        value: numpy array of integers; it is flattened before being stored.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds the flattened values.
    """
    return tf.train.Feature(
        int64_list=tf.train.Int64List(value=value.flatten()))
# Write an array to a TFRecord file.
# a is an array which contains lists of variant length.
# dtype=object keeps every row as a plain Python list; recent NumPy releases
# refuse to build a ragged array unless asked explicitly.
a = np.array([[0, 54, 91, 153, 177],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]], dtype=object)

# The context manager closes the writer even if a record fails to serialize.
with tf.python_io.TFRecordWriter('file') as writer:
    for i, row in enumerate(a):  # i = 0 ~ 4
        x_train = np.array(row)
        feature = {'i': _int64_feature(np.array([i])),
                   'data': _int64_feature(x_train)}
        # Create an example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to string and write to the file.
        writer.write(example.SerializeToString())

# Check the TFRecord file.
record_iterator = tf.python_io.tf_record_iterator(path='file')
for string_record in record_iterator:
    example = tf.train.Example()
    example.ParseFromString(string_record)
    i = example.features.feature['i'].int64_list.value
    data = example.features.feature['data'].int64_list.value
    print(i, data)

# Use the Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
    """Parse one serialized Example into dense ``(i, data)`` tensors.

    Both features are read with ``tf.VarLenFeature``, which yields
    ``tf.SparseTensor`` values; they are converted with
    ``tf.sparse_tensor_to_dense`` before being returned.

    Args:
        example_proto: a single serialized ``tf.train.Example``.

    Returns:
        A tuple ``(i, data)`` of dense int64 tensors.
    """
    keys_to_features = {'i': tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return (tf.sparse_tensor_to_dense(parsed_features['i']),
            tf.sparse_tensor_to_dense(parsed_features['data']))
# Parse the record into tensors.
dataset = dataset.map(_parse_function)
# Shuffle the dataset.
# NOTE: buffer_size=1 leaves the records in file order; use a larger buffer
# to actually shuffle.
dataset = dataset.shuffle(buffer_size=1)
# Repeat the input indefinitely.
dataset = dataset.repeat()
# Generate batches.
dataset = dataset.batch(1)
# Create a one-shot iterator.
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    # Every run() call pulls the next batch from the iterator.
    for _ in range(3):
        print(sess.run([i, data]))
There are a few things to note.
1. This SO question helps a lot.
2. tf.VarLenFeature
returns a SparseTensor, so it is necessary to use tf.sparse_tensor_to_dense
to convert it to a dense tensor.
3. In my code, parse_single_example()
can't be replaced with parse_example()
, which bugged me for a day. I don't know why parse_example()
doesn't work. If anyone knows the reason, please enlighten me.
这篇关于如何使用 Dataset API 读取变体长度列表的 TFRecords 文件?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!