Code

import tensorflow as tf

import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def printdataset1(dataset):
    print(dataset.element_spec)
    # Iterate over the dataset and print every element
    for ele in dataset:
        print(ele.numpy())

def printdataset2(dataset):
    print(dataset.element_spec)
    for index, datasetele in enumerate(dataset):
        print('dataset2[' + str(index) + '] : ')
        for ele in datasetele:
            print(ele.numpy())

def printdataset3(dataset):
    print(dataset.element_spec)
    #for index, ele in enumerate(dataset):
    #    print(ele)

def printdataset(dataset):
    it = iter(dataset)
    print(next(it).numpy())

def printdatasetreduce(dataset):
    print(dataset.reduce(0, lambda state, value: state + value).numpy())


def plot_batch_sizes(ds):
    batch_sizes = [batch.shape[0] for batch in ds]
    plt.bar(range(len(batch_sizes)), batch_sizes)
    plt.xlabel('Batch number')
    plt.ylabel('Batch size')

def process_path(file_path):
    # The parent directory name (one level above the file) is used as the label
    label = tf.strings.split(file_path, os.sep)[-2]
    return tf.io.read_file(file_path), label

if __name__ == '__main__':

print("========================= 用 容器(字面值) 来构建 dataset ============================\n")
'''
#以tensor为数据源,创建一个dataset
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0, 8, 2, 1])
for elem in dataset:
#print(elem.numpy())
pass
#创建一个迭代器用来遍历dataset
#printdataset(dataset)
#print("=====================================================")

#dataset提供reduce方法来把所有成员都按照某个方式处理后形成 “一个” 结果
#printdatasetreduce(dataset)
'''
print("\n\n")

print("========================= 用 容器(随机值) 来构建 dataset ============================\n")
'''
#创建一个dataset, 结构是4行10列, 内容是 [0,100) 随机值
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4, 10], minval=0, maxval=100))
#printdataset1(dataset1)
#print("=====================================================")

#创建一个dataset数组, 一个是四行一列, 内容是 [0,1) 随机值 . 另一个是 4行100列, 内容是 [0,100).
#从输出结果可以看到, dataset2[i] 是 tf[4] 和 tf[4,100] 组成一个
dataset2 = tf.data.Dataset.from_tensor_slices(
(tf.random.uniform([4]),
tf.random.uniform([4, 100], maxval=100, dtype=tf.int32)))
#printdataset2(dataset2)
#print("=====================================================")

dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
printdataset3(dataset3)
'''
print("\n\n")

print("========================= 用 tensor 来构建 dataset ============================\n")
'''
dataset4 = tf.data.Dataset.from_tensors(
tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))
print(dataset4.element_spec)
print(dataset4.element_spec.value_type)
'''
print("\n\n")

print("========================= 用 numpy数据 来构建 dataset ============================\n")
'''
train, test = tf.keras.datasets.fashion_mnist.load_data()
images, labels = train
images = images / 255
dataset5 = tf.data.Dataset.from_tensor_slices((images, labels))
print(dataset5)
'''
print("\n\n")

print("========================= 用 TFRecord数据 来构建 dataset ============================\n")
'''
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec",
"file:///E:/__PythonWorkSpace/data/fsns-00000-of-00001")
dataset6 = tf.data.TFRecordDataset(filenames=[fsns_test_file])
print(dataset6.element_spec)
'''
print("\n\n")

print("========================= 用 txt文件 来构建 dataset ============================\n")
'''
directory_url = 'file:///E:/__PythonWorkSpace/data/'
file_names = ['cowper.txt', 'derby.txt']
file_paths = [
tf.keras.utils.get_file(file_name, directory_url + file_name)
for file_name in file_names
]
print(file_paths[0])
print(file_paths[1])
dataset = tf.data.TextLineDataset(file_paths[0])
for line in dataset.take(5):
print(line.numpy())
'''
print("\n\n")

print("========================= 用 cvs文件 来构建 dataset ============================\n")
'''
titanic_file = tf.keras.utils.get_file("train.csv", "file:///E:/__PythonWorkSpace/data/train.csv")
df = pd.read_csv(titanic_file)
df.head()
titanic_slices = tf.data.Dataset.from_tensor_slices(dict(df))
for feature_batch in titanic_slices.take(1):
for key, value in feature_batch.items():
print(" {!r:20s}: {}".format(key, value))
'''
print("\n\n")

print("========================= 用压缩包里的文件 来构建 dataset ============================\n")
'''
flowers_root = tf.keras.utils.get_file(
'flower_photos',
'file:///E:/__PythonWorkSpace/data/flower_photos.tgz',
untar=True)
flowers_root = pathlib.Path(flowers_root)
for item in flowers_root.glob("*"):
print(item.name)
list_ds = tf.data.Dataset.list_files(str(flowers_root / '*/*'))
for f in list_ds.take(5):
print(f.numpy())
labeled_ds = list_ds.map(process_path)
for image_raw, label_text in labeled_ds.take(1):
print(repr(image_raw.numpy()[:100]))
print()
print(label_text.numpy())
'''
print("\n\n")

print("========================= 批量处理(扩展和重组) dataset ============================\n")
'''
titanic_file = tf.keras.utils.get_file("train.csv", "file:///E:/__PythonWorkSpace/data/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)
titanic_batches = titanic_lines.repeat(3).batch(128) #先把titanic_lines里面的数据复制3份,然后把4份数据首尾相接组成新的数据集,在通过batch把新的数据既按照128为一组进行分组
#titanic_batches = titanic_lines.batch(128).repeat(3) #也可以先分组再repeat
plot_batch_sizes(titanic_batches)
'''
print("\n\n")

print("========================= 批量处理(打乱) dataset ============================\n")
titanic_file = tf.keras.utils.get_file("train.csv", "file:///E:/__PythonWorkSpace/data/train.csv")
lines = tf.data.TextLineDataset(titanic_file)
counter = tf.data.experimental.Counter()
print(lines)
print(counter)
dataset = tf.data.Dataset.zip((counter, lines))
print(dataset.element_spec)
dataset = dataset.shuffle(buffer_size=100) #从dataset
dataset = dataset.batch(20)

n, line_batch = next(iter(dataset))
print(n.numpy())

dataset = tf.data.Dataset.zip((counter, lines))
shuffled = dataset.shuffle(buffer_size=100).batch(10).repeat(2) #先随机取100个数据,再按照10个分组,最后每个分组都重复2遍

print("Here are the item ID's near the epoch boundary:\n")
for n, line_batch in shuffled.skip(60).take(5):
print(n.numpy())

repeat_shuffle = [n.numpy().mean() for n, line_batch in shuffled]

plt.plot(repeat_shuffle, label="shuffle().repeat()")
plt.plot(repeat_shuffle, label="repeat().shuffle()")
plt.ylabel("Mean item ID")
plt.legend()
print("\n\n")

print("========================= 批量处理(打乱) dataset ============================\n")