4.3.2 同步数据读取¶

4.3.2.1 定义必要库及全局参数¶

In [7]:
# codes04003_synchronous_initialization
import os
import numpy as np
import cv2
import json
import paddle

# 1. Paths to the dataset split list files
dataset_name      = 'Zodiac'
dataset_path      = 'D:\\Workspace\\ExpDatasets\\'
dataset_root_path = os.path.join(dataset_path, dataset_name)

trainval_list = os.path.join(dataset_root_path, 'trainval.txt')
train_list = os.path.join(dataset_root_path, 'train.txt')
val_list = os.path.join(dataset_root_path, 'val.txt')
test_list = os.path.join(dataset_root_path, 'test.txt')

# 2. Basic image information
input_size = [3, 227, 227]             # [C, H, W] size expected by the model
mean_value = [0.485, 0.456, 0.406]     # ImageNet channel means (RGB order)
std_value = [0.229, 0.224, 0.225]      # ImageNet channel std-devs (RGB order)

4.3.2.2 创建数据集读取器¶

In [8]:
# codes04004_synchronous_create_reader
# 0. Multi-process helper used to size the CPU worker pool
from multiprocessing import cpu_count

# 1. Per-sample preprocessing method
def transforms(sample):
    """Map a (image_path, label) pair to a normalized (1, C, H, W) float32 array.

    Relies on the module-level `input_size`, `mean_value` and `std_value`.
    """
    img, label = sample
    img = cv2.imread(img, 1)                     # decode as color: (H, W, 3), uint8, BGR
    # Resize to the model input resolution; cv2 expects dsize = (width, height)
    img = cv2.resize(img, (input_size[2], input_size[1]))
    # Convert pixel data to float32
    img = img.astype('float32')
    # Scale pixel values into [0, 1]
    img = img/255.0
    # Standardize with the ImageNet statistics.
    # NOTE(review): cv2 decodes in BGR order while the statistics are listed
    # in RGB order — kept as in the original; confirm if accuracy matters.
    img = (img - mean_value) / std_value
    # Reorder HWC -> CHW, then add the leading singleton batch dimension.
    # (Bug fix: the original applied reshape(1, 3, H, W) directly to the
    # HWC array, which scrambles channel data instead of transposing it.)
    img = img.transpose((2, 0, 1)).reshape(1, 3, input_size[1], input_size[2])

    return img, label
In [8]:
# 2. Define the reader used to fetch images in batches from a list file
def data_reader(data_list_path):
    # Generator: yield one (image_path, label) pair per line of the list file
    def sample_reader():
        with open(data_list_path, 'r') as fp:
            for record in fp.readlines():
                path, tag = record.split('\t')
                yield path, int(tag)
    # Wrap the raw reader with the user-defined mapper (`transforms`);
    # cpu_count() worker threads run the mapping in parallel, queue depth 512.
    return paddle.reader.xmap_readers(transforms, sample_reader, cpu_count(), 512)

4.3.2.3 创建数据迭代器¶

In [9]:
# codes04005_synchronous_create_iterative_reader
# Every split: shuffle inside a 256-sample buffer, then batch by 64,
# keeping the final (possibly smaller) batch.
trainval_reader = paddle.batch(
    paddle.reader.shuffle(reader=data_reader(trainval_list), buf_size=256),
    batch_size=64, drop_last=False)
train_reader = paddle.batch(
    paddle.reader.shuffle(reader=data_reader(train_list), buf_size=256),
    batch_size=64, drop_last=False)
val_reader = paddle.batch(
    paddle.reader.shuffle(reader=data_reader(val_list), buf_size=256),
    batch_size=64, drop_last=False)
test_reader = paddle.batch(
    paddle.reader.shuffle(reader=data_reader(test_list), buf_size=256),
    batch_size=64, drop_last=False)

4.3.2.4 测试数据迭代器¶

In [10]:
# codes04006_synchronous_print_reader
# Pull a single batch from the validation reader and report its sample shape.
for batch_id, batch in enumerate(val_reader()):
    if batch_id >= 1:
        break
    print('验证集batch_{}的图像形态: {}'.format(batch_id, batch[0][0].shape))
验证集batch_0的图像形态: (1, 3, 227, 227)

4.3.3 异步数据读取¶

4.3.3.1 定义必要库及全局参数¶

In [8]:
# codes04007_asynchronous_initialization
import os
import cv2
import json
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append(r'D:\WorkSpace\DeepLearning\WebsiteV2') # location of the course's custom modules
from codes.paddle import common, datasets
import paddle
import paddle.vision.transforms as T

# 1. Basic dataset information
dataset_name      = 'Zodiac'
dataset_path      = 'D:\\Workspace\\ExpDatasets\\'
dataset_root_path = os.path.join(dataset_path, dataset_name)

# 2. Basic image information (hyper-parameters shared by the dataset class)
args = {
'input_size': [3, 227, 227],             # [C, H, W] size fed to the model
'mean_value': [0.485, 0.456, 0.406],     # ImageNet channel means
'std_value': [0.229, 0.224, 0.225],      # ImageNet channel std-devs
}

4.3.3.2 定义数据集类¶

In [2]:
class DatasetZodiac(paddle.io.Dataset):
    """Twelve-Zodiac image dataset.

    Args:
        dataset_root_path: directory containing the `{mode}.txt` list files.
        mode: one of 'train', 'val', 'test', 'trainval'.
        args: dict of hyper-parameters with keys 'input_size',
            'mean_value' and 'std_value'.
        isTransforms: which preprocessing pipeline to build:
            0 - minimal reduction only (resize + tensor conversion), for
                special cases that must not alter pixel statistics;
            1 - basic reduction (resize, tensor conversion, normalization);
            2 - full pipeline: augmentation + normalization for 'train' /
                'trainval', basic reduction for 'val' / 'test'.
            (The original comments had the mode descriptions of 1 and 2
            swapped relative to the code below.)
    """

    def __init__(self, dataset_root_path, mode='test', args=None, isTransforms=2):
        assert mode in ['train', 'val', 'test', 'trainval']
        super().__init__()
        self.data = []                       # list of [image_path, label] entries
        self.args = args                     # hyper-parameter dict
        self.isTransforms = isTransforms     # preprocessing pipeline selector

        # Read the list file. Each line is either "path\tlabel" (2 fields)
        # or just "path" (1 field). Formal test lists usually carry no
        # label, in which case -1 is stored as a placeholder.
        with open(os.path.join(dataset_root_path, mode+'.txt')) as f:
            for line in f.readlines():
                info = line.strip().split('\t')
                image_path = info[0].strip()
                if len(info) == 2:
                    image_label = info[1].strip()
                elif len(info) == 1:
                    image_label = -1
                self.data.append([image_path, image_label])

        # Spatial size expected by the model: drop the channel entry when
        # input_size comes as [C, H, W]. (Bug fix: the original spread this
        # conditional expression over three lines without parentheses,
        # which is a SyntaxError.)
        inputSize = (self.args['input_size'][1:3]
                     if len(self.args['input_size']) == 3
                     else self.args['input_size'])
        if self.isTransforms == 0:
            self.transforms = T.Compose([                      # 0) minimal reduction
                T.Resize(inputSize),                           # plain resize
                T.ToTensor(),                                  # to Paddle tensor layout
            ])
        elif self.isTransforms == 1 or (self.isTransforms == 2 and mode in ['val', 'test']):
            self.transforms = T.Compose([                      # 1) basic preprocessing, no augmentation
                T.Resize(inputSize),                           # plain resize
                T.ToTensor(),                                  # to Paddle tensor layout
                T.Normalize(mean=self.args['mean_value'],      # mean/std normalization
                            std=self.args['std_value'])
            ])
        elif self.isTransforms == 2 and mode in ['train', 'trainval']:
            self.transforms = T.Compose([                      # 2) training pipeline with augmentation
                T.Resize((256, 256)),                          # upscale before random crop
                T.RandomResizedCrop(inputSize),                # random crop to model size
                T.RandomHorizontalFlip(prob=0.5),              # horizontal flip
                T.RandomRotation(15),                          # random rotation
                T.ColorJitter(brightness=0.4,                  # color jitter: brightness,
                              contrast=0.4,                    # contrast, saturation, hue
                              saturation=0.4,
                              hue=0.4),
                T.ToTensor(),                                  # to Paddle tensor layout
                T.Normalize(mean=self.args['mean_value'],      # mean/std normalization
                            std=self.args['std_value'])
            ])

    def __getitem__(self, index):
        """Return one (image_tensor, label) pair for the given index."""
        image_path, label = self.data[index]
        img = cv2.imread(image_path, 1)         # cv2: 0 = grayscale, 1 = color (BGR)
        img = self.transforms(img)              # apply the selected preprocessing
        label = np.array(label, dtype='int64')  # Paddle expects int64 labels
        return img, label

    def __len__(self):
        """Total number of samples in this split."""
        return len(self.data)

4.3.3.3 读取数据集¶

In [13]:
# codes04009_asynchronous_create_reader
# Instantiate one dataset object per split.
dataset_train    = DatasetZodiac(dataset_root_path, args=args, mode='train')
dataset_val      = DatasetZodiac(dataset_root_path, args=args, mode='val')
dataset_trainval = DatasetZodiac(dataset_root_path, args=args, mode='trainval')
dataset_test     = DatasetZodiac(dataset_root_path, args=args, mode='test')
In [14]:
# Report the number of samples per split.
# Bug fix: the last two format arguments were swapped (test before trainval),
# so the printed counts were mislabeled.
print('train:{}, val:{}, trainval:{}, test:{}'
      .format(len(dataset_train), len(dataset_val), len(dataset_trainval), len(dataset_test)))
train:7190, val:650, trainval:7840, test:660
In [15]:
# Inspect the first training sample: image tensor shape is [C, H, W]
dataset_train[0][0].shape
Out[15]:
[3, 227, 227]
In [11]:
#@save get_Zodiac_labelname_from_labelID TODO:codes04010
def get_Zodiac_labels_from_labelID(label_id):
    """Return the Zodiac dataset's text label for the given label ID."""
    # Build the ID -> name table from the class list; IDs are string keys.
    names = ['dog', 'dragon', 'goat', 'horse', 'monkey', 'ox',
             'pig', 'rabbit', 'ratt', 'rooster', 'snake', 'tiger']
    label_dict = {str(idx): name for idx, name in enumerate(names)}
    return label_dict[str(label_id)]

print('标签 0 的名称为:{}。'.format(get_Zodiac_labels_from_labelID(0)))
标签 0 的名称为:dog。
In [9]:
#@save get_Zodiac_labelname_from_labelID_by_json TODO:codes04011
def get_Zodiac_labels_from_labelID_by_json(label_id, dataset_info_path):
    """Return the Zodiac text label for `label_id`, read from dataset_info.json.

    Args:
        label_id: integer or string label ID.
        dataset_info_path: path to a JSON file containing a 'label_dict'
            mapping of string IDs to label names.

    Returns:
        The label name stored under str(label_id).
    """
    # Bug fix: the original `json.load(open(...))` never closed the file;
    # use a context manager so the handle is always released.
    with open(dataset_info_path) as f:
        dataset_info = json.load(f)
    label_dict = dataset_info['label_dict']
    return label_dict[str(label_id)]

# Resolve the dataset_info.json shipped with the dataset and look up label 0.
dataset_info_path = os.path.join(dataset_root_path, 'dataset_info.json')
print('标签 0 的名称为:{}。'
      .format(get_Zodiac_labels_from_labelID_by_json(0, dataset_info_path)))
标签 0 的名称为:dog。
In [16]:
#@save show_dataset_images TODO:codes04012
def show_dataset_images(reader, num_rows=2, num_cols=6, scale=1.5):
    """Plot the first num_rows*num_cols images of the first batch as a grid.

    Args:
        reader: iterable DataLoader yielding (images, labels) batches.
        num_rows, num_cols: grid layout of the preview.
        scale: approximate size (inches) of each grid cell.
    """
    # Grab only the first batch from the loader
    _, (image, label) = next(enumerate(reader))
    num_images = num_rows*num_cols
    # NCHW -> NHWC so matplotlib can display the images
    image = np.transpose(image[0:num_images], (0,2,3,1))
    label = label[0:num_images]

    plt.figure(figsize = (num_cols*scale, num_rows*scale+1))
    for i in range(1, num_rows+1):
        for j in range(1, num_cols+1):
            n = num_cols*(i-1)+j          # 1-based subplot index
            ax = plt.subplot(num_rows, num_cols, n)
            ax.set_title(get_Zodiac_labels_from_labelID(int(label[n-1])))
            # cv2-decoded images are BGR; convert to RGB for correct colors
            img = cv2.cvtColor(image[n-1].numpy(), cv2.COLOR_BGR2RGB)
            plt.imshow(img)

# Preview test images with the minimal pipeline (isTransforms=0), i.e.
# without normalization, so pixel values remain directly displayable.
dataset_test = DatasetZodiac(dataset_root_path, mode='test', args=args, isTransforms=0)
test_reader = paddle.io.DataLoader(dataset_test, batch_size=32, shuffle=True, drop_last=False)
show_dataset_images(test_reader, num_rows=2, num_cols=6, scale=3)

4.3.3.4 创建小批量数据迭代读取器¶

In [18]:
# codes04013_asynchronous_create_dataLoader
# Training-style splits shuffle and drop the last incomplete batch;
# evaluation splits keep sample order and keep every sample.
train_reader = paddle.io.DataLoader(
    dataset_train, batch_size=64, shuffle=True, drop_last=True)
val_reader = paddle.io.DataLoader(
    dataset_val, batch_size=64, shuffle=False, drop_last=False)
trainval_reader = paddle.io.DataLoader(
    dataset_trainval, batch_size=64, shuffle=True, drop_last=True)
test_reader = paddle.io.DataLoader(
    dataset_test, batch_size=64, shuffle=False, drop_last=False)

4.3.3.5 测试数据迭代器¶

In [19]:
# codes04014_asynchronous_print_reader
# Show the shapes of the first two validation batches, then stop.
for batch_id, (image, label) in enumerate(val_reader()):
    if batch_id >= 2:
        break
    print('验证集batch_{}的图像形态:{}, 标签形态:{}'.format(batch_id, image.shape, label.shape))
验证集batch_0的图像形态:[64, 3, 227, 227], 标签形态:[64]
验证集batch_1的图像形态:[64, 3, 227, 227], 标签形态:[64]

4.3.3.6 整合所有组件¶

In [20]:
# codes04015_load_dataset_Zodiac
def load_dataset_Zodiac(batch_size=64, transformArgs=args, isTransforms=2):
    """Load the Zodiac dataset and wrap every split in an async DataLoader.

    Args:
        batch_size: mini-batch size for all four loaders.
        transformArgs: hyper-parameter dict passed to DatasetZodiac.
        isTransforms: preprocessing pipeline selector (see DatasetZodiac).

    Returns:
        (train_reader, val_reader, trainval_reader, test_reader)
    """
    # 1. Instantiate one dataset object per split
    dataset_train = datasets.DatasetZodiac(dataset_root_path,
                                           args=transformArgs, isTransforms=isTransforms, mode='train')
    dataset_val = datasets.DatasetZodiac(dataset_root_path,
                                         args=transformArgs, isTransforms=isTransforms, mode='val')
    dataset_trainval = datasets.DatasetZodiac(dataset_root_path,
                                              args=transformArgs, isTransforms=isTransforms, mode='trainval')
    dataset_test = datasets.DatasetZodiac(dataset_root_path,
                                          args=transformArgs, isTransforms=isTransforms, mode='test')

    # 2. Create the mini-batch iterators.
    # paddle.io.DataLoader wraps each dataset in an asynchronous
    # batch-by-batch iterator.
    train_reader = paddle.io.DataLoader(dataset_train,
                                        batch_size=batch_size, shuffle=True, drop_last=True)
    val_reader = paddle.io.DataLoader(dataset_val,
                                      batch_size=batch_size, shuffle=False, drop_last=False)
    trainval_reader = paddle.io.DataLoader(dataset_trainval,
                                           batch_size=batch_size, shuffle=True, drop_last=True)
    test_reader = paddle.io.DataLoader(dataset_test,
                                       batch_size=batch_size, shuffle=False, drop_last=False)

    # Bug fix: the original returned `train_reader` twice and never
    # returned `trainval_reader`.
    return train_reader, val_reader, trainval_reader, test_reader
In [21]:
import os
import sys
sys.path.append(r'D:\WorkSpace\DeepLearning\WebsiteV2')   # location of the course's custom modules
from codes.paddle import datasets                         # load the custom module

dataset_name      = 'Zodiac'
dataset_path      = 'D:\\Workspace\\ExpDatasets\\'
dataset_root_path = os.path.join(dataset_path, dataset_name)

args={
    'input_size': (100, 100),               # spatial size fed to the model (H, W)
    'mean_value': [0.485, 0.456, 0.406],    # ImageNet channel means
    'std_value': [0.229, 0.224, 0.225]      # ImageNet channel std-devs
}

# codes04016_Zodiac_create_dataLoader
# Bug fix: the original split this statement over two lines without a
# continuation (SyntaxError) and unpacked into `train_reader` twice
# instead of `trainval_reader`.
train_reader, val_reader, trainval_reader, test_reader = \
    load_dataset_Zodiac(batch_size=32, transformArgs=args, isTransforms=2)
for i, (image, label) in enumerate(val_reader):
    if i < 2:
        print('验证集batch_{}的图像形态:{}, 标签形态:{}'.format(i, image.shape, label.shape))
    else:
        break
验证集batch_0的图像形态:[32, 3, 100, 100], 标签形态:[32]
验证集batch_1的图像形态:[32, 3, 100, 100], 标签形态:[32]

4.3.4 图像的均值消除¶

In [2]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import paddle
import paddle.vision.transforms as transforms
plt.rcParams['font.sans-serif'] = ['simhei']
plt.rcParams['font.size'] = 14

# 1. Image path settings
img_dir = r'..\..\Images\Materials\chapter04Datasets'
img_name = 'chapter04008AugmentationExampleRIO.jpg'
img_path = os.path.join(img_dir, img_name)
# ImageNet statistics scaled from [0, 1] to the [0, 255] pixel range.
# NOTE(review): values stay in RGB order while cv2.imread returns BGR;
# the original "RGB->BGR" comments were misleading — no reordering happens.
mean_value = np.array([0.485,0.456,0.406])*255.
std_value = np.array([0.229,0.224,0.225])*255.

# 2. Load the image (flag 1 = color mode, BGR layout)
img = cv2.imread(img_path, 1) 
In [ ]:
# 3. Apply the transform: per-channel mean removal in HWC layout
transform = transforms.Normalize(mean=mean_value, std=std_value, data_format='HWC')
fake_img = transform(img)
# For 3-channel images COLOR_RGB2BGR and COLOR_BGR2RGB perform the same
# channel swap, so this converts the cv2 BGR result for matplotlib display.
fake_img = cv2.cvtColor(fake_img, cv2.COLOR_RGB2BGR)
# Normalized values fall outside [0..255]; imshow clips them (warning below).
plt.imshow(fake_img)
WARNING 2022-12-30 17:59:02,756 image.py:725] Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Out[ ]:
<matplotlib.image.AxesImage at 0x22281c9a670>