目录
- 引言
- 第一步:读取yaml配置文件
- 第二步创建Dataset和DataLoader
- 初始化图片文件名和标注文件名集合
- 加载标注文件
- 加载图片
- 最后一步:读取数据到网络
- 总结
引言
本文以YOLOV5目标检测算法作为例子,探讨当数据集为图片集组成的文件夹时具体的数据集加载方式。
第一步:读取yaml配置文件
yolov5的.yaml配置文件主要用来配置训练集、测试集、验证集的路径以及待检测物体的种类和名称。以coco数据集的.yaml文件举例:
# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license
# COCO 2017 dataset http://cocodataset.org by Microsoft
# Example usage: python train.py --data coco.yaml
# parent
# ├── yolov5
# └── datasets
#     └── coco  ← downloads here (20.1 GB)

# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: ../datasets/coco  # dataset root dir
train: train2017.txt  # train images (relative to 'path') 118287 images
val: val2017.txt  # val images (relative to 'path') 5000 images
test: test-dev2017.txt  # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794

# Classes (class index -> class name; entries must be nested under `names`)
names:
  0: person
  1: bicycle
  2: car
  3: motorcycle
在读取数据集之前首先需要读取.yaml文件中的这些信息,在train.py中Config模块中完成这一步骤。
# Config
plots = not evolve and not opt.noplots  # create plots
cuda = device.type != 'cpu'
init_seeds(opt.seed + 1 + RANK, deterministic=True)
# Only the dataset check runs inside the distributed guard.
with torch_distributed_zero_first(LOCAL_RANK):
    data_dict = data_dict or check_dataset(data)  # check if None
# data_dict carries the resolved dataset paths read from the .yaml file.
train_path, val_path = data_dict['train'], data_dict['val']
check_dataset函数中,将训练集、验证集、测试集的路径拼凑出来,最后封装成一个Dictionary返回给data_dict变量,因此train_path, val_path就是完整的训练集和验证集目录。YOLOV5也是通过之前文章详细讲过的Dataset+DataLoader组合对数据进行加载和预处理。
第二步创建Dataset和DataLoader
初始化图片文件名和标注文件名集合
# Trainloader
# Builds the training DataLoader and its Dataset in a single call.
# The per-process batch size is the global batch size divided by WORLD_SIZE.
# `cache` is disabled (None) for training when opt.cache == 'val'; otherwise opt.cache is passed through.
train_loader, dataset = create_dataloader(train_path,
imgsz,
batch_size // WORLD_SIZE,
gs,
single_cls,
hyp=hyp,
augment=True,
cache=None if opt.cache == 'val' else opt.cache,
rect=opt.rect,
rank=LOCAL_RANK,
workers=workers,
image_weights=opt.image_weights,
quad=opt.quad,
prefix=colorstr('train: '),
shuffle=True,
seed=opt.seed)
create_dataloader函数会同时返回dataset和DataLoader。YOLOV5的数据集与MNIST数据集的不同就在于,YOLOV5对数据加载有着不同的选择方式,并通过cache参数进行控制,具体代码如下:
class LoadImagesAndLabels(Dataset):
    # YOLOv5 train_loader/val_loader, loads images and labels for training and validation
    cache_version = 0.6  # dataset labels *.cache version
    rand_interp_methods = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4]

    def __init__(self,
                 path,
                 img_size=640,
                 batch_size=16,
                 augment=False,
                 hyp=None,
                 rect=False,
                 image_weights=False,
                 cache_images=False,
                 single_cls=False,
                 stride=32,
                 pad=0.0,
                 min_items=0,
                 prefix=''):
        # Excerpt: this part of __init__ only collects image file paths; no pixel data is read here.
        self.img_size = img_size
        self.augment = augment
        self.hyp = hyp
        self.image_weights = image_weights
        self.rect = False if image_weights else rect  # image_weights forces rect off
        self.mosaic = self.augment and not self.rect  # load 4 images at a time into a mosaic (only during training)
        self.mosaic_border = [-img_size // 2, -img_size // 2]
        self.stride = stride
        self.path = path
        self.albumentations = Albumentations(size=img_size) if augment else None

        try:
            f = []  # image files
            # `path` may be a single entry or a list; each entry is a directory or a text file of paths.
            for p in path if isinstance(path, list) else [path]:
                p = Path(p)  # os-agnostic
                if p.is_dir():  # dir
                    # Recursively collect every file that has an extension under the directory.
                    f += glob.glob(str(p / '**' / '*.*'), recursive=True)
                    # f = list(p.rglob('*.*'))  # pathlib
                elif p.is_file():  # file
                    with open(p) as t:
                        t = t.read().strip().splitlines()
                        parent = str(p.parent) + os.sep
                        # Lines starting with './' are resolved relative to the list file's directory.
                        f += [x.replace('./', parent, 1) if x.startswith('./') else x for x in t]  # to global path
                        # f += [p.parent / x.lstrip(os.sep) for x in t]  # to global path (pathlib)
                else:
                    raise FileNotFoundError(f'{prefix}{p} does not exist')
            # Keep only files whose extension is a known image format, in sorted order.
            self.im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
            # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS])  # pathlib
            assert self.im_files, f'{prefix}No images found'
        except Exception as e:
            # Any failure above (including the assert) is re-raised with the dataset path and help URL.
            raise Exception(f'{prefix}Error loading data from {path}: {e}\n{HELP_URL}') from e
这段代码说明它并不是直接将图片数据加载到内存中,而是首先遍历文件夹,将文件夹下所有文件的路径添加到一个列表中,再判断文件名是否是图像格式的文件名,对文件名做一次筛选后保存到im_files变量中。这里需要讲一下的是,python中的glob.glob()函数在recursive=True时会递归地遍历整个目录下的子目录和文件,随后返回与通配符匹配的文件名。这种方式在文件数目非常大的时候延迟较大,而如果是在分布式文件系统上,这样的方式对性能的影响就更加大了。不过大部分的AI工作者对训练性能的研究仍然集中在GPU算力这方面,因为数据集的规模通常还没有大到让数据加载的耗时占比引起足够的重视。
加载标注文件
# Check cache
self.label_files = img2label_paths(self.im_files)  # labels
# The cache file sits next to the list file if `p` is a file, otherwise next to the labels directory.
cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache')
try:
    cache, exists = np.load(cache_path, allow_pickle=True).item(), True  # load dict
    assert cache['version'] == self.cache_version  # matches current version
    assert cache['hash'] == get_hash(self.label_files + self.im_files)  # identical hash
except Exception:
    # Missing, unreadable, outdated or mismatching cache: rebuild it from the label files.
    cache, exists = self.cache_labels(cache_path, prefix), False  # run cache ops
首先将图片的文件路径通过img2label_paths()函数替换成label的路径。
上面的代码实现了一种对标注信息缓存的机制,为了方便在下次使用中不用重新加载标注信息,因此对标注信息做一个缓存。cache_labels()的功能就是如此:
def cache_labels(self, path=Path('./labels.cache'), prefix=''):
    """Verify every image/label pair and cache the results on disk.

    Runs verify_image_label over (im_file, label_file, prefix) triples in a
    worker pool and collects the labels, shapes and segments of the pairs that
    passed.

    Args:
        path: Destination of the cache file.
        prefix: Text prepended to log and progress-bar messages.

    Returns:
        Dict mapping each accepted image path to [labels, shape, segments],
        plus the bookkeeping keys 'hash', 'results', 'msgs' and 'version'.
        The dict is returned even when it cannot be written to disk.
    """
    x = {}  # dict
    nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
    desc = f'{prefix}Scanning {path.parent / path.stem}...'
    with Pool(NUM_THREADS) as pool:
        pbar = tqdm(pool.imap(verify_image_label, zip(self.im_files, self.label_files, repeat(prefix))),
                    desc=desc,
                    total=len(self.im_files),
                    bar_format=TQDM_BAR_FORMAT)
        for im_file, lb, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar:
            nm += nm_f
            nf += nf_f
            ne += ne_f
            nc += nc_f
            if im_file:  # im_file is None for pairs rejected by verify_image_label
                x[im_file] = [lb, shape, segments]
            if msg:
                msgs.append(msg)
            pbar.desc = f'{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt'

    pbar.close()
    if msgs:
        LOGGER.info('\n'.join(msgs))
    if nf == 0:
        LOGGER.warning(f'{prefix}WARNING ⚠️ No labels found in {path}. {HELP_URL}')
    x['hash'] = get_hash(self.label_files + self.im_files)
    x['results'] = nf, nm, ne, nc, len(self.im_files)
    x['msgs'] = msgs  # warnings
    x['version'] = self.cache_version  # cache version
    try:
        np.save(path, x)  # save cache for next time
        path.with_suffix('.cache.npy').rename(path)  # remove .npy suffix
        LOGGER.info(f'{prefix}New cache created: {path}')
    except Exception as e:
        # Best effort: a failed write only logs a warning.
        LOGGER.warning(f'{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable: {e}')  # not writeable
    return x
cache_labels()通过Pool(NUM_THREADS)创建工作池(这里用的是multiprocessing的Pool,即进程池,与后文缓存图片时使用的ThreadPool线程池不同),由多个工作进程并行运行verify_image_label()函数。值得一提的是,verify_image_label()函数是用于对图像文件和标注文件进行检查的函数,涉及到检查那么就一定会去访问源文件,它的源代码如下:
def verify_image_label(args):
    """Verify one image-label pair.

    Args:
        args: Tuple of (im_file, lb_file, prefix).

    Returns:
        On success, the tuple (im_file, lb, shape, segments, nm, nf, ne, nc, msg),
        where nm/nf/ne/nc are 0/1 flags for label missing/found/empty and pair
        corrupt. If any check raises, a list with four leading None values,
        nc == 1 and a warning message is returned instead.
    """
    im_file, lb_file, prefix = args
    nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', []  # number (missing, found, empty, corrupt), message, segments
    try:
        # verify images
        im = Image.open(im_file)
        im.verify()  # PIL verify
        shape = exif_size(im)  # image size
        assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
        assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
        if im.format.lower() in ('jpg', 'jpeg'):
            # Second access to the file: read its last two bytes and compare with b'\xff\xd9'.
            with open(im_file, 'rb') as f:
                f.seek(-2, 2)
                if f.read() != b'\xff\xd9':  # corrupt JPEG
                    # Re-open and re-save the image in place; this overwrites the original file.
                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100)
                    msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved'

        # verify labels
        if os.path.isfile(lb_file):
            nf = 1  # label found
            with open(lb_file) as f:
                lb = [x.split() for x in f.read().strip().splitlines() if len(x)]
                if any(len(x) > 6 for x in lb):  # is segment
                    classes = np.array([x[0] for x in lb], dtype=np.float32)
                    segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb]  # (cls, xy1...)
                    lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1)  # (cls, xywh)
                lb = np.array(lb, dtype=np.float32)
            nl = len(lb)
            if nl:
                assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected'
                assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
                assert (lb[:, 1:] <= 1).all(), f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}'
                _, i = np.unique(lb, axis=0, return_index=True)
                if len(i) < nl:  # duplicate row check
                    lb = lb[i]  # remove duplicates
                    if segments:
                        segments = [segments[x] for x in i]  # keep segments aligned with the kept rows
                    msg = f'{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed'
            else:
                ne = 1  # label empty
                lb = np.zeros((0, 5), dtype=np.float32)
        else:
            nm = 1  # label missing
            lb = np.zeros((0, 5), dtype=np.float32)
        return im_file, lb, shape, segments, nm, nf, ne, nc, msg
    except Exception as e:
        # Any failure (I/O error or failed assert) marks the pair as corrupt instead of raising.
        nc = 1
        msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
        return [None, None, None, None, nm, nf, ne, nc, msg]
可以看到在verify_image_label()函数中会逐个地读取图像文件和Labels文件进行检查,值得注意的是这个函数对图像文件进行了两次读取,一次是以Image对象的方式读取,用于对图像进行检测和修复以及获取图片的shape。一次是以二进制的方法读取,用于检测图像是否损坏。也就是说每次的图片和标注信息的检查都会读取两次图片文件和一次标注文件,这里的图片检查对文件的频繁访问肯定是会对数据集读取性能造成影响的。
最后verify_image_label()函数将没有问题的图片路径和标注信息返回。cache_labels()函数获取过后做简单的处理,以{图像路径:标注信息}的方式将数据存储到磁盘上。
cache文件的读取,会重新读取之前图像数据集的检查情况,对于不合格的图像和标注,会从图像路径列表中直接删除,也就是采用cache中合法的图像路径列表。读取cache文件的代码如下:
# Drop the bookkeeping entries so that only {image path: [labels, shape, segments]} items remain.
# NOTE(review): 'results' is not removed in this excerpt — presumably it is popped earlier; confirm,
# otherwise zip(*cache.values()) below would also see that entry.
[cache.pop(k) for k in ('hash', 'version', 'msgs')] # remove items
labels, shapes, self.segments = zip(*cache.values())
nl = len(np.concatenate(labels, 0)) # number of labels
assert nl > 0 or not augment, f'{prefix}All labels empty in {cache_path}, can not start training. {HELP_URL}'
self.labels = list(labels)
self.shapes = np.array(shapes)
# From here on the image list is taken from the cache keys, replacing the earlier list.
self.im_files = list(cache.keys()) # update
self.label_files = img2label_paths(cache.keys()) # update
加载图片
MNIST数据集的数据加载方式是在初始化时候将所有的图片数据全部加载到内存中,而yolov5目标检测算法的数据集加载方式并不是固定的,可以通过cache参数选择数据的加载和存储方式。
# Fall back to no caching when 'ram' was requested but the RAM check fails.
if cache_images == 'ram' and not self.check_cache_ram(prefix=prefix):
    cache_images = False
self.ims = [None] * n  # one slot per image; an entry stays None unless that image is cached in RAM
self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files]
if cache_images:
    b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
    self.im_hw0, self.im_hw = [None] * n, [None] * n
    # 'disk' writes *.npy copies; any other truthy value loads the images into memory.
    fcn = self.cache_images_to_disk if cache_images == 'disk' else self.load_image
    results = ThreadPool(NUM_THREADS).imap(fcn, range(n))
    pbar = tqdm(enumerate(results), total=n, bar_format=TQDM_BAR_FORMAT, disable=LOCAL_RANK > 0)
    for i, x in pbar:
        if cache_images == 'disk':
            b += self.npy_files[i].stat().st_size
        else:  # 'ram'
            self.ims[i], self.im_hw0[i], self.im_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
            b += self.ims[i].nbytes
        pbar.desc = f'{prefix}Caching images ({b / gb:.1f}GB {cache_images})'
    pbar.close()
当cache=='ram'时表示希望将全部数据缓存到内存中,此时会先通过check_cache_ram()判断内存是否足够存储所有数据。如果足够,就将所有的图片文件全部加载到内存,并在加载过程中累计已缓存的数据量。
if cache_images:
    b, gb = 0, 1 << 30  # bytes of cached images, bytes per gigabytes
    self.im_hw0, self.im_hw = [None] * n, [None] * n
    # Choose the per-image worker: write an *.npy copy for 'disk', otherwise load the image into memory.
    fcn = self.cache_images_to_disk if cache_images == 'disk' else self.load_image
load_image函数的实现如下:
def load_image(self, i):
    """Load one image from dataset index 'i'.

    Returns:
        Tuple (im, original hw, resized hw). If the image is already held in
        self.ims it is returned together with the stored sizes; otherwise it is
        read from the *.npy copy when that file exists, else from the image
        file, and resized so its longer side equals self.img_size.

    Raises:
        AssertionError: If the image file cannot be read.
    """
    im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i],
    if im is None:  # not cached in RAM
        if fn.exists():  # load npy
            im = np.load(fn)
        else:  # read image
            im = cv2.imread(f)  # BGR
            assert im is not None, f'Image Not Found {f}'
        h0, w0 = im.shape[:2]  # orig hw
        r = self.img_size / max(h0, w0)  # ratio
        if r != 1:  # if sizes are not equal
            # INTER_LINEAR when augmenting or upscaling, INTER_AREA otherwise.
            interp = cv2.INTER_LINEAR if (self.augment or r > 1) else cv2.INTER_AREA
            im = cv2.resize(im, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp)
        return im, (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
    return self.ims[i], self.im_hw0[i], self.im_hw[i]  # im, hw_original, hw_resized
可以看到当图片数据不在内存中时,会查看是否之前有将数据处理过并保存成.npy文件(.npy 文件是NumPy库中用于保存单个多维数组数据的二进制文件格式。和JPEG等压缩图像文件相比,.npy 文件读取时不需要解码,因此加载速度更快;但它保存的是未压缩的数组,占用的磁盘空间通常更大。),如果有则直接加载。没有就会通过opencv库的图像加载函数将图片加载到内存中。
当cache_images='disk'时,会将图片读取并且以二进制形式保存到磁盘中。cache_images_to_disk 函数的实现如下:
def cache_images_to_disk(self, i):
    """Save image 'i' as an *.npy file for faster loading.

    Does nothing when the *.npy file already exists. Returns None.
    """
    f = self.npy_files[i]
    if not f.exists():
        # Read the image with OpenCV and write the array next to it as <name>.npy.
        np.save(f.as_posix(), cv2.imread(self.im_files[i]))
可以看到cache_images_to_disk()函数也是将图像挨个读取到内存并且以二进制形式保存回磁盘。
最后一步:读取数据到网络
当数据访问方式确定后,剩下的工作就是利用DataLoader加载数据,具体的代码分析之前的文章有讲过这里就不再赘述。值得一提的是,在不使用加权图像策略的情况下,yolov5采用的是InfiniteDataLoader。它是DataLoader的子类,与DataLoader不同的是InfiniteDataLoader 是一种无限循环的数据加载器,它会持续地提供数据样本,直到人为地停止数据加载过程(手动退出循环)。最后来看一下dataset中的__getitem__()代码吧,在之前的文章我说过,这个是数据加载的最核心的代码。
def __getitem__(self, index):
    """Return one training/validation sample.

    Returns:
        Tuple (image tensor, labels_out, image file path, shapes). labels_out
        has shape (nl, 6) with columns 1: filled from the labels and column 0
        left at zero. shapes is None for mosaic samples.
    """
    index = self.indices[index]  # linear, shuffled, or image_weights

    hyp = self.hyp
    mosaic = self.mosaic and random.random() < hyp['mosaic']
    if mosaic:
        # Load mosaic
        img, labels = self.load_mosaic(index)
        shapes = None

        # MixUp augmentation
        if random.random() < hyp['mixup']:
            img, labels = mixup(img, labels, *self.load_mosaic(random.randint(0, self.n - 1)))

    else:
        # Load image
        img, (h0, w0), (h, w) = self.load_image(index)

        # Letterbox
        shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  # final letterboxed shape
        img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment)
        shapes = (h0, w0), ((h / h0, w / w0), pad)  # for COCO mAP rescaling

        labels = self.labels[index].copy()
        if labels.size:  # normalized xywh to pixel xyxy format
            labels[:, 1:] = xywhn2xyxy(labels[:, 1:], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1])

        if self.augment:
            img, labels = random_perspective(img,
                                             labels,
                                             degrees=hyp['degrees'],
                                             translate=hyp['translate'],
                                             scale=hyp['scale'],
                                             shear=hyp['shear'],
                                             perspective=hyp['perspective'])

    nl = len(labels)  # number of labels
    if nl:
        # Back to normalized xywh, relative to the final image size.
        labels[:, 1:5] = xyxy2xywhn(labels[:, 1:5], w=img.shape[1], h=img.shape[0], clip=True, eps=1E-3)

    if self.augment:
        # Albumentations
        img, labels = self.albumentations(img, labels)
        nl = len(labels)  # update after albumentations

        # HSV color-space
        augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])

        # Flip up-down
        if random.random() < hyp['flipud']:
            img = np.flipud(img)
            if nl:
                labels[:, 2] = 1 - labels[:, 2]  # mirror the normalized y centre

        # Flip left-right
        if random.random() < hyp['fliplr']:
            img = np.fliplr(img)
            if nl:
                labels[:, 1] = 1 - labels[:, 1]  # mirror the normalized x centre

        # Cutouts
        # labels = cutout(img, labels, p=0.5)
        # nl = len(labels)  # update after cutout

    labels_out = torch.zeros((nl, 6))
    if nl:
        labels_out[:, 1:] = torch.from_numpy(labels)

    # Convert
    img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
    img = np.ascontiguousarray(img)  # the reversed view must be made contiguous before torch.from_numpy

    return torch.from_numpy(img), labels_out, self.im_files[index], shapes
可以看到__getitem__()函数依旧是使用的上面的load_image()方法。
当既没有设置将图像数据全部加载到内存也没有设置以二进制的方式重新保存到磁盘时,会逐个加载图像。加载图像的方式,都是通过文件路径队列直接读取。
总结
加载数据首先通过文件夹路径,匹配所有文件路径添加到一个队列。
首先会对队列里面的所有文件做检查,这里会读取两次图像文件和一次标注文件。并将合法的文件信息以{图像路径:标签}的方式保存到磁盘中,方便下次训练读取。
对于图像的加载当设置为全部加载到内存时,会在dataset初始化时判断能否加入,再全部加载。当设置为加载图片到磁盘时,会读取图片并将图片重新以二进制的方式存储到磁盘。
当既没有设置将图像数据全部加载到内存也没有设置以二进制的方式重新保存到磁盘时,就对每个图片逐个加载,加载图像的方式,都是通过文件路径队列直接读取。
在整个训练过程中对于数据集的读取和存储都是有很多次的,而当数据过大,本地磁盘无法全部容纳的情况下,多次的数据存储和读取肯定会对整体性能有影响。