想要玩转faster rcnn实现自己组织数据来进行faster rcnn的训练,那么必须得弄清楚,faster rcnn的roibatchLoader送给网络的训练数据格式是什么样子的,在此处对这个问题做一个详细的研究:
我们知道基于pytorch的工作机制,网络是通过roibatchLoader中的__getitem__(self, index)这个函数来获取训练数据的,__getitem__(self, index)的源代码如下:
def __getitem__(self, index):
    """Return one sample for the dataloader.

    In training mode, returns a 4-tuple:
        padding_data     -- 3 x H x W float tensor (channel-first image, cropped
                            and/or zero-padded to the batch group's target ratio)
        im_info          -- 3-vector: (height, width, rescale factor)
        gt_boxes_padding -- (self.max_num_box, 5) tensor of (x1, y1, x2, y2, cls)
                            rows, zero-padded beyond num_boxes
        num_boxes        -- number of valid rows in gt_boxes_padding

    In testing mode, returns (data, im_info, gt_boxes, num_boxes) where
    gt_boxes is a dummy 5-vector of ones and num_boxes is 0.
    """
    if self.training:
        # ratio_index maps the loader index back to a roidb entry so images
        # grouped into one batch share a similar aspect ratio.
        index_ratio = int(self.ratio_index[index])
    else:
        index_ratio = index

    # get the anchor index for current sample index
    # here we set the anchor index to the last one
    # sample in this group
    minibatch_db = [self._roidb[index_ratio]]
    blobs = get_minibatch(minibatch_db, self._num_classes)
    # data is 1 x H x W x 3 (NHWC) at this point -- see im_list_to_blob.
    data = torch.from_numpy(blobs['data'])
    im_info = torch.from_numpy(blobs['im_info'])
    # we need to random shuffle the bounding box.
    data_height, data_width = data.size(1), data.size(2)
    if self.training:
        np.random.shuffle(blobs['gt_boxes'])
        gt_boxes = torch.from_numpy(blobs['gt_boxes'])

        ########################################################
        # padding the input image to fixed size for each group #
        ########################################################

        # NOTE1: need to cope with the case where a group cover both conditions. (done)
        # NOTE2: need to consider the situation for the tail samples. (no worry)
        # NOTE3: need to implement a parallel data loader. (no worry)
        # get the index range

        # if the image need to crop, crop to the target size.
        # ratio is the target width/height ratio assigned to this batch group.
        ratio = self.ratio_list_batch[index]

        if self._roidb[index_ratio]['need_crop']:
            if ratio < 1:
                # this means that data_width << data_height, we need to crop the
                # data_height
                min_y = int(torch.min(gt_boxes[:,1]))
                max_y = int(torch.max(gt_boxes[:,3]))
                trim_size = int(np.floor(data_width / ratio))
                if trim_size > data_height:
                    trim_size = data_height
                box_region = max_y - min_y + 1
                if min_y == 0:
                    y_s = 0
                else:
                    if (box_region-trim_size) < 0:
                        # All gt boxes fit inside one crop window: pick a random
                        # start offset that keeps every box fully visible.
                        y_s_min = max(max_y-trim_size, 0)
                        y_s_max = min(min_y, data_height-trim_size)
                        if y_s_min == y_s_max:
                            y_s = y_s_min
                        else:
                            y_s = np.random.choice(range(y_s_min, y_s_max))
                    else:
                        # Boxes span more than the crop window: start somewhere
                        # in the first half of the box region (some boxes will
                        # be clipped by the clamp below).
                        y_s_add = int((box_region-trim_size)/2)
                        if y_s_add == 0:
                            y_s = min_y
                        else:
                            y_s = np.random.choice(range(min_y, min_y+y_s_add))
                # crop the image
                data = data[:, y_s:(y_s + trim_size), :, :]

                # shift y coordiante of gt_boxes
                gt_boxes[:, 1] = gt_boxes[:, 1] - float(y_s)
                gt_boxes[:, 3] = gt_boxes[:, 3] - float(y_s)

                # update gt bounding box according the trip
                gt_boxes[:, 1].clamp_(0, trim_size - 1)
                gt_boxes[:, 3].clamp_(0, trim_size - 1)

            else:
                # this means that data_width >> data_height, we need to crop the
                # data_width
                min_x = int(torch.min(gt_boxes[:,0]))
                max_x = int(torch.max(gt_boxes[:,2]))
                trim_size = int(np.ceil(data_height * ratio))
                if trim_size > data_width:
                    trim_size = data_width
                box_region = max_x - min_x + 1
                if min_x == 0:
                    x_s = 0
                else:
                    if (box_region-trim_size) < 0:
                        # All gt boxes fit inside one crop window: pick a random
                        # start offset that keeps every box fully visible.
                        x_s_min = max(max_x-trim_size, 0)
                        x_s_max = min(min_x, data_width-trim_size)
                        if x_s_min == x_s_max:
                            x_s = x_s_min
                        else:
                            x_s = np.random.choice(range(x_s_min, x_s_max))
                    else:
                        # Boxes span more than the crop window; mirror of the
                        # vertical-crop branch above.
                        x_s_add = int((box_region-trim_size)/2)
                        if x_s_add == 0:
                            x_s = min_x
                        else:
                            x_s = np.random.choice(range(min_x, min_x+x_s_add))
                # crop the image
                data = data[:, :, x_s:(x_s + trim_size), :]

                # shift x coordiante of gt_boxes
                gt_boxes[:, 0] = gt_boxes[:, 0] - float(x_s)
                gt_boxes[:, 2] = gt_boxes[:, 2] - float(x_s)

                # update gt bounding box according the trip
                gt_boxes[:, 0].clamp_(0, trim_size - 1)
                gt_boxes[:, 2].clamp_(0, trim_size - 1)

        # based on the ratio, padding the image.
        if ratio < 1:
            # this means that data_width < data_height
            trim_size = int(np.floor(data_width / ratio))

            # NOTE(review): the padded height uses ceil while trim_size above
            # uses floor -- they differ by 1 when data_width/ratio is not an
            # integer; confirm this off-by-one is intentional.
            padding_data = torch.FloatTensor(int(np.ceil(data_width / ratio)), \
                                             data_width, 3).zero_()

            padding_data[:data_height, :, :] = data[0]
            # update im_info
            im_info[0, 0] = padding_data.size(0)
            # print("height %d %d \n" %(index, anchor_idx))
        elif ratio > 1:
            # this means that data_width > data_height
            # if the image need to crop.
            padding_data = torch.FloatTensor(data_height, \
                                             int(np.ceil(data_height * ratio)), 3).zero_()
            padding_data[:, :data_width, :] = data[0]
            im_info[0, 1] = padding_data.size(1)
        else:
            # ratio == 1: crop to a square of the shorter side; no padding.
            trim_size = min(data_height, data_width)
            padding_data = torch.FloatTensor(trim_size, trim_size, 3).zero_()
            padding_data = data[0][:trim_size, :trim_size, :]
            # gt_boxes.clamp_(0, trim_size)
            # NOTE(review): clamps to trim_size rather than trim_size - 1 as in
            # the crop branches above -- confirm the inconsistency is intended.
            gt_boxes[:, :4].clamp_(0, trim_size)
            im_info[0, 0] = trim_size
            im_info[0, 1] = trim_size

        # check the bounding box:
        # drop degenerate boxes with zero width or zero height
        not_keep = (gt_boxes[:,0] == gt_boxes[:,2]) | (gt_boxes[:,1] == gt_boxes[:,3])
        keep = torch.nonzero(not_keep == 0).view(-1)

        gt_boxes_padding = torch.FloatTensor(self.max_num_box, gt_boxes.size(1)).zero_()
        if keep.numel() != 0:
            gt_boxes = gt_boxes[keep]
            num_boxes = min(gt_boxes.size(0), self.max_num_box)
            gt_boxes_padding[:num_boxes,:] = gt_boxes[:num_boxes]
        else:
            num_boxes = 0

        # permute trim_data to adapt to downstream processing
        # HWC -> CHW for the conv backbone
        padding_data = padding_data.permute(2, 0, 1).contiguous()
        im_info = im_info.view(3)

        return padding_data, im_info, gt_boxes_padding, num_boxes
    else:
        # Testing path: no crop/pad, just NHWC -> CHW and dummy gt fields.
        data = data.permute(0, 3, 1, 2).contiguous().view(3, data_height, data_width)
        im_info = im_info.view(3)

        gt_boxes = torch.FloatTensor([1,1,1,1,1])
        num_boxes = 0

        return data, im_info, gt_boxes, num_boxes
从源代码中可以看到,返回的数据有4个部分:data, im_info, gt_boxes, num_boxes,我们需要一个一个来看:
data的来源如下:
blobs = get_minibatch(minibatch_db, self._num_classes)
data = torch.from_numpy(blobs['data'])
我们进一步的去查看get_minibatch(minibatch_db, self._num_classes)函数源码如下:
def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it.

    Returns a dict with keys 'data' (image blob), 'gt_boxes'
    ((x1, y1, x2, y2, cls) rows scaled to blob coordinates), 'im_info'
    (1x3: blob height, blob width, rescale factor) and 'img_id'.
    Only single-image batches are supported.
    """
    n_images = len(roidb)
    # Sample one random training scale per image in the batch.
    scale_choices = npr.randint(0, high=len(cfg.TRAIN.SCALES), size=n_images)
    assert cfg.TRAIN.BATCH_SIZE % n_images == 0, \
        'num_images ({}) must divide BATCH_SIZE ({})'.format(n_images, cfg.TRAIN.BATCH_SIZE)

    # Build the image blob in caffe-style layout.
    im_blob, im_scales = _get_image_blob(roidb, scale_choices)

    assert len(im_scales) == 1, "Single batch only"
    assert len(roidb) == 1, "Single batch only"
    entry = roidb[0]

    # gt boxes: (x1, y1, x2, y2, cls)
    if cfg.TRAIN.USE_ALL_GT:
        # Keep every ground-truth box (class label != background).
        chosen = np.where(entry['gt_classes'] != 0)[0]
    else:
        # COCO-style: additionally drop ''iscrowd'' boxes (marked by -1 overlaps).
        chosen = np.where((entry['gt_classes'] != 0) &
                          np.all(entry['gt_overlaps'].toarray() > -1.0, axis=1))[0]

    gt_boxes = np.empty((len(chosen), 5), dtype=np.float32)
    gt_boxes[:, 0:4] = entry['boxes'][chosen, :] * im_scales[0]
    gt_boxes[:, 4] = entry['gt_classes'][chosen]

    return {
        'data': im_blob,
        'gt_boxes': gt_boxes,
        'im_info': np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
                            dtype=np.float32),
        'img_id': entry['img_id'],
    }
从上述get_minibatch(minibatch_db, self._num_classes)的源码中我们可以看到,blobs是一个字典(也可以理解为json格式),包含的字段如下:
blobs = {'data': im_blob}
blobs['gt_boxes'] = gt_boxes
blobs['im_info']
= np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],dtype=np.float32)
blobs['img_id'] = roidb[0]['img_id']
其中的核心是:im_blob,而im_blob来自于_get_image_blob(roidb, random_scale_inds),因此进一步的去查看_get_image_blob(roidb, random_scale_inds)源码如下:
def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.

    Returns (blob, im_scales): the stacked image blob and the per-image
    rescale factors applied by prep_im_for_blob.
    """
    processed_ims = []
    im_scales = []
    for entry, scale_idx in zip(roidb, scale_inds):
        #im = cv2.imread(roidb[i]['image'])
        img = imread(entry['image'])

        # Grayscale image: replicate the single channel to get 3 channels.
        if img.ndim == 2:
            img = img[:, :, np.newaxis]
            img = np.concatenate((img, img, img), axis=2)
        # flip the channel, since the original one using cv2
        # rgb -> bgr
        img = img[:, :, ::-1]

        # Horizontally mirror pre-flagged (augmented) entries.
        if entry['flipped']:
            img = img[:, ::-1, :]

        target_size = cfg.TRAIN.SCALES[scale_idx]
        img, scale = prep_im_for_blob(img, cfg.PIXEL_MEANS, target_size,
                                      cfg.TRAIN.MAX_SIZE)
        im_scales.append(scale)
        processed_ims.append(img)

    # Create a blob to hold the input images
    return im_list_to_blob(processed_ims), im_scales
上述源码中用到的prep_im_for_blob函数和 im_list_to_blob函数的源码一并附上:
def im_list_to_blob(ims):
    """Convert a list of images into a network input.

    Assumes images are already prepared (means subtracted, BGR order, ...).

    Each image is an H x W x 3 float array; the blob is sized to the maximum
    height/width over the list, and smaller images are zero-padded at the
    bottom/right.

    Returns an N x Hmax x Wmax x 3 float32 array.
    """
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    num_images = len(ims)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    # Fix: original used Python-2-only `xrange`; the rest of this file uses
    # `range`. enumerate avoids the manual index lookup entirely.
    for i, im in enumerate(ims):
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    return blob
def prep_im_for_blob(im, pixel_means, target_size, max_size):
    """Mean subtract and scale an image for use in a blob.

    Scales the image so its shorter side equals target_size and returns
    (resized_image, scale_factor).
    """
    im = im.astype(np.float32, copy=False)
    im -= pixel_means
    # im = im[:, :, ::-1]
    h_w = im.shape[0:2]
    im_size_min = np.min(h_w)
    im_size_max = np.max(h_w)
    # Scale so the shorter side hits target_size.
    im_scale = float(target_size) / float(im_size_min)
    # Prevent the biggest axis from being more than MAX_SIZE
    # (cap intentionally disabled here; max_size is unused)
    # if np.round(im_scale * im_size_max) > max_size:
    # im_scale = float(max_size) / float(im_size_max)
    # im = imresize(im, im_scale)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)
    return im, im_scale
从该源码中可以看出,im_blob是一个形状为 N×Hmax×Wmax×3 的数组(array),其中存储的图像已经按参数cfg.TRAIN.SCALES所指定的尺度做过resize(将短边缩放到目标尺度)。
同时从上述的get_minibatch(roidb, num_classes)可以看到,blobs的数据来源还有一个重要的源头,那就是:roidb。因此,还需要把roidb的结构给弄清楚,roidb来源于roi_data_layer.roidb.combined_roidb(args.imdb_name),其源头在:roi_data_layer.roidb.prepare_roidb,它的源码如下:
def prepare_roidb(imdb):
    """Enrich the imdb's roidb by adding some derived quantities that
    are useful for training. This function precomputes the maximum
    overlap, taken over ground-truth boxes, between each ROI and
    each ground-truth box. The class with maximum overlap is also
    recorded.
    """
    roidb = imdb.roidb
    is_coco = imdb.name.startswith('coco')

    # Image sizes are expensive to read; cache them on disk for non-COCO sets
    # (COCO stores width/height in its own annotations).
    if not is_coco:
        cache_file = os.path.join(imdb.cache_path, imdb.name + '_sizes.pkl')
        if os.path.exists(cache_file):
            print('Image sizes loaded from %s' % cache_file)
            with open(cache_file, 'rb') as f:
                sizes = pickle.load(f)
        else:
            print('Extracting image sizes... (It may take long time)')
            sizes = [PIL.Image.open(imdb.image_path_at(i)).size
                     for i in range(imdb.num_images)]
            with open(cache_file, 'wb') as f:
                pickle.dump(sizes, f)
            print('Done!!')

    for i in range(len(imdb.image_index)):
        entry = roidb[i]
        entry['img_id'] = imdb.image_id_at(i)
        entry['image'] = imdb.image_path_at(i)
        if not is_coco:
            # PIL's .size is (width, height)
            entry['width'] = sizes[i][0]
            entry['height'] = sizes[i][1]

        # need gt_overlaps as a dense array for argmax
        overlaps = entry['gt_overlaps'].toarray()
        # gt class that had the max overlap (columns index classes)
        entry['max_classes'] = overlaps.argmax(axis=1)
        # max overlap with gt over classes
        entry['max_overlaps'] = overlaps.max(axis=1)

        # sanity checks
        # max overlap of 0 => class should be zero (background)
        bg_inds = np.where(entry['max_overlaps'] == 0)[0]
        assert all(entry['max_classes'][bg_inds] == 0)
        # max overlap > 0 => class should not be zero (must be a fg class)
        fg_inds = np.where(entry['max_overlaps'] > 0)[0]
        assert all(entry['max_classes'][fg_inds] != 0)
从上面的源码中,我们可以看到,roidb包含的数据结构如下:
roidb是一个列表,其中每个元素roidb[i]存放第i张图像的数据及其相关信息(注意这里每个minibatch只包含一张图像)。
roidb[i]['img_id'] = imdb.image_id_at(i) #第i张图像的image_id
roidb[i]['image'] = imdb.image_path_at(i) #第i张图像的文件路径
roidb[i]['width'] = sizes[i][0] #第i张图像的宽W(PIL的size是(宽,高))
roidb[i]['height'] = sizes[i][1] #第i张图像的高H
roidb[i]['gt_overlaps'] #这个比较复杂,待会另起一篇博客详细讲,请见下文
roidb[i]['max_classes'] = max_classes #IoU最大的框所对应的类别,下文详述
roidb[i]['max_overlaps'] = max_overlaps #IoU最大的框所对应的IoU的值
上面将roidb分析清楚之后,我们回到blobs['img_id'] = roidb[0]['img_id']这一行,可以看出blobs['img_id']存储的其实是当前样本图像的id(get_minibatch中assert了该minibatch只含一张图像)。我们回过头来总结一下blobs的结构:
blobs = {'data': im_blob} # N×Hmax×Wmax×3的数组,用于存储图像的像素矩阵数据
blobs['gt_boxes'] = gt_boxes # 图像中的grountruth框
blobs['im_info'] #
= np.array([[Hmax, Wmax, 图像的放缩尺度]],dtype=np.float32)
blobs['img_id'] = roidb[0]['img_id']#当前图像的id(该minibatch只含一张图像)。