mean average precision (mAP) 只是把每个类别的AP都算了一遍,再取平均值。
因此,AP是针对单个类别的,mAP是针对所有类别的。
先通过图片直观看AP计算过程,绿色为gt,红色为预测框,数字为类别,score为置信度。
mAP是分类别计算的,所以先拿出所有1类别的预测框和gt,然后计算iou,iou结果shape为[3,1],把iou小于阈值的去掉。假设下面这个预测框去掉了,上面这两个框都满足;一个gt也只能匹配一个预测框,score低的不会匹配成功。每一个预测框用match变量中的一个值记录是否匹配成功,得到的结果match应该为[1,0,0](左上角为第一个)。
然后拿出所有的2类别的预测框和gt,同样计算。。。。依次这样计算出所有label,得到了一副图片下的各类别match,再循环图片得到各个类别的match,最后通过match得到TP,FP,然后prec,recall,然后AP,然后mAP。
然后就是代码了
首先是计算map的主函数,传入dataloader(pytorch数据集)和模型,得到map
def eval(dataloader, faster_rcnn, test_num=10000):
    """Run the detector over a dataset and compute VOC-style AP / mAP.

    Args:
        dataloader: iterable yielding ``(imgs, sizes, gt_bboxes, gt_labels,
            gt_difficults)`` batches (batch size 1 is assumed by the
            ``sizes`` unpacking below).
        faster_rcnn: detector exposing ``predict(imgs, sizes)`` that returns
            per-image lists of boxes, labels and scores.
        test_num: stop after this many batches (upper bound on eval cost).

    Returns:
        dict with keys ``'ap'`` (per-class AP) and ``'map'`` (their mean),
        as produced by ``eval_detection_voc``.
    """
    pred_bboxes, pred_labels, pred_scores = [], [], []
    gt_bboxes, gt_labels, gt_difficults = [], [], []
    for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):
        # Recover the (H, W) image size for this single-image batch.
        # NOTE(review): assumes batch size 1 — confirm against the loader.
        sizes = [sizes[0][0], sizes[1][0]]
        pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
        # Accumulate ground-truth boxes, labels and difficult flags.
        gt_bboxes += list(gt_bboxes_.numpy())
        gt_labels += list(gt_labels_.numpy())
        gt_difficults += list(gt_difficults_.numpy())
        # Accumulate predicted boxes, labels and scores.
        pred_bboxes += pred_bboxes_
        pred_labels += pred_labels_
        pred_scores += pred_scores_
        if ii == test_num:
            break
    # Returns a dict with two keys: 'ap' and 'map'.
    result = eval_detection_voc(
        pred_bboxes, pred_labels, pred_scores,
        gt_bboxes, gt_labels, gt_difficults,
        use_07_metric=True)
    return result
接下来看eval_detection_voc函数,它是先计算prec和rec然后计算ap,map
def eval_detection_voc(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
        gt_difficults=None,
        iou_thresh=0.5, use_07_metric=False):
    """Evaluate detections with the PASCAL VOC protocol.

    First computes per-class precision/recall curves, then integrates
    them into per-class average precision.

    Args:
        pred_bboxes, pred_labels, pred_scores: per-image lists of
            predicted boxes, class labels and confidence scores.
        gt_bboxes, gt_labels: per-image lists of ground-truth boxes and
            class labels.
        gt_difficults: per-image "difficult" flags, or None to treat all
            ground truths as non-difficult.
        iou_thresh: IoU above which a prediction may match a ground truth.
        use_07_metric: use the 11-point interpolated AP of VOC 2007.

    Returns:
        dict with ``'ap'`` (per-class AP array) and ``'map'``
        (mean over classes, ignoring NaN entries via ``np.nanmean``).
    """
    prec, rec = calc_detection_voc_prec_rec(
        pred_bboxes, pred_labels, pred_scores,
        gt_bboxes, gt_labels, gt_difficults,
        iou_thresh=iou_thresh)
    ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric)
    return {'ap': ap, 'map': np.nanmean(ap)}
先看计算prec,rec的函数
def calc_detection_voc_prec_rec(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
        gt_difficults=None,
        iou_thresh=0.5):
    """Compute per-class precision and recall curves (VOC protocol).

    For each image and each class, predictions are greedily matched to
    ground truths by IoU (each ground truth can be matched at most once);
    match outcomes are accumulated across images and then sorted by score
    to produce cumulative TP/FP counts.

    Args:
        pred_bboxes, pred_labels, pred_scores: per-image predictions.
        gt_bboxes, gt_labels: per-image ground truths.
        gt_difficults: per-image difficult flags, or None for all-easy.
        iou_thresh: minimum IoU for a prediction to match a ground truth.

    Returns:
        (prec, rec): lists indexed by class label; entries are arrays of
        cumulative precision/recall sorted by descending score, or None
        for classes with no data.
    """
    pred_bboxes = iter(pred_bboxes)
    pred_labels = iter(pred_labels)
    pred_scores = iter(pred_scores)
    gt_bboxes = iter(gt_bboxes)
    gt_labels = iter(gt_labels)
    if gt_difficults is None:
        # No difficulty info: treat every ground truth as non-difficult.
        gt_difficults = itertools.repeat(None)
    else:
        gt_difficults = iter(gt_difficults)
    n_pos = defaultdict(int)    # per-class count of non-difficult gts
    score = defaultdict(list)   # per-class prediction scores
    match = defaultdict(list)   # per-class match flags: 1/0/-1 per prediction
    for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \
            zip(pred_bboxes, pred_labels, pred_scores,
                gt_bboxes, gt_labels, gt_difficults):
        # Loop over images first.
        if gt_difficult is None:
            gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)
        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            # Then loop over every class present in this image.
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # Sort this class's predictions by descending score.
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]
            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]
            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)
            if len(pred_bbox_l) == 0:
                # No predictions for this class: move on to the next one.
                continue
            if len(gt_bbox_l) == 0:
                # Predictions but no ground truth: all are false positives.
                match[l].extend((0,) * pred_bbox_l.shape[0])
                continue
            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1
            # IoU is only computed between same-class boxes; a prediction
            # counts as correct only when its best IoU clears the threshold.
            iou = bbox_iou(pred_bbox_l, gt_bbox_l)
            gt_index = iou.argmax(axis=1)
            # Set -1 if there is no matching ground truth.
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou
            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                # Each ground truth may be matched only once; since
                # predictions are score-sorted, the highest-scoring
                # prediction claims the gt and later ones get 0.
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        match[l].append(-1)  # difficult gt: ignored in AP
                    else:
                        match[l].append(1 if not selec[gt_idx] else 0)
                    selec[gt_idx] = True
                else:
                    match[l].append(0)
    # All six input iterables must have the same length (one entry per image).
    for iter_ in (
            pred_bboxes, pred_labels, pred_scores,
            gt_bboxes, gt_labels, gt_difficults):
        if next(iter_, None) is not None:
            raise ValueError('Length of input iterables need to be same.')
    n_fg_class = max(n_pos.keys()) + 1
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class
    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)
        # Sort all matches for this class globally by descending score.
        order = score_l.argsort()[::-1]
        match_l = match_l[order]
        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)
        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] stays None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]
    return prec, rec
假设绿色为gt,红色为预测的两个框,他们都和gt相交, 1框分数为0.8,2框分数为0.7,假设两个框iou都大于阈值了,gt与2框成功匹配后,不再与1框匹配,尽管他们label和iou都符合要求,只能匹配一个。
每一个预测框都对应于match中的一个值,假设一张图片有3个预测框,则match_l长度为3,然后再按照score进行排序,得到的表类似这样。就可以算TP和FP,然后算出prec和recall,最后得到AP了。