Object detection tricks (based on detectron2)

Attempts that worked

Cropping

Because the targets are small relative to the whole image, the data is cropped into tiles. Besides the tile size, the overlap size also matters: a reasonably large overlap keeps each target fully contained in at least one tile, so objects are not cut apart by the crop boundary. Here the tile size is set to 512 and the overlap to 256.
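
A minimal sliding-window cropping sketch along those lines (the function name, file handling, and return format are assumptions, not part of the original pipeline):

import cv2

def crop_with_overlap(image, tile=512, overlap=256):
    """Yield (x0, y0, patch) windows covering the image with the given overlap."""
    h, w = image.shape[:2]
    stride = tile - overlap                      # 256-pixel step for a 512 tile
    for y0 in range(0, max(h - overlap, 1), stride):
        for x0 in range(0, max(w - overlap, 1), stride):
            y1, x1 = min(y0 + tile, h), min(x0 + tile, w)
            yield x0, y0, image[y0:y1, x0:x1]

# usage: for x0, y0, patch in crop_with_overlap(cv2.imread("scene.png")): ...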

Changing anchor size and aspect_ratio

Since the targets in the data are small, the default anchor sizes and aspect ratios in detectron2 need to be changed:

cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[35], [68], [87], [130], [149]]
cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[1.1], [1.2], [1.4], [1.8], [2.7]]

Approach: collect the area and the width/height ratio of every annotated box, then cluster each of them with k-means to obtain the values above. I compared sklearn's built-in KMeans against a hand-written k-means implementation found online; the clusters produced by sklearn's KMeans seemed to cover the overall data better and suit the anchor requirements here.
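
A rough sketch of that statistics-plus-clustering step, assuming the annotated box widths and heights have already been collected into an (N, 2) array (the loading line is a placeholder):

import numpy as np
from sklearn.cluster import KMeans

wh = np.load("box_wh.npy")                          # placeholder: (N, 2) widths and heights
areas = (wh[:, 0] * wh[:, 1]).reshape(-1, 1)
ratios = (wh[:, 1] / wh[:, 0]).reshape(-1, 1)       # height / width, detectron2's ratio convention

size_km = KMeans(n_clusters=5, random_state=0).fit(areas)    # one centre per FPN level
ratio_km = KMeans(n_clusters=5, random_state=0).fit(ratios)

print(sorted(np.sqrt(size_km.cluster_centers_.ravel())))     # candidates for ANCHOR_GENERATOR.SIZES
print(sorted(ratio_km.cluster_centers_.ravel()))              # candidates for ANCHOR_GENERATOR.ASPECT_RATIOS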

Adding TTA

Test-time augmentation (TTA) applies data augmentation to the test set: several augmented copies of each test image are created, the model makes a prediction on every copy, and those predictions are then aggregated into the final result.

cfg.TEST.AUG.ENABLED = True
cfg.TEST.AUG.MIN_SIZES = (400, 500, 512, 600, 700, 800)
cfg.TEST.AUG.MAX_SIZE = 1000
cfg.TEST.AUG.FLIP = True

Because the detector here uses the five-parameter rotated-box format, detectron2's built-in TTA needed a few changes, mainly replacing apply_box with apply_rotated_box and switching to fast_rcnn_inference_single_image_rotated:

# Imports needed to make this snippet standalone (mirroring detectron2's own
# modeling/test_time_augmentation.py, plus the rotated-box helpers used below).
import copy
from contextlib import contextmanager
from itertools import count

import numpy as np
import torch
from fvcore.transforms import HFlipTransform
from torch import nn
from torch.nn.parallel import DistributedDataParallel

from detectron2.data.detection_utils import read_image
from detectron2.modeling import DatasetMapperTTA, GeneralizedRCNN, detector_postprocess
from detectron2.modeling.roi_heads.rotated_fast_rcnn import fast_rcnn_inference_single_image_rotated
from detectron2.structures import Boxes, Instances


class GeneralizedRCNNWithTTA(nn.Module):
    """
    A GeneralizedRCNN with test-time augmentation enabled.
    Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
        """
        Args:
            cfg (CfgNode):
            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            model = model.module
        assert isinstance(
            model, GeneralizedRCNN
        ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
        self.cfg = cfg.clone()
        assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
        assert (
            not self.cfg.MODEL.LOAD_PROPOSALS
        ), "TTA for pre-computed proposals is not supported yet"

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg.TEST.AUG.MIN_SIZES, cfg.TEST.AUG.MAX_SIZE, cfg.TEST.AUG.FLIP)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    @contextmanager
    def _turn_off_roi_heads(self, attrs):
        """
        Open a context where some heads in `model.roi_heads` are temporarily turned off.
        Args:
            attrs (list[str]): the attributes in `model.roi_heads` which can be used
                to turn off a specific head, e.g., "mask_on", "keypoint_on".
        """
        roi_heads = self.model.roi_heads
        old = {}
        for attr in attrs:
            try:
                old[attr] = getattr(roi_heads, attr)
            except AttributeError:
                # The head may not be implemented in certain ROIHeads
                pass

        if len(old.keys()) == 0:
            yield
        else:
            for attr in old.keys():
                setattr(roi_heads, attr, False)
            yield
            for attr in old.keys():
                setattr(roi_heads, attr, old[attr])

    def _batch_inference(self, batched_inputs, detected_instances=None):
        """
        Execute inference on a list of inputs,
        using batch size = self.batch_size, instead of the length of the list.

        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
        """
        if detected_instances is None:
            detected_instances = [None] * len(batched_inputs)

        outputs = []
        inputs, instances = [], []
        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
            inputs.append(input)
            instances.append(instance)
            if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
                outputs.extend(
                    self.model.inference(
                        inputs,
                        instances if instances[0] is not None else None,
                        do_postprocess=False,
                    )
                )
                inputs, instances = [], []
        return outputs

    def __call__(self, batched_inputs):
        """
        Same input/output format as :meth:`GeneralizedRCNN.forward`
        """

        def _maybe_read_image(dataset_dict):
            ret = copy.copy(dataset_dict)
            if "image" not in ret:
                image = read_image(ret.pop("file_name"), self.model.input_format)
                image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
                ret["image"] = image
            if "height" not in ret and "width" not in ret:
                ret["height"] = image.shape[1]
                ret["width"] = image.shape[2]
            return ret

        return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]

    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict with "image" field being a CHW tensor

        Returns:
            dict: one output dict
        """
        orig_shape = (input["height"], input["width"])
        augmented_inputs, tfms = self._get_augmented_inputs(input)
        # Detect boxes from all augmented versions
        with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):
            # temporarily disable roi heads
            all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
        # merge all detected boxes to obtain final predictions for boxes
        merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)

        if self.cfg.MODEL.MASK_ON:
            # Use the detected boxes to obtain masks
            augmented_instances = self._rescale_detected_boxes(
                augmented_inputs, merged_instances, tfms
            )
            # run forward on the detected boxes
            outputs = self._batch_inference(augmented_inputs, augmented_instances)
            # Delete now useless variables to avoid being out of memory
            del augmented_inputs, augmented_instances
            # average the predictions
            merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
            merged_instances = detector_postprocess(merged_instances, *orig_shape)
            return {"instances": merged_instances}
        else:
            return {"instances": merged_instances}

    def _get_augmented_inputs(self, input):
        augmented_inputs = self.tta_mapper(input)
        tfms = [x.pop("transforms") for x in augmented_inputs]
        return augmented_inputs, tfms

    def _get_augmented_boxes(self, augmented_inputs, tfms):
        # 1: forward with all augmented images
        outputs = self._batch_inference(augmented_inputs)
        # 2: union the results
        all_boxes = []
        all_scores = []
        all_classes = []
        for output, tfm in zip(outputs, tfms):
            # Need to inverse the transforms on boxes, to obtain results on original image
            pred_boxes = output.pred_boxes.tensor
            original_pred_boxes = tfm.inverse().apply_rotated_box(pred_boxes.cpu().numpy())
            all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))

            all_scores.extend(output.scores)
            all_classes.extend(output.pred_classes)
        all_boxes = torch.cat(all_boxes, dim=0)
        return all_boxes, all_scores, all_classes

    def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
        # select from the union of all results
        num_boxes = len(all_boxes)
        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
        # +1 because fast_rcnn_inference expects background scores as well
        all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
        for idx, cls, score in zip(count(), all_classes, all_scores):
            all_scores_2d[idx, cls] = score

        merged_instances, _ = fast_rcnn_inference_single_image_rotated(
            all_boxes,
            all_scores_2d,
            shape_hw,
            1e-8,
            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
            self.cfg.TEST.DETECTIONS_PER_IMAGE,
        )

        return merged_instances

    def _rescale_detected_boxes(self, augmented_inputs, merged_instances, tfms):
        augmented_instances = []
        for input, tfm in zip(augmented_inputs, tfms):
            # Transform the target box to the augmented image's coordinate space
            pred_boxes = merged_instances.pred_boxes.tensor.cpu().numpy()
            pred_boxes = torch.from_numpy(tfm.apply_rotated_box(pred_boxes))

            aug_instances = Instances(
                image_size=input["image"].shape[1:3],
                # for a rotated-box model RotatedBoxes would be the natural container here,
                # but this branch is only reached when MODEL.MASK_ON is set
                pred_boxes=Boxes(pred_boxes),
                pred_classes=merged_instances.pred_classes,
                scores=merged_instances.scores,
            )
            augmented_instances.append(aug_instances)
        return augmented_instances

    def _reduce_pred_masks(self, outputs, tfms):
        # Should apply inverse transforms on masks.
        # We assume only resize & flip are used. pred_masks is a scale-invariant
        # representation, so we handle flip specially
        for output, tfm in zip(outputs, tfms):
            if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
                output.pred_masks = output.pred_masks.flip(dims=[3])
        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
        return avg_pred_masks
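
A usage sketch for wiring this wrapper into evaluation, assuming a cfg built as above and a registered rotated-box dataset; the dataset name and output directory are placeholders:

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import RotatedCOCOEvaluator, inference_on_dataset
from detectron2.modeling import build_model

model = build_model(cfg)
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
tta_model = GeneralizedRCNNWithTTA(cfg, model)                  # the modified class above
evaluator = RotatedCOCOEvaluator("my_val_set", output_dir="./tta_eval")
loader = build_detection_test_loader(cfg, "my_val_set")
print(inference_on_dataset(tta_model, loader, evaluator))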

Some hyperparameter adjustments

BASE_LR is set to 0.01, MAX_ITER to 100000, and the learning-rate decay STEPS to (50000, 75000); the training min-size sampling is also switched to 'range' over (512, 832):

cfg.SOLVER.BASE_LR = 0.01
cfg.SOLVER.MAX_ITER = 100000
cfg.SOLVER.STEPS = (50000,75000)
cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING = 'range'
cfg.INPUT.MIN_SIZE_TRAIN = (512, 832)

Wrong / ineffective / failed attempts

Dehazing

At first I did not look at the data carefully: seeing haze, I immediately tried dehazing. But the images mostly contain thick clouds, so common dehazing code did not help much, and it later turned out there was no usable data underneath the clouds anyway.

(Still, here are the two dehazing algorithms that seemed to work reasonably well in the experiments.)

Changing anchor size

Here I collected the widths and heights of the annotated boxes, clustered them with k-means, and used the square root of the resulting cluster areas as anchor sizes. This was mostly because I was not yet aware that detectron2 takes anchor sizes and aspect ratios rather than widths and heights directly, so I followed the YOLO-style anchor clustering recipes found online.
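
For reference, a sketch of that (ultimately unsuitable) YOLO-style recipe, clustering raw widths and heights and taking the square root of the centre areas; the loading line is again a placeholder:

import numpy as np
from sklearn.cluster import KMeans

wh = np.load("box_wh.npy")                    # placeholder: (N, 2) widths and heights
km = KMeans(n_clusters=5, random_state=0).fit(wh)
w_c, h_c = km.cluster_centers_[:, 0], km.cluster_centers_[:, 1]
# sqrt(w * h) of each cluster centre was treated as the anchor "size",
# but detectron2 expects sizes plus aspect ratios rather than (w, h) pairs
print(sorted(np.sqrt(w_c * h_c)))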

Data augmentation

In the end I never got data augmentation for rotated boxes to work, but here is a record of the trial and error:

  • Using detectron2's built-in data augmentation directly: its augmentation strategies do not all apply to rotated-box detection, and it cannot augment only a specific class
  • Duplicating the rare-class samples offline and augmenting them with other libraries: I could not find an augmentation library that supports rotated-box detection
  • Duplicating the rare-class samples offline, augmenting them with detectron2's built-in data augmentation, then converting the augmented data back to COCO annotations in XYWHA_ABS format for training: it was unclear where to plug in the coordinate conversion after augmentation (a possible conversion is sketched below)
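
A minimal sketch of what that coordinate conversion could look like, assuming each augmented instance is available as a 4-corner polygon in absolute pixel coordinates (the helper name and the OpenCV-based angle handling are assumptions, not something the original pipeline defines):

import cv2
import numpy as np

def polygon_to_xywha(poly):
    """poly: (4, 2) array of one rotated box's corner points, in absolute pixels.
    Returns [cx, cy, w, h, angle] in the spirit of detectron2's XYWHA_ABS."""
    (cx, cy), (w, h), a = cv2.minAreaRect(poly.astype(np.float32))
    # OpenCV reports the angle clockwise; detectron2 expects counter-clockwise
    # degrees, so the sign is flipped here -- verify against your own data.
    return [float(cx), float(cy), float(w), float(h), -float(a)]

# usage sketch: after transforms.apply_coords() on the 4 corner points,
# ann["bbox"] = polygon_to_xywha(pts); ann["bbox_mode"] = BoxMode.XYWHA_ABS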

Attempts not yet carried out

Adding Mosaic augmentation

YOLOv4's mosaic augmentation builds on CutMix: it is an improved version of CutMix that stitches four images together into a single new image. The mosaic code below is written, but for various reasons it has not yet been validated by training.

Here is the code. The input annotations are read as json, and the output is txt (I have not yet changed the output to json as well).

from PIL import Image, ImageDraw
import numpy as np
from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
import math
import os
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
import pandas as pds
import json
import cv2

def rand(a=0, b=1):
    return np.random.rand() * (b - a) + a


# Note: merge_bboxes is not called in the __main__ flow below; it is kept from the
# original 4-value mosaic recipe and is only partially adapted to 8-value boxes.
def merge_bboxes(bboxes, cutx, cuty):
    merge_bbox = []
    for i in range(len(bboxes)):
        for box in bboxes[i]:
            tmp_box = []
            x1, y1, x2, y2, x3, y3, x4, y4 = box[0], box[1], box[2], box[3], box[4], box[5], box[6], box[7]

            if i == 0:
                if np.min(box[1::2]) > cuty or np.min(box[::2]) > cutx:
                    continue
                if np.max(box[1::2]) >= cuty and np.min(box[1::2]) <= cuty:
                    box[1::2][np.argmax(box[1::2])] = cuty
                    if cuty - np.min(box[1::2]) < 5:
                        continue
                if np.max(box[::2]) >= cutx and x1 <= cutx:
                    box[::2][np.argmax(box[::2])] = cutx
                    if cutx - x1 < 5:
                        continue

            if i == 1:
                if y2 < cuty or x1 > cutx:
                    continue

                if y2 >= cuty and y1 <= cuty:
                    y1 = cuty
                    if y2 - y1 < 5:
                        continue

                if x2 >= cutx and x1 <= cutx:
                    x2 = cutx
                    if x2 - x1 < 5:
                        continue

            if i == 2:
                if y2 < cuty or x2 < cutx:
                    continue

                if y2 >= cuty and y1 <= cuty:
                    y1 = cuty
                    if y2 - y1 < 5:
                        continue

                if x2 >= cutx and x1 <= cutx:
                    x1 = cutx
                    if x2 - x1 < 5:
                        continue

            if i == 3:
                if y1 > cuty or x2 < cutx:
                    continue

                if y2 >= cuty and y1 <= cuty:
                    y2 = cuty
                    if y2 - y1 < 5:
                        continue

                if x2 >= cutx and x1 <= cutx:
                    x1 = cutx
                    if x2 - x1 < 5:
                        continue

            tmp_box.append(x1)
            tmp_box.append(y1)
            tmp_box.append(x2)
            tmp_box.append(y2)
            tmp_box.append(box[-1])
            merge_bbox.append(tmp_box)
    return merge_bbox


def get_random_data(image_file, annotation_line, input_shape):
    '''random preprocessing for real-time data augmentation'''
    h, w = input_shape
    box_datas = []
    cls_datas = []
    index = 0
    place_x = [0, 0, 256, 256]
    place_y = [0, 256, 0, 256]
    new_image = Image.new('RGB', (w, h), (128, 128, 128))

    for line in annotation_line:
        # split each line (legacy comment; annotations here are json dicts)
        # line_content = line.split(",")
        # open the image
        path = os.path.join(image_file, line['imagePath'])
        image = utils.read_image(path, format='BGR')
        r = np.random.rand(2)
        augs = T.AugmentationList([
            T.RandomFlip(prob=0.5),
            T.RandomFlip(prob=0.5, vertical=True, horizontal=False),
            T.RandomApply(T.RandomBrightness(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomSaturation(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomContrast(0.9, 1.1), prob=0.3),
            T.RandomApply(T.ColorTransform(lambda x: x * r[0] + r[1] * 10), prob=0.3)
        ])
        image, transforms = T.apply_transform_gens([augs], image)
        dx = place_x[index]
        dy = place_y[index]
        image = image[:, :, ::-1]
        new_image.paste(Image.fromarray(np.uint8(image)), (dx, dy))
        # cv2.imshow('new_image', new_image)
        # cv2.imshow('image', Image.fromarray(np.uint8(image)))
        index += 1
        iw, ih = image.shape[:2]
        box = []
        cls = []
        for shape in line['shapes']:
            bbox = []
            for point in shape['points']:
                bbox.append(point[0])
                bbox.append(point[1])
            box.append(bbox)
            cls.append(shape['label'])
        box = np.array(box)
        # box = np.array([np.array(list(map(float, box.split()[1]))) for box in line['shapes'][0:]])
        # cls = [cls.split()[-2:] for cls in line['shapes']['label']]
        if box.shape[-1] == 0:
            continue
        box = transforms.apply_coords(box.reshape(-1, 2)).clip(min=0)

        # if index == 0:
        #     image, transforms = T.apply_transform_gens([T.RandomCrop(crop_type='absolute', crop_size=(cuty, cutx))],
        #                                                image)
        #     box = transforms.apply_coords(box).clip(min=0)
        # if index == 1:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), cutx))],
        #         image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[0, :] += cutx
        # if index == 3:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=(cuty, (w - cutx)))],
        #         image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[1, :] += cuty
        # if index == 2:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), (w - cutx)))],
        #         image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[0, :] += cutx
        #     box[1, :] += cuty

        # shift boxes into the pasted tile's position in the 2x2 mosaic
        # (index has already been incremented: 1 -> top-left, 2 -> bottom-left,
        #  3 -> top-right, 4 -> bottom-right, matching place_x / place_y above)
        if index == 2:
            box[:, 1] += 256
        elif index == 3:
            box[:, 0] += 256
        elif index == 4:
            box[:, 0] += 256
            box[:, 1] += 256

        box_datas.append(box)
        cls_datas.extend(cls)

    if len(box_datas) == 0:
        return new_image, []

    box_datas = np.concatenate(box_datas, axis=0)

    # vis box
    box_line = box_datas.reshape(-1, 8)
    # for line in box_line:
    #     x1, y1, x2, y2, x3, y3, x4, y4 = line
    #     draw = ImageDraw.Draw(new_image)
    #     draw.line([(x1, y1), (x2, y2)], fill='red')
    #     draw.line([(x2, y2), (x3, y3)], fill='red')
    #     draw.line([(x3, y3), (x4, y4)], fill='red')
    #     draw.line([(x4, y4), (x1, y1)], fill='red')
    pd = pds.DataFrame(box_line)
    pd2 = pds.DataFrame(cls_datas)
    pd = pds.concat([pd, pd2], axis=1)
    return new_image, pd


def normal_(annotation_line, input_shape):
    '''random preprocessing for real-time data augmentation'''
    line = annotation_line.split()
    image = Image.open(line[0])
    box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

    iw, ih = image.size
    image = image.transpose(Image.FLIP_LEFT_RIGHT)
    box[:, [0, 2]] = iw - box[:, [2, 0]]

    return image, box

def get_json(json_path):
    info_group = []
    for root, dirs, files in os.walk(json_path):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file)) as f:
                    info = json.load(f)
                    # info = ",".join(info)
                info_group.append(info)
    return info_group


if __name__ == "__main__":
    json_path = './train'
    output_path = './train_mosaic'
    json_group = get_json(json_path)

    # step through the json list four images at a time ("-3" keeps the last full group of 4)
    for ind in range(0, len(json_group) - 3, 4):
        line = json_group[ind:ind + 4]
        image_data, box_data = get_random_data(json_path, line, [512, 512])
        if len(box_data) == 0:
            continue
        json_output_path = os.path.join(output_path, str(ind) +'.txt')
        img_output_path = os.path.join(output_path, str(ind) + '.png')
        # box_data.to_json(json_output_path)
        box_data.to_csv(json_output_path, sep=' ', index=False, header=None, mode='w')
        image_data.save(img_output_path)
        print(ind)
    print("finished")
    # img = Image.fromarray((image_data * 255).astype(np.uint8))
    # for j in range(len(box_data)):
    #     x1, y1, x2, y2, x3, y3, x4, y4 = box_data[j][0:8]
    #     draw = ImageDraw.Draw(img)
    #     draw.line([(x1, y1), (x2, y2)], fill='red')
    #     draw.line([(x2, y2), (x3, y3)], fill='red')
    #     draw.line([(x3, y3), (x4, y4)], fill='red')
    #     draw.line([(x4, y4), (x1, y1)], fill='red')
    #     # thickness = 3
    #     # left, top, right, bottom = box_data[j][0:4]
    #     # draw = ImageDraw.Draw(img)
    #     # for i in range(thickness):
    #     #     draw.rectangle([left + i, top + i, right - i, bottom - i], outline=(255, 255, 255))
    # img.show()
    # img.save("box_all.jpg")

Changing the angle offset to a coordinate offset