通过DarkLabel构建DeepSort标注格式和ReID数据集

原创

pprp 2021-12-29 17:18:30 博主文章分类：深度学习专栏 ©著作权

文章标签 darklabel 标注软件 voc reid MOT 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者pprp的原创作品，请联系作者获取转载授权，否则将追究法律责任

上一篇文章推荐了DarkLabel标注软件，承诺会附上配套的代码，本文主要分享的是格式转换的几个脚本。

先附上脚本地址： https://github.com/pprp/SimpleCVReproduction/tree/master/DarkLabel" target="_blank">https://github.com/pprp/SimpleCVReproduction/tree/master/DarkLabel

先来了解一下为何DarkLabel能生成这么多格式的数据集，来看看DarkLabel的格式：

frame(从0开始计), 数量, id(从0开始), box(x1,y1,x2,y2), class=null
0,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
1,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
2,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null

每一帧，每张图片上的目标都可以提取到，并且每个目标有bbox、分配了一个ID、class

这些信息都可以满足目标检测、ReID、跟踪数据集。

ps：说明一下，以下脚本都是笔者自己写的，专用于单类的检测、跟踪、重识别的代码，如果有需要多类的，还需要自己修改多类部分的代码。

1. DarkLabel转Detection

这里笔者写了一个脚本转成VOC2007中的xml格式的标注，代码如下：

import cv2
import os
import shutil
import tqdm
import sys

root_path = r"I:\Dataset\VideoAnnotation"

def print_flush(str):
    print(str, end='\r')
    sys.stdout.flush()

def genXML(xml_dir, outname, bboxes, width, height):
    xml_file = open((xml_dir + '/' + outname + '.xml'), 'w')
    xml_file.write('<annotation>\n')
    xml_file.write('    <folder>VOC2007</folder>\n')
    xml_file.write('    <filename>' + outname + '.jpg' + '</filename>\n')
    xml_file.write('    <size>\n')
    xml_file.write('        <width>' + str(width) + '</width>\n')
    xml_file.write('        <height>' + str(height) + '</height>\n')
    xml_file.write('        <depth>3</depth>\n')
    xml_file.write('    </size>\n')

    for bbox in bboxes:
        x1, y1, x2, y2 = bbox
        xml_file.write('    <object>\n')
        xml_file.write('        <name>' + 'cow' + '</name>\n')
        xml_file.write('        <pose>Unspecified</pose>\n')
        xml_file.write('        <truncated>0</truncated>\n')
        xml_file.write('        <difficult>0</difficult>\n')
        xml_file.write('        <bndbox>\n')
        xml_file.write('            <xmin>' + str(x1) + '</xmin>\n')
        xml_file.write('            <ymin>' + str(y1) + '</ymin>\n')
        xml_file.write('            <xmax>' + str(x2) + '</xmax>\n')
        xml_file.write('            <ymax>' + str(y2) + '</ymax>\n')
        xml_file.write('        </bndbox>\n')
        xml_file.write('    </object>\n')

    xml_file.write('</annotation>')


def gen_empty_xml(xml_dir, outname, width, height):
    xml_file = open((xml_dir + '/' + outname + '.xml'), 'w')
    xml_file.write('<annotation>\n')
    xml_file.write('    <folder>VOC2007</folder>\n')
    xml_file.write('    <filename>' + outname + '.png' + '</filename>\n')
    xml_file.write('    <size>\n')
    xml_file.write('        <width>' + str(width) + '</width>\n')
    xml_file.write('        <height>' + str(height) + '</height>\n')
    xml_file.write('        <depth>3</depth>\n')
    xml_file.write('    </size>\n')
    xml_file.write('</annotation>')


def getJPG(src_video_file, tmp_video_frame_save_dir):
    # gen jpg from video
    cap = cv2.VideoCapture(src_video_file)
    if not os.path.exists(tmp_video_frame_save_dir):
        os.makedirs(tmp_video_frame_save_dir)

    frame_cnt = 0
    isrun, frame = cap.read()
    width, height = frame.shape[1], frame.shape[0]

    while (isrun):
        save_name = append_name + "_" + str(frame_cnt) + ".jpg"
        cv2.imwrite(os.path.join(tmp_video_frame_save_dir, save_name), frame)
        frame_cnt += 1
        print_flush("Extracting frame :%d" % frame_cnt)
        isrun, frame = cap.read()
    return width, height

def delTmpFrame(tmp_video_frame_save_dir):
    if os.path.exists(tmp_video_frame_save_dir):
        shutil.rmtree(tmp_video_frame_save_dir)
    print('delete %s success!' % tmp_video_frame_save_dir)

def assign_jpgAndAnnot(src_annot_file, dst_annot_dir, dst_jpg_dir, tmp_video_frame_save_dir, width, height):
    # get coords from annotations files
    txt_file = open(src_annot_file, "r")

    content = txt_file.readlines()

    for line in content:
        item = line[:-1]
        items = item.split(',')
        frame_id, num_of_cow = items[0], items[1]
        print_flush("Assign jpg and annotion : %s" % frame_id)

        bboxes = []

        for i in range(int(num_of_cow)):
            obj_id = items[1 + i * 6 + 1]
            obj_x1, obj_y1 = int(items[1 + i * 6 + 2]), int(items[1 + i * 6 + 3])
            obj_x2, obj_y2 = int(items[1 + i * 6 + 4]), int(items[1 + i * 6 + 5])
            # preprocess the coords
            obj_x1 = max(1, obj_x1)
            obj_y1 = max(1, obj_y1)
            obj_x2 = min(width, obj_x2)
            obj_y2 = min(height, obj_y2)
            bboxes.append([obj_x1, obj_y1, obj_x2, obj_y2])

        genXML(dst_annot_dir, append_name + "_" + str(frame_id), bboxes, width,
            height)
        shutil.copy(
            os.path.join(tmp_video_frame_save_dir,
                        append_name + "_" + str(frame_id) + ".jpg"),
            os.path.join(dst_jpg_dir, append_name + "_" + str(frame_id) + ".jpg"))

    txt_file.close()


if __name__ == "__main__":
    append_names = ["cutout%d" % i for i in range(19, 66)]

    for append_name in append_names:
        print("processing",append_name)
        src_video_file = os.path.join(root_path, append_name + ".mp4")

        if not os.path.exists(src_video_file):
            continue

        src_annot_file = os.path.join(root_path, append_name + "_gt.txt")

        dst_annot_dir = os.path.join(root_path, "Annotations")
        dst_jpg_dir = os.path.join(root_path, "JPEGImages")

        tmp_video_frame_save_dir = os.path.join(root_path, append_name)

        width, height = getJPG(src_video_file, tmp_video_frame_save_dir)

        assign_jpgAndAnnot(src_annot_file, dst_annot_dir, dst_jpg_dir, tmp_video_frame_save_dir, width, height)

        delTmpFrame(tmp_video_frame_save_dir)

如果想转成U版yolo需要的格式可以点击 https://github.com/pprp/voc2007_for_yolo_torch" target="_blank">https://github.com/pprp/voc2007_for_yolo_torch 使用这里的脚本。

2. DarkLabel转ReID数据集

ReID数据集其实与分类数据集很相似，最出名的是Market1501数据集，对这个数据集不熟悉的可以先百度一下。简单来说ReID数据集只比分类中多了query, gallery的概念，也很简单。转换代码如下：

import os
import shutil
import cv2
import numpy as np
import glob
import sys
import random
"""[summary]
根据视频和darklabel得到的标注文件
"""

def preprocessVideo(video_path):
    '''
    预处理，将视频变为一帧一帧的图片
    '''
    if not os.path.exists(video_frame_save_path):
        os.mkdir(video_frame_save_path)

    vidcap = cv2.VideoCapture(video_path)
    (cap, frame) = vidcap.read()

    height = frame.shape[0]
    width = frame.shape[1]

    cnt_frame = 0

    while (cap):
        cv2.imwrite(
            os.path.join(video_frame_save_path, "frame_%d.jpg" % (cnt_frame)),
            frame)
        cnt_frame += 1
        print(cnt_frame, end="\r")
        sys.stdout.flush()
        (cap, frame) = vidcap.read()
    vidcap.release()
    return width, height


def postprocess(video_frame_save_path):
    '''
    后处理，删除无用的文件夹
    '''
    if os.path.exists(video_frame_save_path):
        shutil.rmtree(video_frame_save_path)


def extractVideoImgs(frame, video_frame_save_path, coords):
    '''
    抠图
    '''
    x1, y1, x2, y2 = coords
    # get image from save path
    img = cv2.imread(
        os.path.join(video_frame_save_path, "frame_%d.jpg" % (frame)))

    if img is None:
        return None
    # crop
    save_img = img[y1:y2, x1:x2]
    return save_img


def bbox_ious(box1, box2):
    b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
    b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]

    # Intersection area
    inter_area = (min(b1_x2, b2_x2) - max(b1_x1, b2_x1)) * \
                 (min(b1_y2, b2_y2) - max(b1_y1, b2_y1))

    # Union Area
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1
    union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area

    return inter_area / union_area


def bbox_iou(box1, box2):
    # format box1: x1,y1,x2,y2
    # format box2: a1,b1,a2,b2
    x1, y1, x2, y2 = box1
    a1, b1, a2, b2 = box2

    i_left_top_x = max(a1, x1)
    i_left_top_y = max(b1, y1)

    i_bottom_right_x = min(a2, x2)
    i_bottom_right_y = min(b2, y2)

    intersection = (i_bottom_right_x - i_left_top_x) * (i_bottom_right_y -
                                                        i_left_top_y)

    area_two_box = (x2 - x1) * (y2 - y1) + (a2 - a1) * (b2 - b1)

    return intersection * 1.0 / (area_two_box - intersection)


def restrictCoords(width, height, x, y):
    x = max(1, x)
    y = max(1, y)
    x = min(x, width)
    y = min(y, height)
    return x, y


if __name__ == "__main__":

    total_cow_num = 0

    root_dir = "./data/videoAndLabel"
    reid_dst_path = "./data/reid"
    done_dir = "./data/done"

    txt_list = glob.glob(os.path.join(root_dir, "*.txt"))
    video_list = glob.glob(os.path.join(root_dir, "*.mp4"))

    for i in range(len(txt_list)):
        txt_path = txt_list[i]
        video_path = video_list[i]

        print("processing:", video_path)

        if not os.path.exists(txt_path):
            continue

        video_name = os.path.basename(video_path).split('.')[0]
        video_frame_save_path = os.path.join(os.path.dirname(video_path),
                                             video_name)

        f_txt = open(txt_path, "r")

        width, height = preprocessVideo(video_path)

        print("done")

        # video_cow_id = video_name + str(total_cow_num)

        for line in f_txt.readlines():
            bboxes = line.split(',')
            ids = []
            frame_id = int(bboxes[0])

            box_list = []

            if frame_id % 30 != 0:
                continue

            num_object = int(bboxes[1])
            for num_obj in range(num_object):
                # obj = 0, 1, 2
                obj_id = bboxes[1 + (num_obj) * 6 + 1]
                obj_x1 = int(bboxes[1 + (num_obj) * 6 + 2])
                obj_y1 = int(bboxes[1 + (num_obj) * 6 + 3])
                obj_x2 = int(bboxes[1 + (num_obj) * 6 + 4])
                obj_y2 = int(bboxes[1 + (num_obj) * 6 + 5])

                box_list.append([obj_x1, obj_y1, obj_x2, obj_y2])
                # process coord
                obj_x1, obj_y1 = restrictCoords(width, height, obj_x1, obj_y1)
                obj_x2, obj_y2 = restrictCoords(width, height, obj_x2, obj_y2)

                specific_object_name = video_name + "_" + obj_id

                # mkdir for reid dataset
                id_dir = os.path.join(reid_dst_path, specific_object_name)

                if not os.path.exists(id_dir):
                    os.makedirs(id_dir)

                # save pic
                img = extractVideoImgs(frame_id, video_frame_save_path,
                                       (obj_x1, obj_y1, obj_x2, obj_y2))
                print(type(img))

                if img is None or img.shape[0] == 0 or img.shape[1] == 0:
                    print(specific_object_name + " is empty")
                    continue

                # print(frame_id)
                img = cv2.resize(img, (256, 256))

                normalizedImg = np.zeros((256, 256))
                img = cv2.normalize(img, normalizedImg, 0, 255,
                                    cv2.NORM_MINMAX)

                cv2.imwrite(
                    os.path.join(id_dir, "%s_%d.jpg") %
                    (specific_object_name, frame_id), img)

            max_w = width - 256
            max_h = height - 256

            # 随机选取左上角坐标
            select_x = random.randint(1, max_w)
            select_y = random.randint(1, max_h)
            rand_box = [select_x, select_y, select_x + 256, select_y + 256]

            # 背景图保存位置
            bg_dir = os.path.join(reid_dst_path, "bg")
            if not os.path.exists(bg_dir):
                os.makedirs(bg_dir)

            iou_list = []

            for idx in range(len(box_list)):
                cow_box = box_list[idx]
                iou = bbox_iou(cow_box, rand_box)
                iou_list.append(iou)

            # print("iou list:" , iou_list)

            if np.array(iou_list).all() < 0:
                img = extractVideoImgs(frame_id, video_frame_save_path,
                                       rand_box)
                if img is None:
                    print(specific_object_name + "is empty")
                    continue
                normalizedImg = np.zeros((256, 256))
                img = cv2.normalize(img, normalizedImg, 0, 255,
                                    cv2.NORM_MINMAX)
                cv2.imwrite(
                    os.path.join(bg_dir, "bg_%s_%d.jpg") %
                    (video_name, frame_id), img)

        f_txt.close()
        postprocess(video_frame_save_path)
        shutil.move(video_path, done_dir)
        shutil.move(txt_path, done_dir)

数据集配套代码在： https://github.com/pprp/reid_for_deepsort" target="_blank">https://github.com/pprp/reid_for_deepsort

3. DarkLabel转MOT16格式

其实DarkLabel标注得到信息和MOT16是几乎一致的，只不过需要转化一下，脚本如下：

import os
'''
gt.txt:
---------
frame(从1开始计), id, box(left top w, h),ignore=1(不忽略), class=1(从1开始),覆盖=1), 
1,1,1363,569,103,241,1,1,0.86014
2,1,1362,568,103,241,1,1,0.86173
3,1,1362,568,103,241,1,1,0.86173
4,1,1362,568,103,241,1,1,0.86173

cutout24_gt.txt
---
frame(从0开始计), 数量, id(从0开始), box(x1,y1,x2,y2), class=null
0,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
1,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
2,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
'''


def xyxy2xywh(x):
    # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h]
    # y = torch.zeros_like(x) if isinstance(x,
    #                                       torch.Tensor) else np.zeros_like(x)
    y = [0, 0, 0, 0]

    y[0] = (x[0] + x[2]) / 2
    y[1] = (x[1] + x[3]) / 2
    y[2] = x[2] - x[0]
    y[3] = x[3] - x[1]
    return y

def process_darklabel(video_label_path, mot_label_path):
    f = open(video_label_path, "r")
    f_o = open(mot_label_path, "w")

    contents = f.readlines()

    for line in contents:
        line = line[:-1]
        num_list = [num for num in line.split(',')]

        frame_id = int(num_list[0]) + 1
        total_num = int(num_list[1])

        base = 2

        for i in range(total_num):

            print(base, base + i * 6, base + i * 6 + 4)

            _id = int(num_list[base + i * 6]) + 1
            _box_x1 = int(num_list[base + i * 6 + 1])
            _box_y1 = int(num_list[base + i * 6 + 2])
            _box_x2 = int(num_list[base + i * 6 + 3])
            _box_y2 = int(num_list[base + i * 6 + 4])

            y = xyxy2xywh([_box_x1, _box_y1, _box_x2, _box_y2])

            write_line = "%d,%d,%d,%d,%d,%d,1,1,1\n" % (frame_id, _id, y[0],
                                                        y[1], y[2], y[3])

            f_o.write(write_line)

    f.close()
    f_o.close()

if __name__ == "__main__":
    root_dir = "./data/videosample"

    for item in os.listdir(root_dir):
        full_path = os.path.join(root_dir, item)

        video_path = os.path.join(full_path, item+".mp4")
        video_label_path = os.path.join(full_path, item + "_gt.txt")
        mot_label_path = os.path.join(full_path, "gt.txt")
        process_darklabel(video_label_path, mot_label_path)

以上就是DarkLabel转各种数据集格式的脚本了，DarkLabel还是非常方便的，可以快速构建自己的数据集。通常两分钟的视频可以生成2880张之多的图片，但是在目标检测中并不推荐将所有的图片都作为训练集，因为前后帧之间差距太小了，几乎是一模一样的。这种数据会导致训练速度很慢、泛化能力变差。

有两种解决方案：

可以选择隔几帧选取一帧作为数据集，比如每隔10帧作为数据集。具体选择多少作为间隔还是具体问题具体分析，如果视频中变化目标变化较快，可以适当缩短间隔；如果视频中大部分都是静止对象，可以适当增大间隔。
还有一种更好的方案是：对原视频用ffmpeg提取关键帧，将关键帧的内容作为数据集。关键帧和关键帧之间的差距比较大，适合作为目标检测数据集。

代码改变世界