上一篇文章推荐了DarkLabel标注软件,承诺会附上配套的代码,本文主要分享的是格式换的几个脚本。

先附上脚本地址: ​https://github.com/pprp/SimpleCVReproduction/tree/master/DarkLabel" target="_blank">​https://github.com/pprp/SimpleCVReproduction/tree/master/DarkLabel​

先来了解一下为何DarkLabel能生成这么多格式的数据集,来看看DarkLabel的格式:

frame(从0开始计), 数量, id(从0开始), box(x1,y1,x2,y2), class=null
0,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
1,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
2,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null

每一帧,每张图片上的目标都可以提取到,并且每个目标有bbox、分配了一个ID、class

这些信息都可以满足目标检测、ReID、跟踪数据集。

ps:说明一下,以下脚本都是笔者自己写的,专用于单类的检测、跟踪、重识别的代码,如果有需要多类的,还需要自己修改多类部分的代码。

1. DarkLabel转Detection

这里笔者写了一个脚本成VOC2007中的xml格式的标注,代码如下:

import cv2
import os
import shutil
import tqdm
import sys

root_path = r"I:\Dataset\VideoAnnotation"

def print_flush(str):
print(str, end='\r')
sys.stdout.flush()

def genXML(xml_dir, outname, bboxes, width, height):
xml_file = open((xml_dir + '/' + outname + '.xml'), 'w')
xml_file.write('<annotation>\n')
xml_file.write(' <folder>VOC2007</folder>\n')
xml_file.write(' <filename>' + outname + '.jpg' + '</filename>\n')
xml_file.write(' <size>\n')
xml_file.write(' <width>' + str(width) + '</width>\n')
xml_file.write(' <height>' + str(height) + '</height>\n')
xml_file.write(' <depth>3</depth>\n')
xml_file.write(' </size>\n')

for bbox in bboxes:
x1, y1, x2, y2 = bbox
xml_file.write(' <object>\n')
xml_file.write(' <name>' + 'cow' + '</name>\n')
xml_file.write(' <pose>Unspecified</pose>\n')
xml_file.write(' <truncated>0</truncated>\n')
xml_file.write(' <difficult>0</difficult>\n')
xml_file.write(' <bndbox>\n')
xml_file.write(' <xmin>' + str(x1) + '</xmin>\n')
xml_file.write(' <ymin>' + str(y1) + '</ymin>\n')
xml_file.write(' <xmax>' + str(x2) + '</xmax>\n')
xml_file.write(' <ymax>' + str(y2) + '</ymax>\n')
xml_file.write(' </bndbox>\n')
xml_file.write(' </object>\n')

xml_file.write('</annotation>')


def gen_empty_xml(xml_dir, outname, width, height):
xml_file = open((xml_dir + '/' + outname + '.xml'), 'w')
xml_file.write('<annotation>\n')
xml_file.write(' <folder>VOC2007</folder>\n')
xml_file.write(' <filename>' + outname + '.png' + '</filename>\n')
xml_file.write(' <size>\n')
xml_file.write(' <width>' + str(width) + '</width>\n')
xml_file.write(' <height>' + str(height) + '</height>\n')
xml_file.write(' <depth>3</depth>\n')
xml_file.write(' </size>\n')
xml_file.write('</annotation>')


def getJPG(src_video_file, tmp_video_frame_save_dir):
# gen jpg from video
cap = cv2.VideoCapture(src_video_file)
if not os.path.exists(tmp_video_frame_save_dir):
os.makedirs(tmp_video_frame_save_dir)

frame_cnt = 0
isrun, frame = cap.read()
width, height = frame.shape[1], frame.shape[0]

while (isrun):
save_name = append_name + "_" + str(frame_cnt) + ".jpg"
cv2.imwrite(os.path.join(tmp_video_frame_save_dir, save_name), frame)
frame_cnt += 1
print_flush("Extracting frame :%d" % frame_cnt)
isrun, frame = cap.read()
return width, height

def delTmpFrame(tmp_video_frame_save_dir):
if os.path.exists(tmp_video_frame_save_dir):
shutil.rmtree(tmp_video_frame_save_dir)
print('delete %s success!' % tmp_video_frame_save_dir)

def assign_jpgAndAnnot(src_annot_file, dst_annot_dir, dst_jpg_dir, tmp_video_frame_save_dir, width, height):
# get coords from annotations files
txt_file = open(src_annot_file, "r")

content = txt_file.readlines()

for line in content:
item = line[:-1]
items = item.split(',')
frame_id, num_of_cow = items[0], items[1]
print_flush("Assign jpg and annotion : %s" % frame_id)

bboxes = []

for i in range(int(num_of_cow)):
obj_id = items[1 + i * 6 + 1]
obj_x1, obj_y1 = int(items[1 + i * 6 + 2]), int(items[1 + i * 6 + 3])
obj_x2, obj_y2 = int(items[1 + i * 6 + 4]), int(items[1 + i * 6 + 5])
# preprocess the coords
obj_x1 = max(1, obj_x1)
obj_y1 = max(1, obj_y1)
obj_x2 = min(width, obj_x2)
obj_y2 = min(height, obj_y2)
bboxes.append([obj_x1, obj_y1, obj_x2, obj_y2])

genXML(dst_annot_dir, append_name + "_" + str(frame_id), bboxes, width,
height)
shutil.copy(
os.path.join(tmp_video_frame_save_dir,
append_name + "_" + str(frame_id) + ".jpg"),
os.path.join(dst_jpg_dir, append_name + "_" + str(frame_id) + ".jpg"))

txt_file.close()


if __name__ == "__main__":
append_names = ["cutout%d" % i for i in range(19, 66)]

for append_name in append_names:
print("processing",append_name)
src_video_file = os.path.join(root_path, append_name + ".mp4")

if not os.path.exists(src_video_file):
continue

src_annot_file = os.path.join(root_path, append_name + "_gt.txt")

dst_annot_dir = os.path.join(root_path, "Annotations")
dst_jpg_dir = os.path.join(root_path, "JPEGImages")

tmp_video_frame_save_dir = os.path.join(root_path, append_name)

width, height = getJPG(src_video_file, tmp_video_frame_save_dir)

assign_jpgAndAnnot(src_annot_file, dst_annot_dir, dst_jpg_dir, tmp_video_frame_save_dir, width, height)

delTmpFrame(tmp_video_frame_save_dir)

如果想成U版yolo需要的格式可以点击 ​https://github.com/pprp/voc2007_for_yolo_torch" target="_blank">​https://github.com/pprp/voc2007_for_yolo_torch​​ 使用这里的脚本。

2. DarkLabel转ReID数据集

ReID数据集其实与分类数据集很相似,最出名的是Market1501数据集,对这个数据集不熟悉的可以先百度一下。简单来说ReID数据集只比分类中多了query, gallery的概念,也很简单。换代码如下:

import os
import shutil
import cv2
import numpy as np
import glob
import sys
import random
"""[summary]
根据视频和darklabel得到的标注文件
"""

def preprocessVideo(video_path):
'''
预处理,将视频变为一帧一帧的图片
'''
if not os.path.exists(video_frame_save_path):
os.mkdir(video_frame_save_path)

vidcap = cv2.VideoCapture(video_path)
(cap, frame) = vidcap.read()

height = frame.shape[0]
width = frame.shape[1]

cnt_frame = 0

while (cap):
cv2.imwrite(
os.path.join(video_frame_save_path, "frame_%d.jpg" % (cnt_frame)),
frame)
cnt_frame += 1
print(cnt_frame, end="\r")
sys.stdout.flush()
(cap, frame) = vidcap.read()
vidcap.release()
return width, height


def postprocess(video_frame_save_path):
'''
后处理,删除无用的文件夹
'''
if os.path.exists(video_frame_save_path):
shutil.rmtree(video_frame_save_path)


def extractVideoImgs(frame, video_frame_save_path, coords):
'''
抠图
'''
x1, y1, x2, y2 = coords
# get image from save path
img = cv2.imread(
os.path.join(video_frame_save_path, "frame_%d.jpg" % (frame)))

if img is None:
return None
# crop
save_img = img[y1:y2, x1:x2]
return save_img


def bbox_ious(box1, box2):
b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]

# Intersection area
inter_area = (min(b1_x2, b2_x2) - max(b1_x1, b2_x1)) * \
(min(b1_y2, b2_y2) - max(b1_y1, b2_y1))

# Union Area
w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1
w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1
union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area

return inter_area / union_area


def bbox_iou(box1, box2):
# format box1: x1,y1,x2,y2
# format box2: a1,b1,a2,b2
x1, y1, x2, y2 = box1
a1, b1, a2, b2 = box2

i_left_top_x = max(a1, x1)
i_left_top_y = max(b1, y1)

i_bottom_right_x = min(a2, x2)
i_bottom_right_y = min(b2, y2)

intersection = (i_bottom_right_x - i_left_top_x) * (i_bottom_right_y -
i_left_top_y)

area_two_box = (x2 - x1) * (y2 - y1) + (a2 - a1) * (b2 - b1)

return intersection * 1.0 / (area_two_box - intersection)


def restrictCoords(width, height, x, y):
x = max(1, x)
y = max(1, y)
x = min(x, width)
y = min(y, height)
return x, y


if __name__ == "__main__":

total_cow_num = 0

root_dir = "./data/videoAndLabel"
reid_dst_path = "./data/reid"
done_dir = "./data/done"

txt_list = glob.glob(os.path.join(root_dir, "*.txt"))
video_list = glob.glob(os.path.join(root_dir, "*.mp4"))

for i in range(len(txt_list)):
txt_path = txt_list[i]
video_path = video_list[i]

print("processing:", video_path)

if not os.path.exists(txt_path):
continue

video_name = os.path.basename(video_path).split('.')[0]
video_frame_save_path = os.path.join(os.path.dirname(video_path),
video_name)

f_txt = open(txt_path, "r")

width, height = preprocessVideo(video_path)

print("done")

# video_cow_id = video_name + str(total_cow_num)

for line in f_txt.readlines():
bboxes = line.split(',')
ids = []
frame_id = int(bboxes[0])

box_list = []

if frame_id % 30 != 0:
continue

num_object = int(bboxes[1])
for num_obj in range(num_object):
# obj = 0, 1, 2
obj_id = bboxes[1 + (num_obj) * 6 + 1]
obj_x1 = int(bboxes[1 + (num_obj) * 6 + 2])
obj_y1 = int(bboxes[1 + (num_obj) * 6 + 3])
obj_x2 = int(bboxes[1 + (num_obj) * 6 + 4])
obj_y2 = int(bboxes[1 + (num_obj) * 6 + 5])

box_list.append([obj_x1, obj_y1, obj_x2, obj_y2])
# process coord
obj_x1, obj_y1 = restrictCoords(width, height, obj_x1, obj_y1)
obj_x2, obj_y2 = restrictCoords(width, height, obj_x2, obj_y2)

specific_object_name = video_name + "_" + obj_id

# mkdir for reid dataset
id_dir = os.path.join(reid_dst_path, specific_object_name)

if not os.path.exists(id_dir):
os.makedirs(id_dir)

# save pic
img = extractVideoImgs(frame_id, video_frame_save_path,
(obj_x1, obj_y1, obj_x2, obj_y2))
print(type(img))

if img is None or img.shape[0] == 0 or img.shape[1] == 0:
print(specific_object_name + " is empty")
continue

# print(frame_id)
img = cv2.resize(img, (256, 256))

normalizedImg = np.zeros((256, 256))
img = cv2.normalize(img, normalizedImg, 0, 255,
cv2.NORM_MINMAX)

cv2.imwrite(
os.path.join(id_dir, "%s_%d.jpg") %
(specific_object_name, frame_id), img)

max_w = width - 256
max_h = height - 256

# 随机选取左上角坐标
select_x = random.randint(1, max_w)
select_y = random.randint(1, max_h)
rand_box = [select_x, select_y, select_x + 256, select_y + 256]

# 背景图保存位置
bg_dir = os.path.join(reid_dst_path, "bg")
if not os.path.exists(bg_dir):
os.makedirs(bg_dir)

iou_list = []

for idx in range(len(box_list)):
cow_box = box_list[idx]
iou = bbox_iou(cow_box, rand_box)
iou_list.append(iou)

# print("iou list:" , iou_list)

if np.array(iou_list).all() < 0:
img = extractVideoImgs(frame_id, video_frame_save_path,
rand_box)
if img is None:
print(specific_object_name + "is empty")
continue
normalizedImg = np.zeros((256, 256))
img = cv2.normalize(img, normalizedImg, 0, 255,
cv2.NORM_MINMAX)
cv2.imwrite(
os.path.join(bg_dir, "bg_%s_%d.jpg") %
(video_name, frame_id), img)

f_txt.close()
postprocess(video_frame_save_path)
shutil.move(video_path, done_dir)
shutil.move(txt_path, done_dir)

数据集配套代码在: ​https://github.com/pprp/reid_for_deepsort" target="_blank">​https://github.com/pprp/reid_for_deepsort​

3. DarkLabel转MOT16格式

其实DarkLabel标注得到信息和MOT16是几乎一致的,只不过需要化一下,脚本如下:

import os
'''
gt.txt:
---------
frame(从1开始计), id, box(left top w, h),ignore=1(不忽略), class=1(从1开始),覆盖=1),
1,1,1363,569,103,241,1,1,0.86014
2,1,1362,568,103,241,1,1,0.86173
3,1,1362,568,103,241,1,1,0.86173
4,1,1362,568,103,241,1,1,0.86173

cutout24_gt.txt
---
frame(从0开始计), 数量, id(从0开始), box(x1,y1,x2,y2), class=null
0,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
1,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
2,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
'''


def xyxy2xywh(x):
# Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h]
# y = torch.zeros_like(x) if isinstance(x,
# torch.Tensor) else np.zeros_like(x)
y = [0, 0, 0, 0]

y[0] = (x[0] + x[2]) / 2
y[1] = (x[1] + x[3]) / 2
y[2] = x[2] - x[0]
y[3] = x[3] - x[1]
return y

def process_darklabel(video_label_path, mot_label_path):
f = open(video_label_path, "r")
f_o = open(mot_label_path, "w")

contents = f.readlines()

for line in contents:
line = line[:-1]
num_list = [num for num in line.split(',')]

frame_id = int(num_list[0]) + 1
total_num = int(num_list[1])

base = 2

for i in range(total_num):

print(base, base + i * 6, base + i * 6 + 4)

_id = int(num_list[base + i * 6]) + 1
_box_x1 = int(num_list[base + i * 6 + 1])
_box_y1 = int(num_list[base + i * 6 + 2])
_box_x2 = int(num_list[base + i * 6 + 3])
_box_y2 = int(num_list[base + i * 6 + 4])

y = xyxy2xywh([_box_x1, _box_y1, _box_x2, _box_y2])

write_line = "%d,%d,%d,%d,%d,%d,1,1,1\n" % (frame_id, _id, y[0],
y[1], y[2], y[3])

f_o.write(write_line)

f.close()
f_o.close()

if __name__ == "__main__":
root_dir = "./data/videosample"

for item in os.listdir(root_dir):
full_path = os.path.join(root_dir, item)

video_path = os.path.join(full_path, item+".mp4")
video_label_path = os.path.join(full_path, item + "_gt.txt")
mot_label_path = os.path.join(full_path, "gt.txt")
process_darklabel(video_label_path, mot_label_path)

以上就是DarkLabel各种数据集格式的脚本了,DarkLabel还是非常方便的,可以快速构建自己的数据集。通常两分钟的视频可以生成2880张之多的图片,但是在目标检测中并不推荐将所有的图片都作为训练集,因为前后帧之间差距太小了,几乎是一模一样的。这种数据会导致训练速度很慢、泛化能力变差。

有两种解决方案:


  • 可以选择隔几帧选取一帧作为数据集,比如每隔10帧作为数据集。具体选择多少作为间隔还是具体问题具体分析,如果视频中变化目标变化较快,可以适当缩短间隔;如果视频中大部分都是静止对象,可以适当增大间隔。
  • 还有一种更好的方案是:对原视频用ffmpeg提取关键帧,将关键帧的内容作为数据集。关键帧和关键帧之间的差距比较大,适合作为目标检测数据集。


代码改变世界