Contents
- Basic Usage of the YOLOv5 Model
  - 1. Model Inference
  - 2. Model Format Conversion
  - 3. Using the yolov5n.onnx Model
    - 1) yolov5_onnx_model.py: the inference class
    - 2) general.py code
    - 3) Timing comparison: onnxruntime vs. onnxruntime-gpu
  - 4. Model Training
    - 1) Downloading the dataset
    - 2) Packaging the data into the layout YOLOv5 expects
      - (1) Splitting into train, val, and test sets
      - (2) Converting VOC annotations to txt
    - 3) Running train.py to train the model
1. Model Inference
Run detect.py directly and set the --weights argument (e.g. yolov5s.pt). If yolov5s.pt is not present in the project root, the script downloads the COCO-pretrained weights from the cloud for you (yolov5n.pt or yolov5m.pt work just as well). By default, coco.yaml is used as the data argument.
After it runs, the project writes the detection result images under the runs/detect folder.
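For example, a typical invocation (the --source path is just an example image shipped with the repo) might look like:
python detect.py --weights yolov5s.pt --source data/images/bus.jpg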
2. Model Format Conversion
To convert a .pt weight file to .onnx with export.py, make sure the --include argument contains onnx, then run the script directly to complete the format conversion.
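For example, a conversion command along these lines (the weight file name is only an example) should produce yolov5n.onnx next to the .pt file:
python export.py --weights yolov5n.pt --include onnx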
3. Using the yolov5n.onnx Model
The inference code is adapted from a reference implementation, with a small improvement: yolov5n.onnx can be run with just numpy + onnxruntime-gpu, without installing torch or torchvision.
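One note on onnxruntime-gpu: in recent releases (1.9 and later, as far as I know) the session has to be created with an explicit providers list, otherwise construction fails. A minimal sketch, reusing the model path from the code below:
import onnxruntime
# CUDA first, CPU as fallback
session = onnxruntime.InferenceSession("yolov5_onnx/yolov5n.onnx",
                                       providers=["CUDAExecutionProvider", "CPUExecutionProvider"])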
1) yolov5_onnx_model.py: the inference class
# Reference
# coding=utf-8
import cv2
import numpy as np
import onnxruntime
import time
import random
from yolov5_onnx.general import non_max_suppression, numpy_cpu_nms
class YOLOV5_ONNX(object):
def __init__(self,onnx_path = "yolov5_onnx/yolov5n.onnx", classes = []):
        '''
        Initialize the onnx session
        :param onnx_path: path to the onnx model file
        :param classes: list of class names
        '''
        # onnx_path = "yolov5_onnx/yolov5n.onnx"
self.onnx_session=onnxruntime.InferenceSession(onnx_path)
print(onnxruntime.get_device())
self.input_name=self.get_input_name()
self.output_name=self.get_output_name()
self.classes=classes
def get_input_name(self):
        '''Get the input node names'''
input_name=[]
for node in self.onnx_session.get_inputs():
input_name.append(node.name)
return input_name
def get_output_name(self):
        '''Get the output node names'''
output_name=[]
for node in self.onnx_session.get_outputs():
output_name.append(node.name)
return output_name
def get_input_feed(self,image_tensor):
        '''Build the input feed dict'''
input_feed={}
for name in self.input_name:
input_feed[name]=image_tensor
return input_feed
def letterbox(self,img, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True,
stride=32):
        '''Resize and pad the image (letterbox)'''
# Resize and pad image while meeting stride-multiple constraints
shape = img.shape[:2] # current shape [height, width]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
if not scaleup: # only scale down, do not scale up (for better test mAP)
r = min(r, 1.0)
# Compute padding
ratio = r, r # width, height ratios
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
if auto: # minimum rectangle
dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
elif scaleFill: # stretch
dw, dh = 0.0, 0.0
new_unpad = (new_shape[1], new_shape[0])
ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
dw /= 2 # divide padding into 2 sides
dh /= 2
if shape[::-1] != new_unpad: # resize
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
return img, ratio, (dw, dh)
def xywh2xyxy(self,x):
# Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
y = np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
return y
def nms(self,prediction, conf_thres=0.1, iou_thres=0.6, agnostic=False):
# if prediction.dtype is torch.float16:
# prediction = prediction.float() # to FP32
xc = prediction[..., 4] > conf_thres # candidates
min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
max_det = 300 # maximum number of detections per image
output = [None] * prediction.shape[0]
for xi, x in enumerate(prediction): # image index, image inference
x = x[xc[xi]] # confidence
if not x.shape[0]:
continue
x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
box = self.xywh2xyxy(x[:, :4])
# conf, j = x[:, 5:].max(1, keepdim=True)
# x = torch.cat((torch.tensor(box), conf, j.float()), 1)[conf.view(-1) > conf_thres]
conf, j = x[:, 5:].max(1), x[:, 5:].argmax(1)
conf = conf.reshape(conf.shape[0], 1)
j = j.reshape(j.shape[0], 1)
x = np.concatenate((box, conf, j), 1)[conf.flatten() > conf_thres]
n = x.shape[0] # number of boxes
if not n:
continue
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
# i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
i = numpy_cpu_nms(boxes, scores, iou_thres)
if i.shape[0] > max_det: # limit detections
i = i[:max_det]
output[xi] = x[i]
return output
def clip_coords(self,boxes, img_shape):
        '''Clip boxes that fall outside the image'''
        # Clip xyxy bounding boxes to image shape (height, width)
        # Clamp each coordinate into [0, width] / [0, height]
for i in range(len(boxes)): #x1
if boxes[i][0] < 0:
boxes[i][0] = 0
elif boxes[i][0] > img_shape[1]:
boxes[i][0] = img_shape[1]
for i in range(len(boxes)): #y1
if boxes[i][1] < 0:
boxes[i][1] = 0
elif boxes[i][1] > img_shape[0]:
boxes[i][1] = img_shape[0]
for i in range(len(boxes)): #x2
if boxes[i][2] < 0:
boxes[i][2] = 0
elif boxes[i][2] > img_shape[1]:
boxes[i][2] = img_shape[1]
for i in range(len(boxes)): # y2
if boxes[i][3] < 0:
boxes[i][3] = 0
elif boxes[i][3] > img_shape[0]:
boxes[i][3] = img_shape[0]
def scale_coords(self,img1_shape, coords, img0_shape, ratio_pad=None):
        '''
        Map coordinates back to the original image (reverse of letterbox): subtract the padding, then divide by the scale ratio
        :param img1_shape: letterboxed input shape
        :param coords: input coordinates
        :param img0_shape: original image shape to map back to
        :param ratio_pad:
        :return:
        '''
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None: # calculate from img0_shape
            gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new (scale ratio)
            pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (
                img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding added on each side
else:
gain = ratio_pad[0][0]
pad = ratio_pad[1]
        coords[:, [0, 2]] -= pad[0]  # subtract x padding
        coords[:, [1, 3]] -= pad[1]  # subtract y padding
        coords[:, :4] /= gain  # map box coordinates back to the original image
        self.clip_coords(coords, img0_shape)  # boundary check
return coords
def sigmoid(self,x):
return 1 / (1 + np.exp(-x))
def infer(self,src_img):
        '''
        Run the forward pass and post-process the predictions
        :param src_img: input image as a BGR ndarray (H x W x 3)
        :return: the image with detections drawn on it
        '''
        # Hyperparameters
        img_size=(640,640) # target size for letterboxing
        # Read the image
        # src_img=cv2.imread(img_path)
        start=time.time()
        src_size=src_img.shape[:2]
        # Letterbox the image
img=self.letterbox(src_img,img_size,stride=32)[0]
# Convert
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
        img = np.ascontiguousarray(img)
        # Normalize to [0, 1]
        img=img.astype(dtype=np.float32)
        img/=255.0
        # Add a batch dimension
        img=np.expand_dims(img,axis=0)
        # print('img resuming: ',time.time()-start)
        # Forward inference
# start=time.time()
input_feed=self.get_input_feed(img)
# ort_inputs = {self.onnx_session.get_inputs()[0].name: input_feed[None].numpy()}
# pred = torch.tensor(self.onnx_session.run(None, input_feed)[0])
pred = self.onnx_session.run(None, input_feed)[0]
results = non_max_suppression(pred, 0.5,0.5)
# print('onnx resuming: ',time.time() - start)
# pred=self.onnx_session.run(output_names=self.output_name,input_feed=input_feed)
        # Map detections back to the original image
img_shape=img.shape[2:]
# print(img_size)
for det in results: # detections per image
if det is not None and len(det):
det[:, :4] = self.scale_coords(img_shape, det[:, :4],src_size).round()
print(time.time()-start)
if det is not None and len(det):
            self.draw(src_img, det) # draw the bounding boxes
return src_img
return src_img
def plot_one_box(self,x, img, color=None, label=None, line_thickness=None):
# Plots one bounding box on image img
tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness
color = color or [random.randint(0, 255) for _ in range(3)]
c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
if label:
tf = max(tl - 1, 1) # font thickness
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
def draw(self,img, boxinfo):
colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(self.classes))]
for *xyxy, conf, cls in boxinfo:
label = '%s %.2f' % (self.classes[int(cls)], conf)
# print('xyxy: ', xyxy)
self.plot_one_box(xyxy, img, label=label, color=colors[int(cls)], line_thickness=1)
# cv2.namedWindow("dst",0)
# cv2.imshow("dst", img)
# cv2.imwrite("res1.jpg",img)
# cv2.waitKey(0)
# cv2.imencode('.jpg', img)[1].tofile(os.path.join(dst, id + ".jpg"))
return 0
if __name__=="__main__":
classes = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush'] # class names
img_path = "images/bus.jpg"
src_img = cv2.imread(img_path)
# model=YOLOV5_ONNX(onnx_path="yolov5n.onnx",classes=classes) #FPS = 15
model=YOLOV5_ONNX(onnx_path="yolov5s.onnx",classes=classes) #FPS = 8
frame = model.infer(src_img)
# cv2.imshow("objectDetect",frame)
# cv2.waitKey(0)
    # Video test
    camera = cv2.VideoCapture("images/catAndDog.mp4")
    while(camera.isOpened()):
        ret,frame = camera.read()
        if(ret):
            t1 = time.time()
            frame = model.infer(frame)
            t2 = time.time()
            fps = round(1 / (t2 - t1),6)
            cv2.putText(frame, "FPS = " + str(fps), (60, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.imshow("res", frame)
            cv2.waitKey(15)
        else:
            break  # stop once the video ends
    camera.release()
    cv2.destroyAllWindows()
2) general.py code
The inference class above depends on general.py, shown below:
import time
import numpy as np
def xywh2xyxy(x):
# Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
y = np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
return y
def box_iou(box1, box2):
# https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
"""
Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
Arguments:
box1 (Tensor[N, 4])
box2 (Tensor[M, 4])
Returns:
iou (Tensor[N, M]): the NxM matrix containing the pairwise
IoU values for every element in boxes1 and boxes2
"""
# inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
x11, y11, x12, y12 = np.split(box1, 4, axis=1)
x21, y21, x22, y22 = np.split(box2, 4, axis=1)
xa = np.maximum(x11, np.transpose(x21))
xb = np.minimum(x12, np.transpose(x22))
ya = np.maximum(y11, np.transpose(y21))
yb = np.minimum(y12, np.transpose(y22))
area_inter = np.maximum(0, (xb - xa + 1)) * np.maximum(0, (yb - ya + 1))
area_1 = (x12 - x11 + 1) * (y12 - y11 + 1)
area_2 = (x22 - x21 + 1) * (y22 - y21 + 1)
area_union = area_1 + np.transpose(area_2) - area_inter
iou = area_inter / area_union
return iou
def numpy_cpu_nms(dets, scores, thresh):
"""
nms
:param dets: ndarray [x1,y1,x2,y2]
:param scores: ndarray []
:param thresh: int
:return: ndarray[index]
"""
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
order = scores.argsort()[::-1]
area = (x2 - x1 + 1) * (y2 - y1 + 1)
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0, xx2 - xx1 + 1)
h = np.maximum(0, yy2 - yy1 + 1)
over = (w * h) / (area[i] + area[order[1:]] - w * h)
index = np.where(over <= thresh)[0]
        order = order[index + 1]  # +1 because index was computed over order[1:]
return np.array(keep)
def non_max_suppression(prediction,
conf_thres=0.25,
iou_thres=0.45,
classes=None,
agnostic=False,
multi_label=False,
labels=(),
max_det=300):
"""Non-Maximum Suppression (NMS) on inference results to reject overlapping bounding boxes
Returns:
list of detections, on (n,6) tensor per image [xyxy, conf, cls]
"""
bs = prediction.shape[0] # batch size
nc = prediction.shape[2] - 5 # number of classes
xc = prediction[..., 4] > conf_thres # candidates
# Checks
assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
# Settings
# min_wh = 2 # (pixels) minimum box width and height
max_wh = 7680 # (pixels) maximum box width and height
max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
time_limit = 0.3 + 0.03 * bs # seconds to quit after
redundant = True # require redundant detections
multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
merge = False # use merge-NMS
t = time.time()
# output = [torch.zeros((0, 6), device=prediction.device)] * bs
output = [np.zeros((0, 6))] * bs
for xi, x in enumerate(prediction): # image index, image inference
# Apply constraints
# x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
x = x[xc[xi]] # confidence
# Cat apriori labels if autolabelling
if labels and len(labels[xi]):
lb = labels[xi]
# v = torch.zeros((len(lb), nc + 5), device=x.device)
v = np.zeros((len(lb), nc + 5))
v[:, :4] = lb[:, 1:5] # box
v[:, 4] = 1.0 # conf
            v[range(len(lb)), lb[:, 0].astype(int) + 5] = 1.0  # cls
# x = torch.cat((x, v), 0)
x = np.concatenate((x, v), 0)
# If none remain process next image
if not x.shape[0]:
continue
# Compute conf
x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
# Box (center x, center y, width, height) to (x1, y1, x2, y2)
box = xywh2xyxy(x[:, :4])
# Detections matrix nx6 (xyxy, conf, cls)
if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero()
            # x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
            x = np.concatenate((box[i], x[i, j + 5, None], j[:, None].astype(np.float32)), 1)
else: # best class only
conf, j = x[:, 5:].max(1), x[:, 5:].argmax(1)
# x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
conf = conf.reshape(conf.shape[0], 1)
j = j.reshape(j.shape[0], 1)
x = np.concatenate((box, conf, j), 1)[conf.flatten() > conf_thres]
# Filter by class
if classes is not None:
# x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
pass
# Apply finite constraint
# if not torch.isfinite(x).all():
# x = x[torch.isfinite(x).all(1)]
# Check shape
n = x.shape[0] # number of boxes
if not n: # no boxes
continue
elif n > max_nms: # excess boxes
            x = x[x[:, 4].argsort()[::-1][:max_nms]]  # sort by confidence (descending)
# Batched NMS
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
# i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
i = numpy_cpu_nms(boxes, scores, iou_thres) # NMS
if i.shape[0] > max_det: # limit detections
i = i[:max_det]
if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
# update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
weights = iou * scores[None] # box weights
# x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
            x[i, :4] = np.dot(weights, x[:, :4]) / weights.sum(1, keepdims=True)  # merged boxes
if redundant:
i = i[iou.sum(1) > 1] # require redundancy
output[xi] = x[i]
return output
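As a quick sanity check of the expected input/output format (the 85-column layout is 4 box values + 1 objectness score + 80 COCO class scores; the random values below are only for shape checking):
import numpy as np
from yolov5_onnx.general import non_max_suppression

pred = np.random.rand(1, 100, 85).astype(np.float32)  # (batch, num_boxes, 5 + num_classes)
out = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45)
print(out[0].shape)  # (n, 6): x1, y1, x2, y2, conf, cls for each kept box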
3) Timing comparison: onnxruntime vs. onnxruntime-gpu
Note: the inference code above works not only with yolov5s.onnx but also with yolov5n.onnx and yolov5m.onnx. Here yolov5n.onnx (the smallest YOLOv5 model) is used to compare the runtimes of onnxruntime and onnxruntime-gpu.
# onnxruntime FPS = 15
CPU
img resuming: 0.008979320526123047
onnx resuming: 0.061834096908569336
0.06283116340637207
#onnxruntime-gpu FPS = 30
GPU
img resuming: 0.004993915557861328
onnx resuming: 0.03390932083129883
0.03390932083129883
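For reproducing this comparison, note that the two backends come from different pip packages (install one or the other, not both, to avoid the CPU build shadowing the GPU build):
pip install onnxruntime        # CPU-only build
pip install onnxruntime-gpu    # CUDA build, needs a matching CUDA/cuDNN install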
4. Model Training
1) Downloading the dataset
After downloading, the Images and Annotation folders look like the tree below; each subfolder holds the images (or annotations) for one dog breed.
├─ Images or Annotation
├─n02085620-Chihuahua
├─n02085782-Japanese_spaniel
├─n02085936-Maltese_dog
├─n02086079-Pekinese
├─n02086240-Shih-Tzu
...
├─n02113186-Cardigan
├─n02113624-toy_poodle
├─n02113712-miniature_poodle
├─n02113799-standard_poodle
├─n02113978-Mexican_hairless
├─n02115641-dingo
├─n02115913-dhole
└─n02116738-African_hunting_dog
2) Packaging the data into the dataset layout YOLOv5 expects
For training, YOLOv5 expects the dataset directory to be organized as follows:
datasets
├─images
│ ├─test
│ ├─train
│ └─val
├─labels
│ ├─test
│ ├─train
│ └─val
Both images and labels are split into train, val, and test sets, and the image files under images/ must match the annotation files under labels/ in each corresponding folder. The annotation files must be in YOLOv5's txt format, whereas the StanfordExtra dataset above ships xml annotations, so a conversion step is required (an example label line is shown below).
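For reference, each line of a YOLOv5 txt label describes one box as a class id followed by a center/size box normalized by the image width and height (the numbers below are made up purely for illustration):
11 0.512 0.463 0.380 0.725   # class_id x_center y_center width height, all in [0, 1]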
As a side note, it does not really matter where the datasets folder lives: next to the yolov5 project or inside yolov5/data both work. The only thing that matters is that dogs_oxford.yaml points to the images path explicitly; the Annotation folder does not need to be referenced. The dogs_oxford.yaml used here is:
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
# PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC by University of Oxford
# Example usage: python train.py --data VOC.yaml
# parent
# ├── yolov5
# └── datasets
# └── VOC ← downloads here (2.8 GB)
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
path: F:/animal_recognition/datasets/
train: # train images (relative to 'path') 16551 images
- images/train
val: # val images (relative to 'path') 4952 images
- images/val
test: # test images (optional)
- images/test
# Classes
nc: 120 # number of classes
names: ['Chihuahua', 'Japanese_spaniel', 'Maltese_dog', 'Pekinese', 'Shih',
'Blenheim_spaniel', 'papillon', 'toy_terrier', 'Rhodesian_ridgeback',
'Afghan_hound', 'basset', 'beagle', 'bloodhound', 'bluetick', 'black',
'Walker_hound', 'English_foxhound', 'redbone', 'borzoi', 'Irish_wolfhound',
'Italian_greyhound', 'whippet', 'Ibizan_hound', 'Norwegian_elkhound', 'otterhound',
'Saluki', 'Scottish_deerhound', 'Weimaraner', 'Staffordshire_bullterrier',
'American_Staffordshire_terrier', 'Bedlington_terrier', 'Border_terrier',
'Kerry_blue_terrier', 'Irish_terrier', 'Norfolk_terrier', 'Norwich_terrier',
'Yorkshire_terrier', 'wire', 'Lakeland_terrier', 'Sealyham_terrier', 'Airedale',
'cairn', 'Australian_terrier', 'Dandie_Dinmont', 'Boston_bull', 'miniature_schnauzer',
'giant_schnauzer', 'standard_schnauzer', 'Scotch_terrier', 'Tibetan_terrier',
'silky_terrier', 'soft', 'West_Highland_white_terrier', 'Lhasa', 'flat', 'curly',
'golden_retriever', 'Labrador_retriever', 'Chesapeake_Bay_retriever',
'German_short', 'vizsla', 'English_setter', 'Irish_setter', 'Gordon_setter',
'Brittany_spaniel', 'clumber', 'English_springer', 'Welsh_springer_spaniel',
'cocker_spaniel', 'Sussex_spaniel', 'Irish_water_spaniel', 'kuvasz', 'schipperke',
'groenendael', 'malinois', 'briard', 'kelpie', 'komondor', 'Old_English_sheepdog',
'Shetland_sheepdog', 'collie', 'Border_collie', 'Bouvier_des_Flandres', 'Rottweiler',
'German_shepherd', 'Doberman', 'miniature_pinscher', 'Greater_Swiss_Mountain_dog',
'Bernese_mountain_dog', 'Appenzeller', 'EntleBucher', 'boxer', 'bull_mastiff',
'Tibetan_mastiff', 'French_bulldog', 'Great_Dane', 'Saint_Bernard', 'Eskimo_dog',
'malamute', 'Siberian_husky', 'affenpinscher', 'basenji', 'pug', 'Leonberg', 'Newfoundland',
'Great_Pyrenees', 'Samoyed', 'Pomeranian', 'chow', 'keeshond', 'Brabancon_griffon',
'Pembroke', 'Cardigan', 'toy_poodle', 'miniature_poodle', 'standard_poodle',
'Mexican_hairless', 'dingo', 'dhole', 'African_hunting_dog']
# Download script/URL (optional) ---------------------------------------------------------------------------------------
download: |
import xml.etree.ElementTree as ET
from tqdm import tqdm
from utils.general import download, Path
def convert_label(path, lb_path, year, image_id):
def convert_box(size, box):
dw, dh = 1. / size[0], 1. / size[1]
x, y, w, h = (box[0] + box[1]) / 2.0 - 1, (box[2] + box[3]) / 2.0 - 1, box[1] - box[0], box[3] - box[2]
return x * dw, y * dh, w * dw, h * dh
in_file = open(path / f'VOC{year}/Annotations/{image_id}.xml')
out_file = open(lb_path, 'w')
tree = ET.parse(in_file)
root = tree.getroot()
size = root.find('size')
w = int(size.find('width').text)
h = int(size.find('height').text)
for obj in root.iter('object'):
cls = obj.find('name').text
if cls in yaml['names'] and not int(obj.find('difficult').text) == 1:
xmlbox = obj.find('bndbox')
bb = convert_box((w, h), [float(xmlbox.find(x).text) for x in ('xmin', 'xmax', 'ymin', 'ymax')])
cls_id = yaml['names'].index(cls) # class id
out_file.write(" ".join([str(a) for a in (cls_id, *bb)]) + '\n')
# Download
dir = Path(yaml['path']) # dataset root dir
url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/'
urls = [f'{url}VOCtrainval_06-Nov-2007.zip', # 446MB, 5012 images
f'{url}VOCtest_06-Nov-2007.zip', # 438MB, 4953 images
f'{url}VOCtrainval_11-May-2012.zip'] # 1.95GB, 17126 images
download(urls, dir=dir / 'images', delete=False, curl=True, threads=3)
# Convert
path = dir / 'images/VOCdevkit'
for year, image_set in ('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test'):
imgs_path = dir / 'images' / f'{image_set}{year}'
lbs_path = dir / 'labels' / f'{image_set}{year}'
imgs_path.mkdir(exist_ok=True, parents=True)
lbs_path.mkdir(exist_ok=True, parents=True)
with open(path / f'VOC{year}/ImageSets/Main/{image_set}.txt') as f:
image_ids = f.read().strip().split()
for id in tqdm(image_ids, desc=f'{image_set}{year}'):
f = path / f'VOC{year}/JPEGImages/{id}.jpg' # old img path
lb_path = (lbs_path / f.name).with_suffix('.txt') # new label path
f.rename(imgs_path / f.name) # move image
convert_label(path, lb_path, year, id) # convert labels to YOLO format
(1) Splitting into train, val, and test sets
The train, val, and test sets are split 7:2:1. Each image and its corresponding annotation file must be copied out of the original per-breed folders into the matching output folders at the same time. The code is as follows:
import cv2
from pathlib import Path
import glob
import os
import shutil
import numpy as np
import unittest
## Read an image; works around cv2.imread failing on paths that contain non-ASCII characters
def cv_imread(filePath):
    # The key line: decode the raw file bytes directly into an image
    cv_img=cv2.imdecode(np.fromfile(filePath,dtype=np.uint8),-1)
    # Enable the conversion below only if a colour-space conversion is actually needed downstream
    # cv_img=cv2.cvtColor(cv_img,cv2.COLOR_RGB2BGR)
    return cv_img
'''Split the images from the input folders into out_dir according to the given ratios'''
def split_train_val_test(img_read_dir, label_read_dir, img_out_dir, label_out_dir, train_ratio = 0.7, val_ratio = 0.2, test_ratio = 0.1):
    '''
    @param: img_read_dir   input image folder
    @param: label_read_dir input annotation folder
    @param: img_out_dir    output image folder
    @param: label_out_dir  output annotation folder
    Expected input layout:
    |_root
        |_dog_cls1
            |_img1
            |_img2
            |_img3
    train images go to: img_out_dir/train
    test images go to:  img_out_dir/test
    val images go to:   img_out_dir/val
    '''
    if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-6:
        raise Exception("sum of ratios must equal 1.0")
train_out_dir = img_out_dir + "/train/"
val_out_dir = img_out_dir + "/val/"
test_out_dir = img_out_dir + "/test/"
for dir in Path(img_read_dir).iterdir():
print(str(dir) + "---------------------")
        dirName = Path(dir).name # breed folder name
        img_totalPaths = glob.glob(os.path.join(str(dir),"*.*")) # paths of the images in this folder
label_totalPaths = label_read_dir + dirName
train_size = round(len(img_totalPaths) * train_ratio)
val_size = round(len(img_totalPaths) * val_ratio)
# test_size = len(img_totalPaths) - train_size - val_size
for index, path in enumerate(img_totalPaths):
img = cv_imread(path)
filename = Path(path).name
            # path of the corresponding annotation file
            label_path = label_totalPaths + "/" + filename.split(".")[0]
            if(index < train_size):
                cv2.imwrite(train_out_dir + filename,img) # write the image
                label_out_path = label_out_dir + "/train/" + filename.split(".")[0] + ".xml"
                shutil.copyfile(label_path, label_out_path) # copy the annotation file (xml format)
            elif(index < train_size + val_size):
                cv2.imwrite(val_out_dir + filename, img) # write the image
                label_out_path = label_out_dir + "/val/" + filename.split(".")[0] + ".xml"
                shutil.copyfile(label_path, label_out_path) # copy the annotation file (xml format)
            else:
                cv2.imwrite(test_out_dir + filename, img) # write the image
                label_out_path = label_out_dir + "/test/" + filename.split(".")[0] + ".xml"
                shutil.copyfile(label_path, label_out_path) # copy the annotation file (xml format)
'''Get the list of dog breed names'''
def get_dogs_cls(img_read_dir):
dogs_cls = []
for dir in Path(img_read_dir).iterdir():
dog_name = Path(dir).name.split("-")[1]
dogs_cls.append(dog_name)
return dogs_cls
class MyTest(unittest.TestCase):
def test_get_dogs_cls(self):
img_read_dir = "F:/Dataset/目标检测数据集/Dog_OxFord/Images/"
label_read_dir = "F:/Dataset/目标检测数据集/Dog_OxFord/Annotation/"
img_out_dir = "F:/animal_recognition/datasets/images/"
label_out_dir = "F:/animal_recognition/datasets/labels/"
dogs_cls = get_dogs_cls(img_read_dir)
print(f"cls num = {len(dogs_cls)}, dogs_cls = {dogs_cls}")
def test_split_train_val_test(self):
img_read_dir = "F:/Dataset/目标检测数据集/Dog_OxFord/Images/"
label_read_dir = "F:/Dataset/目标检测数据集/Dog_OxFord/Annotation/"
img_out_dir = "F:/animal_recognition/datasets/images/"
label_out_dir = "F:/animal_recognition/datasets/labels/"
split_train_val_test(img_read_dir,label_read_dir,img_out_dir,label_out_dir)
After running it, the folder structure looks like this:
datasets
├─images
│ ├─test
│ ├─train
│ └─val
├─labels_VOC
│ ├─test
│ ├─train
│ └─val
(2) Converting the VOC annotations to txt
The code is as follows (adapted from a reference implementation):
import unittest
'''
Convert VOC-format xml label files into the txt files that YOLO training expects
'''
# Set the class names for your own dataset
# class_list = ['object', 'animal'] # class names
class_list = ['Chihuahua', 'Japanese_spaniel', 'Maltese_dog', 'Pekinese', 'Shih',
'Blenheim_spaniel', 'papillon', 'toy_terrier', 'Rhodesian_ridgeback',
'Afghan_hound', 'basset', 'beagle', 'bloodhound', 'bluetick', 'black',
'Walker_hound', 'English_foxhound', 'redbone', 'borzoi', 'Irish_wolfhound',
'Italian_greyhound', 'whippet', 'Ibizan_hound', 'Norwegian_elkhound', 'otterhound',
'Saluki', 'Scottish_deerhound', 'Weimaraner', 'Staffordshire_bullterrier',
'American_Staffordshire_terrier', 'Bedlington_terrier', 'Border_terrier',
'Kerry_blue_terrier', 'Irish_terrier', 'Norfolk_terrier', 'Norwich_terrier',
'Yorkshire_terrier', 'wire', 'Lakeland_terrier', 'Sealyham_terrier', 'Airedale',
'cairn', 'Australian_terrier', 'Dandie_Dinmont', 'Boston_bull', 'miniature_schnauzer',
'giant_schnauzer', 'standard_schnauzer', 'Scotch_terrier', 'Tibetan_terrier',
'silky_terrier', 'soft', 'West_Highland_white_terrier', 'Lhasa', 'flat', 'curly',
'golden_retriever', 'Labrador_retriever', 'Chesapeake_Bay_retriever',
'German_short', 'vizsla', 'English_setter', 'Irish_setter', 'Gordon_setter',
'Brittany_spaniel', 'clumber', 'English_springer', 'Welsh_springer_spaniel',
'cocker_spaniel', 'Sussex_spaniel', 'Irish_water_spaniel', 'kuvasz', 'schipperke',
'groenendael', 'malinois', 'briard', 'kelpie', 'komondor', 'Old_English_sheepdog',
'Shetland_sheepdog', 'collie', 'Border_collie', 'Bouvier_des_Flandres', 'Rottweiler',
'German_shepherd', 'Doberman', 'miniature_pinscher', 'Greater_Swiss_Mountain_dog',
'Bernese_mountain_dog', 'Appenzeller', 'EntleBucher', 'boxer', 'bull_mastiff',
'Tibetan_mastiff', 'French_bulldog', 'Great_Dane', 'Saint_Bernard', 'Eskimo_dog',
'malamute', 'Siberian_husky', 'affenpinscher', 'basenji', 'pug', 'Leonberg', 'Newfoundland',
'Great_Pyrenees', 'Samoyed', 'Pomeranian', 'chow', 'keeshond', 'Brabancon_griffon',
'Pembroke', 'Cardigan', 'toy_poodle', 'miniature_poodle', 'standard_poodle',
'Mexican_hairless', 'dingo', 'dhole', 'African_hunting_dog']
# Reference
import xml.etree.ElementTree as ET
import pickle
import os
from os import listdir, getcwd
from os.path import join
def convert(size, box):
# size=(width, height) b=(xmin, xmax, ymin, ymax)
# x_center = (xmax+xmin)/2 y_center = (ymax+ymin)/2
# x = x_center / width y = y_center / height
# w = (xmax-xmin) / width h = (ymax-ymin) / height
x_center = (box[0] + box[1]) / 2.0
y_center = (box[2] + box[3]) / 2.0
x = x_center / size[0]
y = y_center / size[1]
w = (box[1] - box[0]) / size[0]
h = (box[3] - box[2]) / size[1]
# print(x, y, w, h)
return (x, y, w, h)
def convert_annotation(xml_files_path, save_txt_files_path, classes):
xml_files = os.listdir(xml_files_path)
# print(xml_files)
for xml_name in xml_files:
# print(xml_name)
xml_file = os.path.join(xml_files_path, xml_name)
out_txt_path = os.path.join(save_txt_files_path, xml_name.split('.')[0] + '.txt')
out_txt_f = open(out_txt_path, 'w')
tree = ET.parse(xml_file)
root = tree.getroot()
size = root.find('size')
w = int(size.find('width').text)
h = int(size.find('height').text)
for obj in root.iter('object'):
difficult = obj.find('difficult').text
cls = obj.find('name').text
if cls not in classes or int(difficult) == 1:
continue
cls_id = classes.index(cls)
xmlbox = obj.find('bndbox')
b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text),
float(xmlbox.find('ymax').text))
# b=(xmin, xmax, ymin, ymax)
# print(w, h, b)
bb = convert((w, h), b)
out_txt_f.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
class MyTest(unittest.TestCase):
def test_voc_2_yolo_train(self):
        # Convert the VOC xml label files into YOLO txt label files
        # 1. Classes to convert
        classes = class_list # Note: change this to match your own class names
        # 2. Path of the VOC-format xml label files
        xml_files1 = r'F:/animal_recognition/datasets/labels_voc/train/'
        # 3. Output path for the YOLO-format txt label files
        save_txt_files1 = r'F:/animal_recognition/datasets/labels/train/'
convert_annotation(xml_files1, save_txt_files1, classes) #train
def test_voc_2_yolo_val(self):
        # Convert the VOC xml label files into YOLO txt label files
        # 1. Classes to convert
        classes = class_list # Note: change this to match your own class names
        # 2. Path of the VOC-format xml label files
        xml_files1 = r'F:/animal_recognition/datasets/labels_voc/val/'
        # 3. Output path for the YOLO-format txt label files
        save_txt_files1 = r'F:/animal_recognition/datasets/labels/val/'
convert_annotation(xml_files1, save_txt_files1, classes) #val
def test_voc_2_yolo_test(self):
        # Convert the VOC xml label files into YOLO txt label files
        # 1. Classes to convert
        classes = class_list # Note: change this to match your own class names
        # 2. Path of the VOC-format xml label files
        xml_files1 = r'F:/animal_recognition/datasets/labels_voc/test/'
        # 3. Output path for the YOLO-format txt label files
        save_txt_files1 = r'F:/animal_recognition/datasets/labels/test/'
convert_annotation(xml_files1, save_txt_files1, classes) #test
The final folder structure is:
datasets
├─images
│ ├─test
│ ├─train
│ └─val
├─labels
│ ├─test
│ ├─train
│ └─val
├─labels_VOC
│ ├─test
│ ├─train
│ └─val
3) Running train.py to train the model
python train.py --data dogs_oxford.yaml --cfg yolov5n.yaml --batch-size 1
Sample training output:
Epoch gpu_mem box obj cls labels img_size
7/299 0.25G 0.01312 0.01482 0.05971 1 640: 100%|██████████| 14517/14517 [44:19<00:00, 5.46it/s]
Class Images Labels P R mAP@.5 mAP@.5:.95: 100%|██████████| 2055/2055 [02:07<00:00, 16.14it/s]
all 4110 4165 0.00753 0.947 0.0133 0.0106
Epoch gpu_mem box obj cls labels img_size
8/299 0.25G 0.01288 0.01454 0.05952 1 640: 79%|███████▊ | 11421/14517 [38:30<10:26, 4.94it/s]
The latest (last.pt) and best (best.pt) weight files are saved under runs/train/exp/weights.
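Once training finishes, those weights can be fed back into the inference step from section 1, for example (the --source path is illustrative):
python detect.py --weights runs/train/exp/weights/best.pt --data dogs_oxford.yaml --source path/to/test_images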