YOLO V1算法的优点
(1). 速度快,基本上能达到实时检测。
(2). 能学到物体的广义表示,泛化能力强
(3). 基于图像的全局信息进行预测,与滑动窗口与region proposal 有区别
数据处理
##数据处理 基于pascal_voc数据集进行处理
import numpy as np
import xml.etree.ElementTree as ET
import cv2
import pickle
import copy
import os
import yolo.config as cfg
class pascal_voc(object):
    """Batch loader for the PASCAL VOC detection dataset (YOLO v1).

    Produces ``(images, labels)`` batches where ``images`` has shape
    ``(batch_size, image_size, image_size, 3)`` normalized to [-1, 1], and
    ``labels`` has shape ``(batch_size, cell_size, cell_size, 25)``: per grid
    cell one objectness flag, 4 box values (x_center, y_center, w, h in pixels
    of the resized image) and 20 one-hot class scores.
    """

    def __init__(self, phase='train', rebuild=False):
        # `phase` default keeps the original no-arg construction working while
        # also supporting the `pascal_voc('train')` call used at training time.
        self.data_path = os.path.join(cfg.PASCAL_PATH, 'VOC2012')
        self.image_size = cfg.IMAGE_SIZE  # was cfg.IMAGE_PATH: value is used as a resolution
        self.cell_size = cfg.CELL_SIZE
        self.rebuild = rebuild  # when True, ignore the cached label pickle
        self.classes = cfg.CLASSES
        self.phase = phase
        # Map class name -> contiguous index 0..len(classes)-1.
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE  # was cfg.BATCH_PATH: wrong config key
        self.flipped = True  # also generate horizontally flipped examples
        self.epoch = 1
        self.gt_labels = None  # list of dicts, filled in by prepare()
        self.indicator = 0  # cursor into gt_labels
        self.prepare()

    def get(self):
        """Return one batch of (images, labels), cycling/shuffling per epoch."""
        # Label tensor is (batch, S, S, 25); the original accidentally added a
        # fourth cell_size dimension.
        labels = np.zeros((self.batch_size, self.cell_size, self.cell_size, 25))
        images = np.zeros((self.batch_size, self.image_size, self.image_size, 3))
        count = 0
        while count < self.batch_size:
            label = self.gt_labels[self.indicator]['label']
            imname = self.gt_labels[self.indicator]['imname']
            flipped = self.gt_labels[self.indicator]['flipped']
            images[count, :, :, :] = self.image_read(imname, flipped)
            labels[count, :, :, :] = label
            count += 1
            self.indicator += 1
            # End of epoch: rewind, bump the epoch counter and reshuffle.
            if self.indicator >= len(self.gt_labels):
                self.indicator = 0
                self.epoch += 1
                np.random.shuffle(self.gt_labels)
        # The original never returned the batch even though the trainer unpacks it.
        return images, labels

    def prepare(self):
        """Load labels and (optionally) append horizontally flipped copies."""
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            gt_labels_cp = copy.deepcopy(gt_labels)
            for idx in range(len(gt_labels_cp)):
                gt_labels_cp[idx]['flipped'] = True
                # Flip the grid along the x (column) axis.
                gt_labels_cp[idx]['label'] = gt_labels_cp[idx]['label'][:, ::-1, :]
                for i in range(self.cell_size):
                    for j in range(self.cell_size):
                        if gt_labels_cp[idx]['label'][i, j, 0] == 1:
                            # Mirror the box x-center across the image.
                            gt_labels_cp[idx]['label'][i, j, 1] = \
                                self.image_size - 1 - gt_labels_cp[idx]['label'][i, j, 1]
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)
        self.gt_labels = gt_labels
        return gt_labels

    def image_read(self, imname, flipped):
        """Read one image, convert BGR->RGB, resize and scale to [-1, 1]."""
        image = cv2.imread(imname)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # cv2 loads BGR
        image = cv2.resize(image, (self.image_size, self.image_size)).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0
        if flipped:
            image = image[:, ::-1, :]
        return image

    def load_labels(self):
        """Load ground-truth labels, using the pickle cache when available."""
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')
        # Fast path: cached labels exist and no rebuild was requested.
        if os.path.isfile(cache_file) and not self.rebuild:
            print('loading cache_file from: ' + cache_file)
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels
        print('Processing gt_labels from ' + self.data_path)
        if not os.path.isdir(self.cache_path):
            os.mkdir(self.cache_path)
        if self.phase == 'train':
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'trainval.txt')
        else:
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'test.txt')
        with open(txtname, 'r') as f:
            self.image_idx = [x.strip() for x in f.readlines()]
        gt_labels = []
        for idx in self.image_idx:
            label, num_objs = self.load_pascal_annotation(idx)
            if num_objs == 0:
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', idx + '.jpg')
            # Each entry: grid label, image path and a flip flag.
            gt_labels.append({'label': label,
                              'imname': imname,
                              'flipped': False})
        print('Saving gt_labels to: ' + cache_file)
        # Pickle needs binary mode; 'rw' is not a valid open() mode.
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def load_pascal_annotation(self, idx):
        """Parse one VOC XML annotation into a (S, S, 25) label grid.

        Returns (label, num_objects).
        """
        imname = os.path.join(self.data_path, 'JPEGImages', idx + '.jpg')
        image = cv2.imread(imname)
        # Original size is needed to rescale boxes to the network input size.
        h, w = image.shape[0], image.shape[1]
        w_ratio = 1.0 * self.image_size / w
        h_ratio = 1.0 * self.image_size / h
        # One grid cell predicts at most one object.
        label = np.zeros((self.cell_size, self.cell_size, 25))
        label_path = os.path.join(self.data_path, 'Annotations', idx + '.xml')
        tree = ET.parse(label_path)
        objs = tree.findall('object')
        for obj in objs:
            # xmin/... live inside <bndbox>; Element.find only searches
            # direct children, so we must go through bndbox first.
            bbox = obj.find('bndbox')
            # VOC coordinates are 1-based; rescale and clamp to the image.
            x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0)
            y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0)
            x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0)
            y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            # Box as (x_center, y_center, width, height).
            boxes = [(x1 + x2) / 2.0, (y1 + y2) / 2.0, x2 - x1, y2 - y1]
            # Which grid cell owns the center: (c * S) / image_size.
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            # Rows index y, columns index x; skip a cell that already has an object.
            if label[y_ind, x_ind, 0] == 1:
                continue
            label[y_ind, x_ind, 0] = 1
            label[y_ind, x_ind, 1:5] = boxes
            label[y_ind, x_ind, 5 + cls_ind] = 1
        return label, len(objs)
数据的处理其实比较简单, 重要的要知道标签的输出维度,这样才能对标签的进一步处理来计算loss。
神经网络的搭建
网络结构:
网络输出:
当BATCH_SIZE为1时, build_net输出的维度为[1, 1470],即 cell_size * cell_size * (num_classes + boxes_per_cell * 5) = 7 * 7 * 30,排列方式如下图所示:
我们需要经过reshape操作,将其reshape成[1, cell_size, cell_size, num_classes + boxes_per_cell * 5],但是不能直接对输出进行reshape操作,要注意以其中的boundary为界限。归根结底,这个地方就是使用全连接层造成的不方便,前馈神经网络归根结底就是函数逼近器,是我们通过构造损失函数赋予其意义的,和概率图比较起来结构本身没啥可解释性。
损失函数
YOLO v1的网络结构比较简单,比较复杂的是损失函数,损失函数如下:
有目标的cell(在数据处理的时候标注的那个cell有目标),且与真实图片iou较大的那个boxes负责预测,参与到损失函数的计算。同理,有目标的那个cell,参与到类别损失计算,个人感觉,这个地方与其较概率,不如叫分类得分更好一点。
置信度计算为p(object) * iou, 也就是负责预测的那个cell里面两个boxes中iou大的那个,我感觉这个地方还能再改进一下,用giou替代iou,可能效果会更好一些。除此之外,还需要计算无目标的置信度损失。
import numpy as np
import tensorflow as tf
import yolo.config as cfg
#采用slim构建卷积使代码更简洁
slim = tf.contrib.slim
class YOLONet(object):
    """YOLO v1 network (24 conv layers + 2 FC layers, built with tf-slim).

    The flat network output is laid out as
    [class scores | box confidences | box coordinates], with `boundary1` and
    `boundary2` marking the section borders.
    """

    def __init__(self, is_training=True):
        self.classes = cfg.CLASSES              # the 20 PASCAL VOC classes
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE        # square network input size
        self.cell_size = cfg.CELL_SIZE          # the image is split into S x S cells
        self.boxes_per_cell = cfg.BOXES_PER_CELL  # B boxes per cell (typically 2)
        # Final output length: S * S * (C + 5 * B), see the paper.
        self.output_size = (self.cell_size * self.cell_size) * \
            (self.num_class + self.boxes_per_cell * 5)
        self.scale = 1.0 * self.image_size / self.cell_size  # pixels per cell
        # Flat output layout: [0, b1) classes, [b1, b2) confidences, [b2, end) boxes.
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 + \
            self.cell_size * self.cell_size * self.boxes_per_cell
        # Loss weights from the paper.
        self.object_scale = cfg.OBJECT_SCALE
        self.noobject_scale = cfg.NOOBJECT_SCALE
        self.class_scale = cfg.CLASS_SCALE
        self.coord_scale = cfg.COORD_SCALE
        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        self.alpha = cfg.ALPHA                  # leaky-ReLU slope
        # Build the graph here so the trainer can use self.images / self.labels
        # / self.total_loss (the original __init__ never built anything).
        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        self.logits = self.bulid_net(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)
        if is_training:
            self.labels = tf.placeholder(
                tf.float32, [None, self.cell_size, self.cell_size, 25],
                name='labels')
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)

    # NOTE(review): name kept as `bulid_net` (sic) for compatibility with
    # existing callers; `build_net` was clearly intended.
    def bulid_net(self, images, num_outputs, alpha, keep_prob=0.5,
                  is_training=True, scope='yolo'):
        """Build the backbone + head; returns the flat (batch, num_outputs) logits."""
        with tf.variable_scope(scope):
            with slim.arg_scope(
                    # Shared defaults for conv and FC layers.
                    [slim.conv2d, slim.fully_connected],
                    activation_fn=leaky_relu(alpha),
                    weights_regularizer=slim.l2_regularizer(0.0005),
                    weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
                # Pad so the stride-2 conv produces the expected size: (N, 454, 454, 3)
                net = tf.pad(images, [[0, 0], [3, 3], [3, 3], [0, 0]], name='pad_1')
                net = slim.conv2d(net, 64, 7, 2, padding='VALID', scope='conv_2')   # (N, 224, 224, 64)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')       # (N, 112, 112, 64)
                net = slim.conv2d(net, 192, 3, scope='conv_4')                      # (N, 112, 112, 192)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')       # (N, 56, 56, 192)
                net = slim.conv2d(net, 128, 1, scope='conv_6')                      # (N, 56, 56, 128)
                net = slim.conv2d(net, 256, 3, scope='conv_7')                      # (N, 56, 56, 256)
                net = slim.conv2d(net, 256, 1, scope='conv_8')                      # (N, 56, 56, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_9')                      # (N, 56, 56, 512)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')      # (N, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_11')                     # (N, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_12')                     # (N, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_13')                     # (N, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_14')                     # (N, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_15')                     # (N, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_16')                     # (N, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_17')                     # (N, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_18')                     # (N, 28, 28, 512)
                net = slim.conv2d(net, 512, 1, scope='conv_19')                     # (N, 28, 28, 512)
                net = slim.conv2d(net, 1024, 3, scope='conv_20')                    # (N, 28, 28, 1024)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')      # (N, 14, 14, 1024)
                net = slim.conv2d(net, 512, 1, scope='conv_22')                     # (N, 14, 14, 512)
                net = slim.conv2d(net, 1024, 3, scope='conv_23')                    # (N, 14, 14, 1024)
                net = slim.conv2d(net, 512, 1, scope='conv_24')                     # (N, 14, 14, 512)
                net = slim.conv2d(net, 1024, 3, scope='conv_25')                    # (N, 14, 14, 1024)
                net = slim.conv2d(net, 1024, 3, scope='conv_26')                    # (N, 14, 14, 1024)
                # Pad so the stride-2 conv below yields a 7x7 feature map.
                net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]], name='pad_27')  # (N, 15, 15, 1024)
                net = slim.conv2d(net, 1024, 3, 2, padding='VALID', scope='conv_28')  # (N, 7, 7, 1024)
                net = slim.conv2d(net, 1024, 3, scope='conv_29')                    # (N, 7, 7, 1024)
                net = slim.conv2d(net, 1024, 3, scope='conv_30')                    # (N, 7, 7, 1024)
                # Transpose so flatten walks channel-by-channel, row-by-row.
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                net = slim.flatten(net, scope='flat_32')                            # (N, 7*7*1024)
                net = slim.fully_connected(net, 512, scope='fc_33')                 # (N, 512)
                net = slim.fully_connected(net, 4096, scope='fc_34')                # (N, 4096)
                net = slim.dropout(                                                 # guard against overfitting
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                net = slim.fully_connected(                                         # (N, S*S*(C+5B))
                    net, num_outputs, activation_fn=None, scope='fc_36')
        return net

    def calc_iou(self, boxes1, boxes2, scope='iou'):
        """Elementwise IoU of two box tensors.

        Both inputs are [batch, S, S, B, 4] with boxes given as
        (x_center, y_center, w, h); returns [batch, S, S, B] in [0, 1].
        """
        with tf.variable_scope(scope):
            # Convert (center, size) to (upper-left, lower-right) corners.
            boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                 boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] + boxes1[..., 3] / 2.0],
                                axis=-1)
            boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                 boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] + boxes2[..., 3] / 2.0],
                                axis=-1)
            # Intersection rectangle (computed on the corner form; the
            # original mistakenly intersected the raw center/size tensors).
            lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
            rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])
            intersection = tf.maximum(0.0, rd - lu)
            inter_square = intersection[..., 0] * intersection[..., 1]
            # Union area, clamped away from zero to avoid division by zero.
            square1 = boxes1[..., 2] * boxes1[..., 3]
            square2 = boxes2[..., 2] * boxes2[..., 3]
            union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)
            return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)

    def loss_layer(self, predicts, labels, scope='loss_layer'):
        """Register the four YOLO loss terms via tf.losses.add_loss.

        `predicts`: flat FC output [batch, S*S*(C+5B)] laid out as
        [class scores | confidences | boxes]; `labels`: [batch, S, S, 25].
        """
        with tf.variable_scope(scope):
            # Slice the flat prediction at the section boundaries.
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [self.batch_size, self.cell_size, self.cell_size, self.num_class])
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])
            # Unpack the label tensor: objectness, boxes, one-hot classes.
            response = tf.reshape(
                labels[:, :, :, 0],
                [self.batch_size, self.cell_size, self.cell_size, 1])
            boxes = tf.reshape(
                labels[:, :, :, 1:5],
                [self.batch_size, self.cell_size, self.cell_size, 1, 4])
            # Duplicate the single GT box per cell to all B predictors and
            # rescale pixels -> fraction of the image.
            boxes = tf.tile(
                boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
            classes = tf.reshape(
                labels[:, :, :, 5:],
                [self.batch_size, self.cell_size, self.cell_size, self.num_class])
            # Per-cell column offsets; predictions are offsets from the cell
            # corner: x_offset = x_center * S / image_size - offset.
            offset = np.transpose(np.reshape(np.array(
                [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
                (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))
            offset = tf.reshape(
                tf.constant(offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))
            # Convert predictions to image-fraction coordinates for the IoU;
            # w/h are predicted as square roots, so square them back.
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)
            # IoU of each predictor against the GT box: [batch, S, S, B].
            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)
            # In a responding cell only the predictor with the highest IoU
            # is responsible for the object.
            object_mask = tf.reduce_max(iou_predict_truth, axis=-1, keep_dims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response
            noobject_mask = tf.ones_like(object_mask) - object_mask
            # GT boxes as (cell offset, cell offset, sqrt(w), sqrt(h)).
            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)
            # Classification loss: only cells that contain an object.
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale
            # Confidence loss for responsible predictors (target = IoU).
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale
            # Confidence loss for everything else (target = 0). The original
            # overwrote object_loss here, dropping the object term entirely.
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.noobject_scale
            # Coordinate loss: only the responsible predictor's box.
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coord_scale
            # Register all four terms with the tf.losses collection.
            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)
            tf.summary.scalar('class_loss', class_loss)
            tf.summary.scalar('object_loss', object_loss)
            tf.summary.scalar('noobject_loss', noobject_loss)
            tf.summary.scalar('coord_loss', coord_loss)
            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)
def leaky_relu(alpha):
    """Return an activation callable applying leaky ReLU with slope *alpha*."""
    return lambda inputs: tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
训练阶段
import os
import argparse
import datetime
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc
slim = tf.contrib.slim
class Solver(object):
    """Training driver: wires a YOLONet and a pascal_voc loader together,
    sets up the optimizer/summaries/checkpointing and runs the loop."""

    def __init__(self, net, data):
        self.net = net            # the YOLONet being trained
        self.data = data          # pascal_voc data source
        self.weights_file = cfg.WEIGHTS_FILE          # pretrained weights (may be None)
        self.max_iter = cfg.MAX_ITER                  # number of training steps
        self.initial_learning_rate = cfg.LEARNING_RATE
        self.decay_steps = cfg.DECAY_STEPS
        self.decay_rate = cfg.DECAY_RATE
        self.staircase = cfg.STAIRCASE
        self.summary_iter = cfg.SUMMARY_ITER          # summary every N steps
        self.save_iter = cfg.SAVE_ITER                # checkpoint every N steps
        # Timestamped output directory, e.g. data/pascal_voc/output/<date_time>.
        self.output_dir = os.path.join(
            cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.save_cfg()  # record the config used for this run
        self.variable_to_restore = tf.global_variables()
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
        self.ckpt_file = os.path.join(self.output_dir, 'yolo.ckpt')
        self.summary_op = tf.summary.merge_all()
        # Flush summaries to disk every 60 seconds.
        self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60)
        self.global_step = tf.train.create_global_step()
        # Exponentially decayed learning rate as suggested in the paper.
        self.learning_rate = tf.train.exponential_decay(
            self.initial_learning_rate, self.global_step, self.decay_steps,
            self.decay_rate, self.staircase, name='learning_rate')
        self.optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate)
        self.train_op = slim.learning.create_train_op(
            self.net.total_loss, self.optimizer, global_step=self.global_step)
        gpu_options = tf.GPUOptions()
        config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
        if self.weights_file is not None:
            # Restore from pretrained weights when available.
            print('Restoring weights from: ' + self.weights_file)
            self.saver.restore(self.sess, self.weights_file)
        self.writer.add_graph(self.sess.graph)

    def train(self):
        """Run the training loop, periodically logging, summarizing and saving."""
        train_timer = Timer()
        load_timer = Timer()
        for step in range(1, self.max_iter + 1):
            print("step: ", step)
            load_timer.tic()
            images, labels = self.data.get()  # one batch of images + labels
            load_timer.toc()
            feed_dict = {self.net.images: images,
                         self.net.labels: labels}
            if step % self.summary_iter == 0:
                if step % (self.summary_iter * 10) == 0:
                    # Run a step with summaries and print progress.
                    train_timer.tic()
                    summary_str, loss, _ = self.sess.run(
                        [self.summary_op, self.net.total_loss, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()
                    # One parenthesized literal: the original built this from
                    # three separate statements, so the formatted string was
                    # discarded and log_str was just the first fragment.
                    log_str = ('{} Epoch: {}, Step: {}, Learning rate: {}, '
                               'Loss: {:5.3f}\nSpeed: {:.3f}s/iter, '
                               'Load: {:.3f}s/iter, Remain: {}').format(
                        datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                        self.data.epoch,
                        int(step),
                        round(self.learning_rate.eval(session=self.sess), 6),
                        loss,
                        train_timer.average_time,
                        load_timer.average_time,
                        train_timer.remain(step, self.max_iter))
                    print(log_str)
                else:
                    # Run a step with summaries, no console logging.
                    train_timer.tic()
                    summary_str, _ = self.sess.run(
                        [self.summary_op, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()
                self.writer.add_summary(summary_str, step)
            else:
                # Plain training step.
                train_timer.tic()
                self.sess.run(self.train_op, feed_dict=feed_dict)
                train_timer.toc()
            if step % self.save_iter == 0:
                # Checkpoint for later evaluation / resuming.
                print('{} Saving checkpoint file to: {}'.format(
                    datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                    self.output_dir))
                self.saver.save(
                    self.sess, self.ckpt_file, global_step=self.global_step)
                print("save done!!!")

    def save_cfg(self):
        """Write every upper-case cfg entry to <output_dir>/config.txt."""
        with open(os.path.join(self.output_dir, 'config.txt'), 'w') as f:
            cfg_dict = cfg.__dict__
            for key in sorted(cfg_dict.keys()):
                if key[0].isupper():
                    cfg_str = '{}: {}\n'.format(key, cfg_dict[key])
                    f.write(cfg_str)
def update_config_paths(data_dir, weights_file):
    """Re-root every cfg path under *data_dir* and select *weights_file*."""
    print("应该是加载了YOLO_small.ckpt")
    pascal_root = os.path.join(data_dir, 'pascal_voc')
    cfg.DATA_PATH = data_dir
    cfg.PASCAL_PATH = pascal_root
    cfg.CACHE_PATH = os.path.join(pascal_root, 'cache')
    cfg.OUTPUT_DIR = os.path.join(pascal_root, 'output')
    cfg.WEIGHTS_DIR = os.path.join(pascal_root, 'weights')
    cfg.WEIGHTS_FILE = os.path.join(cfg.WEIGHTS_DIR, weights_file)
def main():
    """Entry point: parse CLI flags, prepare config and data, run training."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--weights', default='YOLO_small.ckpt', type=str)   # pretrained weights file
    arg_parser.add_argument('--data_dir', default='data', type=str)             # dataset root
    arg_parser.add_argument('--threshold', default=0.2, type=float)             # score threshold
    arg_parser.add_argument('--iou_threshold', default=0.5, type=float)         # IoU threshold
    arg_parser.add_argument('--gpu', default='', type=str)                      # GPU selection
    opts = arg_parser.parse_args()
    if opts.gpu is not None:
        cfg.GPU = opts.gpu
    if opts.data_dir != cfg.DATA_PATH:
        # Non-default data directory: re-root all config paths.
        update_config_paths(opts.data_dir, opts.weights)
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU
    yolo = YOLONet()                 # network graph
    pascal = pascal_voc('train')     # training data (includes flipped copies)
    solver = Solver(yolo, pascal)    # optimizer, learning-rate schedule, session
    print('Start training ...')
    solver.train()
    print('done!!!')
Timer模块
主要涉及到time类和datetime类的使用,比较简单。
import time
import datetime
class Timer(object):
    """A simple timer."""

    def __init__(self):
        self.init_time = time.time()  # construction time, used by remain()
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.remain_time = 0.

    def tic(self):
        # time.time() rather than time.clock(): clock() measures per-process
        # CPU time and does not normalize across threads.
        self.start_time = time.time()

    def toc(self, average=True):
        """Stop the current interval; return the running average (or the
        last interval when *average* is False)."""
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff

    def remain(self, iters, max_iters):
        """Estimate remaining wall time as 'H:MM:SS' after *iters* of *max_iters*."""
        if iters == 0:
            self.remain_time = 0
        else:
            # (elapsed / iterations done) * iterations left
            elapsed = time.time() - self.init_time
            self.remain_time = elapsed * (max_iters - iters) / iters
        return str(datetime.timedelta(seconds=int(self.remain_time)))
这个部分主要是代码的复现如果有不对的地方希望不吝指正。
Yolo v1的具体内容,建议看看论文,其后的V2,V3都是在这个基础上改进的,弄懂了V1,理解起来其他两个就非常容易了。Yolo系列难的地方在于损失函数,结构比较简单,Faster Rcnn结构比较复杂。过几天复现下Faster Rcnn的代码,弄懂了这两类的目标检测算法,其他的目标检测算法就很简单了。