Advantages of the YOLO v1 algorithm

(1). It is fast, basically reaching real-time detection.
(2). It learns a generalizable representation of objects and generalizes well.
(3). It predicts from the global information of the image, unlike sliding-window and region-proposal approaches.

Data processing

## Data processing, based on the PASCAL VOC dataset
import numpy as np
import xml.etree.ElementTree as ET 
import cv2 
import pickle
import copy 
import os 
import yolo.config as cfg
class pascal_voc(object):
    def __init__(self, phase, rebuild=False):
        self.data_path = os.path.join(cfg.PASCAL_PATH, 'VOC2012')
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.rebuild = rebuild
        self.classes = cfg.CLASSES
        self.phase = phase
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        self.flipped = True
        self.epoch = 1
        self.gt_labels = None
        self.indicator = 0
        self.prepare()
		
    def get(self):
        labels = np.zeros((self.batch_size, self.cell_size, self.cell_size, 25))
        images = np.zeros((self.batch_size, self.image_size, self.image_size, 3))
        count = 0
        while count < self.batch_size:
            label = self.gt_labels[self.indicator]["label"]
            imname = self.gt_labels[self.indicator]["imname"]
            flipped = self.gt_labels[self.indicator]["flipped"]
            image = self.image_read(imname, flipped)
            labels[count, :, :, :] = label
            images[count, :, :, :] = image
            count += 1
            self.indicator += 1
            if self.indicator >= len(self.gt_labels):  # past the end of the dataset: reset the index, bump the epoch and reshuffle
                self.indicator = 0
                self.epoch += 1
                np.random.shuffle(self.gt_labels)
        return images, labels
	
    def prepare(self):
        gt_labels = self.load_labels()  # read the labels
        if self.flipped:  # add horizontally-flipped training examples
            print("Appending horizontally-flipped training examples ...")
            gt_labels_dp = copy.deepcopy(gt_labels)
            for idx in range(len(gt_labels_dp)):
                gt_labels_dp[idx]["flipped"] = True  # mark the copy as flipped
                gt_labels_dp[idx]["label"] = gt_labels_dp[idx]["label"][:, ::-1, :]  # flip the label grid horizontally
                for i in range(self.cell_size):
                    for j in range(self.cell_size):
                        if gt_labels_dp[idx]["label"][i, j, 0] == 1:  # recompute the x coordinate of the flipped center
                            gt_labels_dp[idx]["label"][i, j, 1] = \
                                self.image_size - 1 - gt_labels_dp[idx]["label"][i, j, 1]
            gt_labels += gt_labels_dp
        np.random.shuffle(gt_labels)  # shuffle the data
        self.gt_labels = gt_labels
        return gt_labels
		
    def image_read(self, imname, flipped):
        image = cv2.imread(imname)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # cv2 reads images as BGR; convert to RGB
        image = cv2.resize(image, (self.image_size, self.image_size)).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0  # normalize pixel values to [-1, 1]
        if flipped:  # flip the image horizontally if required
            image = image[:, ::-1, :]
        return image
	
    def load_labels(self):
        cache_file = os.path.join(self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')
        # if the cache file exists and no rebuild is requested, read the labels directly from the cache
        if os.path.isfile(cache_file) and not self.rebuild:
            print("Loading cache file from: " + cache_file)
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels
        print("Processing gt_labels from " + self.data_path)

        # create the cache directory if it does not exist
        if not os.path.isdir(self.cache_path):
            os.makedirs(self.cache_path)

        if self.phase == "train":
            txtname = os.path.join(self.data_path, "ImageSets", "Main", "trainval.txt")  # training split
        else:
            txtname = os.path.join(self.data_path, "ImageSets", "Main", "test.txt")  # test split
        with open(txtname, 'r') as f:
            self.image_idx = [x.strip() for x in f.readlines()]  # image indices

        gt_labels = []
        for idx in self.image_idx:
            label, num_objs = self.load_pascal_annotation(idx)
            if num_objs == 0:
                continue
            imname = os.path.join(self.data_path, "JPEGImages", idx + ".jpg")  # build the image path
            # gt_labels is a list of dicts,
            # each holding the label, the image path and the flipped flag
            gt_labels.append(
                {"label": label,
                 "imname": imname,
                 "flipped": False})
        print("Saving gt_labels to: " + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels
						
    def load_pascal_annotation(self, idx):
        imname = os.path.join(self.data_path, "JPEGImages", idx + ".jpg")
        image = cv2.imread(imname)
        # original width and height, needed to rescale the box centers to the resized image
        w = image.shape[1]
        h = image.shape[0]
        w_ratio = 1.0 * self.image_size / w
        h_ratio = 1.0 * self.image_size / h

        # label tensor of shape (self.cell_size, self.cell_size, 25); each grid cell predicts one object
        label = np.zeros((self.cell_size, self.cell_size, 25))
        # read the annotations for this image from the xml file
        label_path = os.path.join(self.data_path, "Annotations", idx + ".xml")
        tree = ET.parse(label_path)
        objs = tree.findall("object")

        for obj in objs:
            # read and rescale the box corners, clipped to the resized image
            x1 = max(min((float(obj.find("xmin").text) - 1) * w_ratio, self.image_size - 1), 0)
            y1 = max(min((float(obj.find("ymin").text) - 1) * h_ratio, self.image_size - 1), 0)
            x2 = max(min((float(obj.find("xmax").text) - 1) * w_ratio, self.image_size - 1), 0)
            y2 = max(min((float(obj.find("ymax").text) - 1) * h_ratio, self.image_size - 1), 0)
            cls_ind = self.class_to_ind[obj.find("name").text.lower().strip()]
            # build the (center_x, center_y, width, height) target
            boxes = [(x1 + x2) / 2.0, (y1 + y2) / 2.0, x2 - x1, y2 - y1]
            # grid cell that contains the object center: (x_c * cell_size) / image_size
            x_ind = int((boxes[0] * self.cell_size) / self.image_size)
            y_ind = int((boxes[1] * self.cell_size) / self.image_size)
            if label[y_ind, x_ind, 0] == 1:  # if this cell already holds an object, skip it
                continue
            label[y_ind, x_ind, 0] = 1
            label[y_ind, x_ind, 1:5] = boxes
            label[y_ind, x_ind, 5 + cls_ind] = 1
        return label, len(objs)  # return the label and the number of objects

        The data processing itself is fairly simple; the important thing is to know the output shape of the labels, since the later loss computation builds directly on it.
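        As a small sketch of that shape (plain NumPy, with assumed values of 448x448 images, a 7x7 grid and the 20 VOC classes, and a purely hypothetical object), one label tensor uses channel 0 as the objectness flag, channels 1-4 for the box center/width/height in pixels of the resized image, and channels 5-24 as the one-hot class:

import numpy as np

cell_size, image_size, num_classes = 7, 448, 20            # assumed config values
label = np.zeros((cell_size, cell_size, num_classes + 5))

# one hypothetical object: center (224, 100), size 80x60, class index 11
x_c, y_c, w, h, cls_ind = 224.0, 100.0, 80.0, 60.0, 11
x_ind = int(x_c * cell_size / image_size)                  # grid column containing the center
y_ind = int(y_c * cell_size / image_size)                  # grid row containing the center
label[y_ind, x_ind, 0] = 1                                  # this cell is responsible for an object
label[y_ind, x_ind, 1:5] = [x_c, y_c, w, h]                 # box in pixels of the resized image
label[y_ind, x_ind, 5 + cls_ind] = 1                        # one-hot class
print(label.shape)                                          # (7, 7, 25)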

Building the network

Network structure:



Network output:

        With BATCH_SIZE set to 1, build_net outputs a tensor of shape [1, cell_size * cell_size * (num_classes + boxes_per_cell * 5)], i.e. [1, 1470] for a 7x7 grid, 2 boxes per cell and 20 classes. The flat vector is laid out as all class scores first, then all box confidences, then all box coordinates.



        We then want to reshape this into [1, cell_size, cell_size, num_classes + boxes_per_cell * 5], but the output cannot be reshaped in one go: it has to be split at the boundary1/boundary2 offsets first. This inconvenience ultimately comes from using fully connected layers; a feed-forward network is just a function approximator whose outputs only gain meaning through the loss we construct, so compared with probabilistic graphical models the structure itself has little interpretability.
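        A small sketch of that splitting step, assuming the usual configuration of a 7x7 grid, 2 boxes per cell and 20 classes (so the flat prediction holds 7*7*30 = 1470 values); the same three slices reappear in loss_layer below:

import tensorflow as tf

batch_size, cell_size, num_class, boxes_per_cell = 1, 7, 20, 2    # assumed config values
boundary1 = cell_size * cell_size * num_class                     # 980: end of the class scores
boundary2 = boundary1 + cell_size * cell_size * boxes_per_cell    # 1078: end of the confidences

predicts = tf.placeholder(tf.float32, [batch_size, 1470])         # flat output of the last fc layer
predict_classes = tf.reshape(predicts[:, :boundary1],
                             [batch_size, cell_size, cell_size, num_class])
predict_scales = tf.reshape(predicts[:, boundary1:boundary2],
                            [batch_size, cell_size, cell_size, boxes_per_cell])
predict_boxes = tf.reshape(predicts[:, boundary2:],
                           [batch_size, cell_size, cell_size, boxes_per_cell, 4])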

Loss function

        The YOLO v1 network structure is fairly simple; the more involved part is the loss function, which is as follows:

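        For reference, the loss defined in the YOLO v1 paper, with $S$ grid cells per side, $B$ boxes per cell, $\mathbb{1}_{ij}^{obj}$ marking the box responsible for an object and $\mathbb{1}_{i}^{obj}$ marking a cell that contains an object:

$$
\begin{aligned}
\mathcal{L} ={}& \lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[(x_i-\hat{x}_i)^2+(y_i-\hat{y}_i)^2\right] \\
&+ \lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[\left(\sqrt{w_i}-\sqrt{\hat{w}_i}\right)^2+\left(\sqrt{h_i}-\sqrt{\hat{h}_i}\right)^2\right] \\
&+ \sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left(C_i-\hat{C}_i\right)^2
 + \lambda_{noobj}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{noobj}\left(C_i-\hat{C}_i\right)^2 \\
&+ \sum_{i=0}^{S^2}\mathbb{1}_{i}^{obj}\sum_{c\in\text{classes}}\left(p_i(c)-\hat{p}_i(c)\right)^2
\end{aligned}
$$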


        For a cell that contains an object (the cell marked during data processing), the box whose IoU with the ground truth is larger is the one responsible for the prediction and enters the loss. Likewise, only cells that contain an object contribute to the class loss; personally I think calling these values classification scores fits better than calling them probabilities.
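        A minimal NumPy sketch of that selection rule, with hypothetical values (the TensorFlow version appears in loss_layer below): within an object cell, only the box with the larger IoU gets an object mask of 1, everything else falls under the no-object mask.

import numpy as np

cell_size, boxes_per_cell = 7, 2                      # assumed config values
response = np.zeros((cell_size, cell_size, 1))        # objectness channel taken from the label
response[3, 3, 0] = 1.0                               # say cell (3, 3) contains an object
iou = np.zeros((cell_size, cell_size, boxes_per_cell))
iou[3, 3] = [0.55, 0.72]                              # hypothetical IoUs of that cell's two boxes

best_iou = iou.max(axis=-1, keepdims=True)
object_mask = (iou >= best_iou).astype(np.float32) * response  # 1 only for the better box of an object cell
noobject_mask = 1.0 - object_mask                              # every other box pays the no-object penalty
print(object_mask[3, 3])                                       # [0. 1.] -> the second box is responsible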

        The confidence target is P(object) * IoU, i.e. the IoU of the responsible box (the larger of the two IoUs in that cell). I feel this spot could be improved further by replacing IoU with GIoU, which might work a bit better. Besides this, a no-object confidence loss also has to be computed.
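        As a sketch of that idea (not part of the original code), GIoU for two axis-aligned boxes given as (x1, y1, x2, y2) penalizes the part of the smallest enclosing box not covered by the union, so even non-overlapping boxes receive a meaningful signal:

import numpy as np

def giou(box1, box2):
    """GIoU of two boxes given as (x1, y1, x2, y2); a plain-NumPy sketch."""
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter
    # smallest enclosing box
    cx1, cy1 = min(box1[0], box2[0]), min(box1[1], box2[1])
    cx2, cy2 = max(box1[2], box2[2]), max(box1[3], box2[3])
    c_area = (cx2 - cx1) * (cy2 - cy1)
    iou = inter / union
    return iou - (c_area - union) / c_area

print(giou([0, 0, 2, 2], [1, 1, 3, 3]))   # overlapping boxes
print(giou([0, 0, 1, 1], [2, 2, 3, 3]))   # disjoint boxes -> negative value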

import numpy as np
import tensorflow as tf 
import yolo.config as cfg 
# use slim to build the convolutions and keep the code concise
slim = tf.contrib.slim

class YOLONet(object):
    def __init__(self, is_training=True):
        self.classes = cfg.CLASSES    # the 20 classes of the PASCAL VOC dataset
        self.num_class = len(self.classes)  # 20 classes
        self.image_size = cfg.IMAGE_SIZE   # input image size
        self.cell_size = cfg.CELL_SIZE    # the input image is divided into a cell_size * cell_size grid
        self.boxes_per_cell = cfg.BOXES_PER_CELL  # number of bounding boxes predicted per cell (usually 2)
        self.output_size = (self.cell_size * self.cell_size) *\
            (self.num_class + self.boxes_per_cell * 5)    # size of the final output tensor, S*S*(C+5*B), see the paper
        self.scale = 1.0 * self.image_size / self.cell_size  # size of one cell in pixels
        self.boundary1 = self.cell_size * self.cell_size * self.num_class  # i.e. 7*7*20
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell  # i.e. 7*7*20 + 7*7*2

        # loss weights from the paper
        self.object_scale = cfg.OBJECT_SCALE
        self.noobject_scale = cfg.NOOBJECT_SCALE
        self.class_scale = cfg.CLASS_SCALE
        self.coord_scale = cfg.COORD_SCALE

        self.learning_rate = cfg.LEARNING_RATE  # learning rate
        self.batch_size = cfg.BATCH_SIZE  # batch size
        self.alpha = cfg.ALPHA  # leaky relu slope

        # placeholders, forward pass and total loss (the Solver below relies on these attributes)
        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3], name='images')
        self.logits = self.build_net(
            self.images, num_outputs=self.output_size, alpha=self.alpha, is_training=is_training)
        if is_training:
            self.labels = tf.placeholder(
                tf.float32, [None, self.cell_size, self.cell_size, 25], name='labels')
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)
	
	
    def build_net(self, images, num_outputs, alpha, keep_prob=0.5, is_training=True, scope="yolo"):
        with tf.variable_scope(scope):
            with slim.arg_scope(  # default arguments for the conv and fully-connected layers
                    [slim.conv2d, slim.fully_connected],
                    activation_fn=self.leaky_relu(alpha),  # leaky relu activation
                    weights_regularizer=slim.l2_regularizer(0.0005),  # l2 weight regularization
                    weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):  # weight initialization
                net = tf.pad(images, [[0, 0], [3, 3], [3, 3], [0, 0]], name='pad_1')  # pad the image so the strided conv divides evenly (my guess); shape (batch_size, 454, 454, 3)
                net = slim.conv2d(net, 64, 7, 2, padding='VALID', scope='conv_2')  # 64 channels, 7x7 kernel, stride 2; shape (batch_size, 224, 224, 64)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')  # 2x2 pooling, stride 2; shape (batch_size, 112, 112, 64)
                net = slim.conv2d(net, 192, 3, scope='conv_4')  # 192 channels, 3x3 kernel, stride 1; shape (batch_size, 112, 112, 192)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')  # pooling; shape (batch_size, 56, 56, 192)
                net = slim.conv2d(net, 128, 1, scope='conv_6')  # 128 channels, 1x1 kernel; shape (batch_size, 56, 56, 128)
                net = slim.conv2d(net, 256, 3, scope='conv_7')  # 256 channels, 3x3 kernel; shape (batch_size, 56, 56, 256)
                net = slim.conv2d(net, 256, 1, scope='conv_8')  # 256 channels, 1x1 kernel; shape (batch_size, 56, 56, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_9')  # 512 channels, 3x3 kernel; shape (batch_size, 56, 56, 512)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')  # 2x2 pooling, stride 2; shape (batch_size, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_11')  # 256 channels, 1x1 kernel; shape (batch_size, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_12')  # 512 channels, 3x3 kernel; shape (batch_size, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_13')  # 256 channels, 1x1 kernel; shape (batch_size, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_14')  # 512 channels, 3x3 kernel; shape (batch_size, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_15')  # 256 channels, 1x1 kernel; shape (batch_size, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_16')  # 512 channels, 3x3 kernel; shape (batch_size, 28, 28, 512)
                net = slim.conv2d(net, 256, 1, scope='conv_17')  # 256 channels, 1x1 kernel; shape (batch_size, 28, 28, 256)
                net = slim.conv2d(net, 512, 3, scope='conv_18')  # 512 channels, 3x3 kernel; shape (batch_size, 28, 28, 512)
                net = slim.conv2d(net, 512, 1, scope='conv_19')  # 512 channels, 1x1 kernel; shape (batch_size, 28, 28, 512)
                net = slim.conv2d(net, 1024, 3, scope='conv_20')  # 1024 channels, 3x3 kernel; shape (batch_size, 28, 28, 1024)
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')  # 2x2 pooling, stride 2; shape (batch_size, 14, 14, 1024)
                net = slim.conv2d(net, 512, 1, scope='conv_22')  # 512 channels, 1x1 kernel; shape (batch_size, 14, 14, 512)
                net = slim.conv2d(net, 1024, 3, scope='conv_23')  # 1024 channels, 3x3 kernel; shape (batch_size, 14, 14, 1024)
                net = slim.conv2d(net, 512, 1, scope='conv_24')  # 512 channels, 1x1 kernel; shape (batch_size, 14, 14, 512)
                net = slim.conv2d(net, 1024, 3, scope='conv_25')  # 1024 channels, 3x3 kernel; shape (batch_size, 14, 14, 1024)
                net = slim.conv2d(net, 1024, 3, scope='conv_26')  # 1024 channels, 3x3 kernel; shape (batch_size, 14, 14, 1024)
                # pad so that the stride-2 convolution below outputs a 7x7 feature map
                net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]], name='pad_27')  # shape (batch_size, 15, 15, 1024)
                net = slim.conv2d(net, 1024, 3, 2, padding='VALID', scope='conv_28')  # 1024 channels, 3x3 kernel, stride 2; shape (batch_size, 7, 7, 1024)
                net = slim.conv2d(net, 1024, 3, scope='conv_29')  # 1024 channels, 3x3 kernel; shape (batch_size, 7, 7, 1024)
                net = slim.conv2d(net, 1024, 3, scope='conv_30')  # 1024 channels, 3x3 kernel; shape (batch_size, 7, 7, 1024)
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                # the transpose makes the flatten unroll channel by channel (first channel row by row, then the second channel, ...)
                # in YOLO v2 this part was replaced with 1x1 convolutions; after two fully-connected layers the ordering has little interpretability anyway
                net = slim.flatten(net, scope='flat_32')  # flatten; shape (batch_size, 7 * 7 * 1024)
                net = slim.fully_connected(net, 512, scope='fc_33')  # fully-connected layer; shape (batch_size, 512)
                net = slim.fully_connected(net, 4096, scope='fc_34')  # fully-connected layer; shape (batch_size, 4096)
                net = slim.dropout(  # dropout to reduce overfitting
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                net = slim.fully_connected(  # final fully-connected layer; shape (batch_size, 7*7*30)
                    net, num_outputs, activation_fn=None, scope='fc_36')
        return net
	
    def calc_iou(self, boxes1, boxes2, scope="iou"):
        with tf.variable_scope(scope):
            # note that the inputs are 5-dimensional: [batch_size, cell_size, cell_size, boxes_per_cell, 4]
            # they hold (center_x, center_y, w, h); convert to (upper-left, lower-right) corners
            boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                 boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] + boxes1[..., 3] / 2.0], axis=-1)

            boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                 boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] + boxes2[..., 3] / 2.0], axis=-1)

            # intersection area
            lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
            rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])
            intersection = tf.maximum(0.0, rd - lu)
            inter_square = intersection[..., 0] * intersection[..., 1]

            # union area
            square1 = boxes1[..., 2] * boxes1[..., 3]
            square2 = boxes2[..., 2] * boxes2[..., 3]
            union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)

            # iou, clipped to the range [0, 1]
            iou = tf.clip_by_value(inter_square / union_square, 0.0, 1.0)
            return iou
	
    def loss_layer(self, predicts, labels, scope="loss_layer"):
        with tf.variable_scope(scope):
            # predicts is the output of the last fully-connected layer, laid out as
            # class scores, confidences, box coordinates; shape [batch_size, cell_size*cell_size*(20+10)]
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [self.batch_size, self.cell_size, self.cell_size, self.num_class])  # class scores
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])  # confidence of each box
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])  # coordinates of each box

            # unpack the labels
            response = tf.reshape(
                labels[:, :, :, 0],
                [self.batch_size, self.cell_size, self.cell_size, 1])  # objectness
            boxes = tf.reshape(
                labels[:, :, :, 1:5],
                [self.batch_size, self.cell_size, self.cell_size, 1, 4])  # box coordinates
            boxes = tf.tile(
                boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size  # shape (batch_size, cell_size, cell_size, boxes_per_cell, 4); dividing by image_size gives coordinates relative to the image, matching predict_boxes
            classes = tf.reshape(
                labels[:, :, :, 5:],
                [self.batch_size, self.cell_size, self.cell_size, self.num_class])

            # the network predicts the center as an offset from the cell's top-left corner, so build the grid of cell offsets
            offset = np.array(
                [i for i in range(self.cell_size)] * self.cell_size * self.boxes_per_cell).reshape(
                (self.boxes_per_cell, self.cell_size, self.cell_size))
            offset = np.transpose(offset, (1, 2, 0))
            offset = tf.reshape(
                tf.constant(offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))

            # recover box coordinates relative to the image so the iou can be computed
            # offset and center are related by: x_offset = (x_center * cell_size) / image_size - offset
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,  # center x as a fraction of the image, matching the label boxes
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),  # squared width, matching the square root in the paper
                 tf.square(predict_boxes[..., 3])], axis=-1)  # squared height
            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)  # iou; shape (batch_size, cell_size, cell_size, boxes_per_cell)

            # build the mask from the iou: in a cell with an object, the box with the larger iou is responsible, the others are not
            object_mask = tf.reduce_max(iou_predict_truth, axis=-1, keep_dims=True)
            object_mask = tf.cast((iou_predict_truth >= object_mask), tf.float32) * response
            noobject_mask = tf.ones_like(object_mask) - object_mask

            # convert the label boxes to offsets and take the square root of width and height
            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)

            # class loss: only cells that contain an object contribute
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name="class_loss") * self.class_scale

            # confidence losses
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name="object_loss") * self.object_scale
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name="noobject_loss") * self.noobject_scale

            # box loss: only the responsible box of a cell that contains an object contributes
            coord_mask = tf.expand_dims(object_mask, 4)  # add a trailing dimension
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name="coord_loss") * self.coord_scale

            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)  # collect the individual losses

            tf.summary.scalar('class_loss', class_loss)
            tf.summary.scalar('object_loss', object_loss)
            tf.summary.scalar('noobject_loss', noobject_loss)
            tf.summary.scalar('coord_loss', coord_loss)

            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)

    def leaky_relu(self, alpha):  # returns a leaky relu activation with the given slope
        def op(inputs):
            return tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
        return op
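        As a quick sanity check, a minimal sketch that feeds one random batch through the graph (it assumes the cfg values used throughout, that the class above lives in yolo/yolo_net.py as in the training script's import, and that __init__ creates the images/labels placeholders and total_loss as shown above):

import numpy as np
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet

net = YOLONet(is_training=True)
bs, s = cfg.BATCH_SIZE, cfg.IMAGE_SIZE
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    images = np.random.uniform(-1.0, 1.0, (bs, s, s, 3)).astype(np.float32)      # fake, already-normalized images
    labels = np.zeros((bs, cfg.CELL_SIZE, cfg.CELL_SIZE, 25), dtype=np.float32)  # empty labels: no objects
    loss = sess.run(net.total_loss, feed_dict={net.images: images, net.labels: labels})
    print('total_loss on random input:', loss)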

Training

import os
import argparse
import datetime
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc
slim = tf.contrib.slim

class Solver(object):
    def __init__(self, net, data):   # the YOLONet and the pascal_voc data
        self.net = net   # network to train
        self.data = data  # training or test data
        self.weights_file = cfg.WEIGHTS_FILE   # weights file
        self.max_iter = cfg.MAX_ITER  # number of iterations (can be customized)
        self.initial_learning_rate = cfg.LEARNING_RATE  # learning rate, 0.0001
        self.decay_steps = cfg.DECAY_STEPS  # decay steps
        self.decay_rate = cfg.DECAY_RATE   # decay rate
        self.staircase = cfg.STAIRCASE    # true
        self.summary_iter = cfg.SUMMARY_ITER  # SUMMARY_ITER, default 10
        self.save_iter = cfg.SAVE_ITER   # save iter, default 1000
        self.output_dir = os.path.join(
            cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))  # add time, data/pascal_voc/output/date_time
        if not os.path.exists(self.output_dir):  # create the output directory if it does not exist
            os.makedirs(self.output_dir)
        self.save_cfg()  # save the configuration

        self.variable_to_restore = tf.global_variables()   # all global variables of the graph
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)  # define the tf saver
        self.ckpt_file = os.path.join(self.output_dir, 'yolo.ckpt')  # path of the checkpoints written during training
        self.summary_op = tf.summary.merge_all()   # merge all summary ops for convenience
        self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60)   # write summaries to disk, flushed every 60 seconds

        self.global_step = tf.train.create_global_step()  # create the global step
        self.learning_rate = tf.train.exponential_decay(  # exponentially decaying learning rate, set following the yolo paper
            self.initial_learning_rate, self.global_step, self.decay_steps,
            self.decay_rate, self.staircase, name='learning_rate')
        self.optimizer = tf.train.GradientDescentOptimizer(   # plain stochastic gradient descent
            learning_rate=self.learning_rate)
        self.train_op = slim.learning.create_train_op(   # bundle the loss, optimizer and global step into one train op
            self.net.total_loss, self.optimizer, global_step=self.global_step)

        gpu_options = tf.GPUOptions()
        config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=config)  # configure the session
        self.sess.run(tf.global_variables_initializer())   # initialize all global variables

        if self.weights_file is not None:  # if a weights file is given
            print('Restoring weights from: ' + self.weights_file)  # load the pretrained model
            self.saver.restore(self.sess, self.weights_file)    # restore from the pretrained checkpoint

        self.writer.add_graph(self.sess.graph)  # add the graph, i.e. tf.summary.FileWriter(".").add_graph(self.sess.graph)

    def train(self):  #start training

        train_timer = Timer()  #train_timer
        load_timer = Timer()   #load_timer

        for step in range(1, self.max_iter + 1):   # start training
            print("step: ", step)
            load_timer.tic()
            images, labels = self.data.get()    # fetch a batch of images and the corresponding labels
            load_timer.toc()
            feed_dict = {self.net.images: images,
                         self.net.labels: labels}  # feed the data

            if step % self.summary_iter == 0:
                if step % (self.summary_iter * 10) == 0:   # print some training information

                    train_timer.tic()
                    summary_str, loss, _ = self.sess.run(
                        [self.summary_op, self.net.total_loss, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                    log_str = ('{} Epoch: {}, Step: {}, Learning rate: {},'
                               ' Loss: {:5.3f}\nSpeed: {:.3f}s/iter,'
                               ' Load: {:.3f}s/iter, Remain: {}').format(
                        datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                        self.data.epoch,
                        int(step),
                        round(self.learning_rate.eval(session=self.sess), 6),
                        loss,
                        train_timer.average_time,
                        load_timer.average_time,
                        train_timer.remain(step, self.max_iter))
                    print(log_str)

                else:
                    train_timer.tic()
                    summary_str, _ = self.sess.run(
                        [self.summary_op, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                self.writer.add_summary(summary_str, step)

            else:    # just train without printing information
                train_timer.tic()
                self.sess.run(self.train_op, feed_dict=feed_dict)
                train_timer.toc()

            if step % self.save_iter == 0:   # save a checkpoint for later testing
                print('{} Saving checkpoint file to: {}'.format(
                    datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                    self.output_dir))
                self.saver.save(    # save the session, writing the model files
                    self.sess, self.ckpt_file, global_step=self.global_step)
                print("save done!!!")
    def save_cfg(self):

        with open(os.path.join(self.output_dir, 'config.txt'), 'w') as f:   # write the configuration to a file
            cfg_dict = cfg.__dict__
            for key in sorted(cfg_dict.keys()):
                if key[0].isupper():
                    cfg_str = '{}: {}\n'.format(key, cfg_dict[key])
                    f.write(cfg_str)


def update_config_paths(data_dir, weights_file):    # update the paths in the config

    print("This should be loading YOLO_small.ckpt")
    cfg.DATA_PATH = data_dir
    cfg.PASCAL_PATH = os.path.join(data_dir, 'pascal_voc')
    cfg.CACHE_PATH = os.path.join(cfg.PASCAL_PATH, 'cache')
    cfg.OUTPUT_DIR = os.path.join(cfg.PASCAL_PATH, 'output')
    cfg.WEIGHTS_DIR = os.path.join(cfg.PASCAL_PATH, 'weights')

    cfg.WEIGHTS_FILE = os.path.join(cfg.WEIGHTS_DIR, weights_file)


def main():    # command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)  # weights file
    parser.add_argument('--data_dir', default="data", type=str)  # data directory
    parser.add_argument('--threshold', default=0.2, type=float)  # detection threshold
    parser.add_argument('--iou_threshold', default=0.5, type=float)  # IOU threshold
    parser.add_argument('--gpu', default='', type=str)   # which gpu to train on
    args = parser.parse_args()

    if args.gpu is not None:   # which gpu to train on
        cfg.GPU = args.gpu

    if args.data_dir != cfg.DATA_PATH:
        update_config_paths(args.data_dir, args.weights)

    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU

    yolo = YOLONet()   # the Yolo network
    pascal = pascal_voc('train')     # training data, including the horizontally-flipped examples

    solver = Solver(yolo, pascal)  # set up the training environment, including the optimizer and learning rate

    print('Start training ...')
    solver.train()  #start training
    print('done!!!')

    # f = open('result.txt', 'w')
    # f.write('train finished!!!!')
    # f.close()

Timer module

        This mainly involves the time and datetime modules and is fairly simple.

import time
import datetime


class Timer(object):
    '''
    A simple timer.
    '''

    def __init__(self):
        self.init_time = time.time()
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.remain_time = 0.

    def tic(self):
        # using time.time instead of time.clock because time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        if average:
            return self.average_time
        else:
            return self.diff

    def remain(self, iters, max_iters):
        if iters == 0:
            self.remain_time = 0
        else:
            self.remain_time = (time.time() - self.init_time) * (max_iters - iters) / iters  # estimated time left: (elapsed time / iterations done) * iterations remaining
        return str(datetime.timedelta(seconds=int(self.remain_time)))  # format the output as h:mm:ss
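        Typical usage inside a training loop, as a small sketch (the import path matches the one used in the training script above; the loop body is left as a placeholder):

from utils.timer import Timer

timer = Timer()
max_iters = 100
for step in range(1, max_iters + 1):
    timer.tic()
    # ... one training step would run here ...
    timer.toc()
    if step % 10 == 0:
        print('avg {:.3f}s/iter, remaining {}'.format(
            timer.average_time, timer.remain(step, max_iters)))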

         This part is mainly a re-implementation of the code; if anything is wrong, corrections are welcome.
        For the details of YOLO v1, I recommend reading the paper; v2 and v3 are improvements built on top of it, so once v1 is understood the other two are easy to follow. The hard part of the YOLO series is the loss function, while the structure is fairly simple; Faster R-CNN, by contrast, has a more complex structure. In a few days I plan to reproduce the Faster R-CNN code; once these two families of detectors are understood, other detection algorithms are straightforward.