# Instantiate the YOLOV3 model from the input tensor and the training flag.
self.model = YOLOV3(self.input_data, self.trainable)

然后进入 yolov3.py,执行 YOLOV3 类的 __init__ 函数:先获取相关参数,再调用 __build_nework(input_data) 函数创建网络,得到卷积后的大、中、小三个尺度的特征图,最后调用 decode() 函数对每个特征图进行解码。




class YOLOV3(object):
    """Implement tensorflow yolov3 here.

    Construction reads the model settings from the project configuration,
    builds the network, decodes the three detection heads and exposes the
    merged predictions as ``pred_multi_scale``.
    """
    def __init__(self, input_data, trainable):
        """Build the network graph and the decoded prediction tensors.

        Args:
            input_data: Input tensor handed to the network builder.
            trainable: Training flag forwarded to the network layers.
        """
        # Settings read from the project configuration.
        self.trainable        = trainable    # training flag
        self.classes          = utils.read_class_names(cfg.YOLO.CLASSES)   # class names
        self.num_class        = len(self.classes)   # number of classes
        self.strides          = np.array(cfg.YOLO.STRIDES)  # down-sampling factors, ordered small / medium / large head
        self.anchors          = utils.get_anchors(cfg.YOLO.ANCHORS)   # anchors, indexed per scale below
        self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE  # number of anchors per scale
        self.iou_loss_thresh  = cfg.YOLO.IOU_LOSS_THRESH   # IoU loss threshold
        self.upsample_method  = cfg.YOLO.UPSAMPLE_METHOD   # up-sampling method

        # Build the network; any construction error propagates unchanged.
        self.conv_lbbox, self.conv_mbbox, self.conv_sbbox = self.__build_nework(input_data)

        # Decode each head with the anchors and stride of its own scale.
        # The stride values noted below (8 / 16 / 32) come from the original
        # annotations -- confirm against cfg.YOLO.STRIDES.
        with tf.variable_scope('pred_sbbox'):
            self.pred_sbbox = self.decode(self.conv_sbbox, self.anchors[0], self.strides[0])   # strides[0]=8

        with tf.variable_scope('pred_mbbox'):
            self.pred_mbbox = self.decode(self.conv_mbbox, self.anchors[1], self.strides[1])   # strides[1]=16

        with tf.variable_scope('pred_lbbox'):
            self.pred_lbbox = self.decode(self.conv_lbbox, self.anchors[2], self.strides[2])   # strides[2]=32

        # Each prediction carries 4 box values + 1 confidence + one score per
        # class (85 when there are 80 classes).
        pred_depth = 5 + self.num_class

        # Flat merge of all scales: one row per predicted box.
        with tf.variable_scope('pred_multi_scale'):
            self.pred_multi_scale = tf.concat([tf.reshape(self.pred_sbbox, [-1, pred_depth]),
                                               tf.reshape(self.pred_mbbox, [-1, pred_depth]),
                                               tf.reshape(self.pred_lbbox, [-1, pred_depth])], axis=0, name='concat')

        # NOTE: the block below overrides the attribute assigned just above;
        # the first concat is still created in the graph.
        # The grid size is hand-coded: use 19 for a 608 input, 13 for a 416 input.
        grid_size = 19
        with tf.variable_scope('pred_multi_scale'):
            self.pred_multi_scale = tf.concat([tf.reshape(self.pred_sbbox, [-1, grid_size, grid_size, pred_depth]),
                                               tf.reshape(self.pred_mbbox, [-1, grid_size, grid_size, pred_depth]),
                                               tf.reshape(self.pred_lbbox, [-1, grid_size, grid_size, pred_depth])], axis=0, name='concat')


__build_nework(input_data) 函数


# 构建网络结构
    def __build_nework(self, input_data):
        """Assemble the three detection heads on top of the Darknet-53 backbone.

        Shape notes in the comments follow the original annotations, which
        assume a 416x416 input (13 / 26 / 52 grids) -- confirm for other sizes.

        Args:
            input_data: Input tensor passed to ``backbone.darknet53``.

        Returns:
            Tuple ``(conv_lbbox, conv_mbbox, conv_sbbox)`` of raw head outputs,
            each with ``3 * (num_class + 5)`` output channels.
        """
        is_training = self.trainable
        head_depth = 3 * (self.num_class + 5)

        route_1, route_2, net = backbone.darknet53(input_data, is_training)

        # ---- large-object head: backbone output is -1*13*13*1024 ----
        large_stack = (
            (1, 1, 1024, 512),
            (3, 3, 512, 1024),
            (1, 1, 1024, 512),
            (3, 3, 512, 1024),
            (1, 1, 1024, 512),
        )
        for layer_no, kernel in enumerate(large_stack, start=52):
            net = common.convolutional(net, kernel, is_training, 'conv%d' % layer_no)

        conv_lobj_branch = common.convolutional(
            net, (3, 3, 512, 1024), is_training, name='conv_lobj_branch')
        # -1*13*13*[3*(num_class + 5)]; no batch norm and no activation on the head output
        conv_lbbox = common.convolutional(
            conv_lobj_branch, (1, 1, 1024, head_depth),
            trainable=is_training, name='conv_lbbox', activate=False, bn=False)

        # ---- medium-object head ----
        # -1*13*13*512 --> -1*13*13*256
        net = common.convolutional(net, (1, 1, 512, 256), is_training, 'conv57')
        # Upsample (1/32 -> 1/16) to match route_2: -1*13*13*256 --> -1*26*26*256
        net = common.upsample(net, name='upsample0', method=self.upsample_method)

        with tf.variable_scope('route_1'):
            # route_2 is -1*26*26*512, so the concatenation is -1*26*26*768
            net = tf.concat([net, route_2], axis=-1)

        medium_stack = (
            (1, 1, 768, 256),
            (3, 3, 256, 512),
            (1, 1, 512, 256),
            (3, 3, 256, 512),
            (1, 1, 512, 256),
        )
        for layer_no, kernel in enumerate(medium_stack, start=58):
            net = common.convolutional(net, kernel, is_training, 'conv%d' % layer_no)

        conv_mobj_branch = common.convolutional(
            net, (3, 3, 256, 512), is_training, name='conv_mobj_branch')
        # -1*26*26*[3*(num_class + 5)]
        conv_mbbox = common.convolutional(
            conv_mobj_branch, (1, 1, 512, head_depth),
            trainable=is_training, name='conv_mbbox', activate=False, bn=False)

        # ---- small-object head ----
        net = common.convolutional(net, (1, 1, 256, 128), is_training, 'conv63')
        # -1*26*26*128 --> -1*52*52*128
        net = common.upsample(net, name='upsample1', method=self.upsample_method)

        with tf.variable_scope('route_2'):
            # route_1 is -1*52*52*256, so the concatenation is -1*52*52*384
            net = tf.concat([net, route_1], axis=-1)

        small_stack = (
            (1, 1, 384, 128),
            (3, 3, 128, 256),
            (1, 1, 256, 128),
            (3, 3, 128, 256),
            (1, 1, 256, 128),
        )
        for layer_no, kernel in enumerate(small_stack, start=64):
            net = common.convolutional(net, kernel, is_training, 'conv%d' % layer_no)

        conv_sobj_branch = common.convolutional(
            net, (3, 3, 128, 256), is_training, name='conv_sobj_branch')
        # -1*52*52*[3*(num_class + 5)]
        conv_sbbox = common.convolutional(
            conv_sobj_branch, (1, 1, 256, head_depth),
            trainable=is_training, name='conv_sbbox', activate=False, bn=False)

        # Per the original notes: -1*13*13*255, -1*26*26*255, -1*52*52*255
        return conv_lbbox, conv_mbbox, conv_sbbox



def darknet53(input_data, trainable):
    """Build the Darknet-53 backbone and return its three feature maps.

    The network alternates convolutions called with ``downsample=True`` and
    groups of residual blocks (1, 2, 8, 8 and 4 blocks), all created inside
    the ``'darknet'`` variable scope.

    Args:
        input_data: Input tensor; the first convolution expects 3 input channels.
        trainable: Training flag forwarded to every layer.

    Returns:
        Tuple ``(route_1, route_2, input_data)``: the outputs after the 256-,
        512- and 1024-channel stages. Per the original notes these are
        -1*52*52*256, -1*26*26*512 and -1*13*13*1024 (416x416 input assumed).
    """
    def _downsample(tensor, in_channels, out_channels, layer_name):
        # 3x3 convolution requested with downsample=True.
        return common.convolutional(tensor, filters_shape=(3, 3, in_channels, out_channels),
                                    trainable=trainable, name=layer_name, downsample=True)

    def _residual_stage(tensor, channels, count, first_index):
        # `count` residual blocks named residual<first_index>, residual<first_index+1>, ...
        # each squeezing to channels // 2 and expanding back to `channels`.
        for block_no in range(first_index, first_index + count):
            tensor = common.residual_block(tensor, channels, channels // 2, channels,
                                           trainable=trainable, name='residual%d' % block_no)
        return tensor

    with tf.variable_scope('darknet'):
        net = common.convolutional(input_data, filters_shape=(3, 3, 3, 32),
                                   trainable=trainable, name='conv0')

        net = _downsample(net, 32, 64, 'conv1')
        net = _residual_stage(net, 64, 1, 0)

        net = _downsample(net, 64, 128, 'conv4')
        net = _residual_stage(net, 128, 2, 1)

        net = _downsample(net, 128, 256, 'conv9')
        net = _residual_stage(net, 256, 8, 3)
        route_1 = net          # -1*52*52*256 per the original notes

        net = _downsample(net, 256, 512, 'conv26')
        net = _residual_stage(net, 512, 8, 11)
        route_2 = net          # -1*26*26*512 per the original notes

        net = _downsample(net, 512, 1024, 'conv43')
        net = _residual_stage(net, 1024, 4, 19)   # -1*13*13*1024 per the original notes

        return route_1, route_2, net




# 基本单元:zeropaddings(为true时)+卷积+BN+leaky_relu
def convolutional(input_data, filters_shape, trainable, name, downsample=False, activate=True, bn=True):
    """Apply one convolution unit: (zero-pad +) conv2d, batch norm or bias, optional leaky ReLU.

    Args:
        input_data: Input tensor. When downsampling, dimensions 1 and 2 are
            zero-padded, so an NHWC layout is assumed.
        filters_shape: Shape of the convolution kernel variable,
            ``(kernel_h, kernel_w, in_channels, out_channels)``.
        trainable: Passed as ``training`` to batch normalization. The weight
            and bias variables themselves are always created trainable.
        name: Variable scope that holds this layer's variables.
        downsample: If true, pad explicitly and convolve with stride 2 and
            'VALID' padding; otherwise use stride 1 with 'SAME' padding.
        activate: If true, apply leaky ReLU (alpha=0.1) to the result.
        bn: If true, apply batch normalization; otherwise add a learned bias.

    Returns:
        The output tensor of the unit.
    """
    with tf.variable_scope(name):
        if downsample:
            # Explicit padding followed by a stride-2 'VALID' convolution
            # shrinks the feature map without any pooling layer.
            pad_h, pad_w = (filters_shape[0] - 2) // 2 + 1, (filters_shape[1] - 2) // 2 + 1
            paddings = tf.constant([[0, 0], [pad_h, pad_h], [pad_w, pad_w], [0, 0]])
            input_data = tf.pad(input_data, paddings, 'CONSTANT')
            strides = (1, 2, 2, 1)
            padding = 'VALID'
        else:
            # Regular convolution keeps the spatial size.
            strides = (1, 1, 1, 1)
            padding = "SAME"

        weight = tf.get_variable(name='weight', dtype=tf.float32, trainable=True,
                                 shape=filters_shape, initializer=tf.random_normal_initializer(stddev=0.01))
        conv = tf.nn.conv2d(input=input_data, filter=weight, strides=strides, padding=padding)

        if bn:
            # Batch normalization supplies its own learned offset (beta),
            # so no separate bias is created.
            conv = tf.layers.batch_normalization(conv, beta_initializer=tf.zeros_initializer(),
                                                 moving_variance_initializer=tf.ones_initializer(), training=trainable)
        else:
            # Without batch normalization the layer needs an explicit bias.
            bias = tf.get_variable(name='bias', shape=filters_shape[-1], trainable=True,
                                   dtype=tf.float32, initializer=tf.constant_initializer(0.0))
            conv = tf.nn.bias_add(conv, bias)

        if activate:
            conv = tf.nn.leaky_relu(conv, alpha=0.1)

    return conv



# 基本单元: 卷积+BN+leaky_relu + 卷积+BN+leaky_relu
def residual_block(input_data, input_channel, filter_num1, filter_num2, trainable, name):
    """Run a 1x1 then a 3x3 convolution unit and add the block input back.

    Args:
        input_data: Input tensor; it is also the shortcut added to the output.
        input_channel: Channel count of ``input_data``.
        filter_num1: Output channels of the 1x1 convolution.
        filter_num2: Output channels of the 3x3 convolution; the addition
            requires this to be compatible with ``input_data``.
        trainable: Training flag forwarded to both convolution units.
        name: Variable scope that holds the block's layers.

    Returns:
        The sum of the second convolution's output and the block input.
    """
    with tf.variable_scope(name):
        squeezed = convolutional(input_data,
                                 filters_shape=(1, 1, input_channel, filter_num1),
                                 trainable=trainable, name='conv1')
        expanded = convolutional(squeezed,
                                 filters_shape=(3, 3, filter_num1, filter_num2),
                                 trainable=trainable, name='conv2')

        # Shortcut connection: convolution output plus the untouched input.
        return expanded + input_data



# 解码 1.3.1 边界框的预测
    def decode(self, conv_output, anchors, stride):
        """Decode one raw detection-head output into bounding-box predictions.

        Args:
            conv_output: Raw head tensor. It is reshaped to
                [batch_size, output_size, output_size, anchor_per_scale, 5 + num_classes],
                so a square feature map is assumed (dimension 1 is used for
                both spatial sizes).
            anchors: Anchor sizes for this scale; ``len(anchors)`` is the
                number of anchors per grid cell. They are multiplied by
                ``stride``, so presumably given in grid-cell units -- TODO confirm.
            stride: Down-sampling factor of this scale, used to map grid
                coordinates back to input-image coordinates.

        Returns:
            Tensor of shape [batch_size, output_size, output_size, anchor_per_scale, 5 + num_classes]
            that contains (x, y, w, h, score, probability).
        """
        conv_shape       = tf.shape(conv_output)
        batch_size       = conv_shape[0]
        output_size      = conv_shape[1]  # per the original notes: 13, 26 or 52
        # number of anchors per grid cell
        anchor_per_scale = len(anchors)

        # shape (batch_size, output_size, output_size, anchor_per_scale, 5 + self.num_class)
        conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, anchor_per_scale, 5 + self.num_class))

        conv_raw_dxdy = conv_output[:, :, :, :, 0:2]  # raw centre offsets (dx, dy)
        conv_raw_dwdh = conv_output[:, :, :, :, 2:4]  # raw size offsets (dw, dh)
        conv_raw_conf = conv_output[:, :, :, :, 4:5]  # raw objectness confidence
        conv_raw_prob = conv_output[:, :, :, :, 5: ]  # raw class scores

        # Build the cell grid: tf.tile replicates the index ranges so that
        # y[i, j] = i (row index) and x[i, j] = j (column index).
        y = tf.tile(tf.range(output_size, dtype=tf.int32)[:, tf.newaxis], [1, output_size])
        x = tf.tile(tf.range(output_size, dtype=tf.int32)[tf.newaxis, :], [output_size, 1])

        xy_grid = tf.concat([x[:, :, tf.newaxis], y[:, :, tf.newaxis]], axis=-1)
        # Top-left corner (Cx, Cy) of every cell, replicated per batch item and per anchor.
        xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, anchor_per_scale, 1])
        xy_grid = tf.cast(xy_grid, tf.float32)

        # tf.sigmoid(dxdy) is the relative position inside a grid cell; adding
        # the cell position and multiplying by the stride maps the centre to
        # input-image coordinates.
        pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * stride
        # tf.exp(dwdh) scales the anchors larger or smaller; the stride maps
        # the result to input-image scale.
        pred_wh = (tf.exp(conv_raw_dwdh) * anchors) * stride
        # Box centre and size together: (x, y, w, h).
        pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)

        pred_conf = tf.sigmoid(conv_raw_conf)   # objectness confidence in [0, 1]
        pred_prob = tf.sigmoid(conv_raw_prob)   # independent per-class probabilities

        return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)
