目标检测—1 SSD

  • 1 主干网络
  • 1.1 结构
  • 1.2 代码
  • 2 损失
  • 2.1 结构
  • 2.2 代码
  • 3 代码流程
  • 3.1 数据处理
  • 3.2 训练
  • 3.3 预测


#1 SSD

1 主干网络

(图:SSD300 网络结构示意图)


SSD以VGG16为主干网络,提取第4次和第5次下采样的特征层,根据第5次下采样特征继续下采样4次,一共6个特征层预测目标。不同尺寸的特征层预测不同尺寸的目标。

1.1 结构

(1) VGG

input-->net['input']
    ↓   
net['input']-->Conv2D*2+MaxPooling2D-->net['pool1']
    ↓
net['pool1']-->Conv2D*2+MaxPooling2D-->net['pool2']
    ↓
net['pool2']-->Conv2D*3+MaxPooling2D-->net['pool3']
    ↓
net['pool3']-->Conv2D*3+MaxPooling2D-->net['pool4']  
    ↓  
net['pool4']-->Conv2D*3+MaxPooling2D-->net['pool5'] 
    ↓
net['pool5']-->Conv2D-->net['fc6']
    ↓
net['fc6']-->Conv2D-->net['fc7']
    ↓
net['fc7']-->Conv2D+ZeroPadding2D+Conv2D-->net['conv6_2']    
    ↓ 
net['conv6_2']-->Conv2D+ZeroPadding2D+Conv2D-->net['conv7_2'] 
    ↓
net['conv7_2']-->Conv2D*2-->net['conv8_2']
    ↓
net['conv8_2']-->Conv2D*2-->net['conv9_2']

(2) SSD300

input
    ↓
net = VGG16(input_tensor)
    ↓
net['conv4_3']-->Normalize-->net['conv4_3_norm']
net['conv4_3_norm']-->Conv2D+Flatten-->net['conv4_3_norm_mbox_loc_flat']
net['conv4_3_norm']-->Conv2D+Flatten-->net['conv4_3_norm_mbox_conf_flat']
net['conv4_3_norm']-->priorbox-->net['conv4_3_norm_mbox_priorbox']
    ↓
net['fc7']-->Conv2D+Flatten-->net['fc7_mbox_loc_flat']
net['fc7']-->Conv2D+Flatten-->net['fc7_mbox_conf_flat']
net['fc7']-->priorbox-->net['fc7_mbox_priorbox']
    ↓
net['conv6_2']-->Conv2D+Flatten-->net['conv6_2_mbox_loc_flat'] 
net['conv6_2']-->Conv2D+Flatten-->net['conv6_2_mbox_conf_flat']
net['conv6_2']-->priorbox-->net['conv6_2_mbox_priorbox']
    ↓
net['conv7_2']-->Conv2D+Flatten-->net['conv7_2_mbox_loc_flat'] 
net['conv7_2']-->Conv2D+Flatten-->net['conv7_2_mbox_conf_flat']
net['conv7_2']-->priorbox-->net['conv7_2_mbox_priorbox']
    ↓
net['conv8_2']-->Conv2D+Flatten-->net['conv8_2_mbox_loc_flat'] 
net['conv8_2']-->Conv2D+Flatten-->net['conv8_2_mbox_conf_flat']
net['conv8_2']-->priorbox-->net['conv8_2_mbox_priorbox']   
    ↓  
net['conv9_2']-->Conv2D+Flatten-->net['conv9_2_mbox_loc_flat'] 
net['conv9_2']-->Conv2D+Flatten-->net['conv9_2_mbox_conf_flat']
net['conv9_2']-->priorbox-->net['conv9_2_mbox_priorbox']

1.2 代码

# VGG
import keras.backend as K
from keras.layers import Activation
from keras.layers import Conv2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import GlobalAveragePooling2D
from keras.layers import Input
from keras.layers import MaxPooling2D
from keras.layers import merge, concatenate
from keras.layers import Reshape
from keras.layers import ZeroPadding2D
from keras.models import Model

def VGG16(input_tensor):
    """SSD backbone: the VGG16 convolutional trunk plus the extra SSD blocks.

    Returns a dict mapping layer names to output tensors. The keys
    'conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2' and 'conv9_2' are the
    six feature maps SSD300 predicts from.
    """
    net = {'input': input_tensor}
    x = input_tensor

    # Plain VGG16 blocks 1-5, described as (block index, filters, #convs).
    # Spatial sizes for a 300x300 input: 300 -> 150 -> 75 -> 38 -> 19 -> 19
    # (pool5 uses a 3x3 window with stride 1, so block 5 keeps 19x19).
    for block, filters, n_convs in ((1, 64, 2), (2, 128, 2),
                                    (3, 256, 3), (4, 512, 3), (5, 512, 3)):
        for idx in range(1, n_convs + 1):
            conv_name = 'conv{}_{}'.format(block, idx)
            x = Conv2D(filters, kernel_size=(3,3),
                       activation='relu',
                       padding='same',
                       name=conv_name)(x)
            net[conv_name] = x
        pool_name = 'pool{}'.format(block)
        if block < 5:
            x = MaxPooling2D((2, 2), strides=(2, 2), padding='same',
                             name=pool_name)(x)
        else:
            x = MaxPooling2D((3, 3), strides=(1, 1), padding='same',
                             name=pool_name)(x)
        net[pool_name] = x

    # fc6/fc7 are VGG's fully-connected layers recast as convolutions;
    # fc6 uses a dilated (rate 6) 3x3 kernel. 19,19,512 -> 19,19,1024.
    net['fc6'] = Conv2D(1024, kernel_size=(3,3), dilation_rate=(6, 6),
                        activation='relu', padding='same',
                        name='fc6')(net['pool5'])
    # 19,19,1024 -> 19,19,1024
    net['fc7'] = Conv2D(1024, kernel_size=(1,1), activation='relu',
                        padding='same', name='fc7')(net['fc6'])

    # Extra SSD blocks 6 and 7: 1x1 bottleneck, explicit zero padding, then
    # a stride-2 3x3 conv that halves the spatial resolution.
    # Block 6: 19,19,1024 -> 10,10,512
    net['conv6_1'] = Conv2D(256, kernel_size=(1,1), activation='relu',
                            padding='same', name='conv6_1')(net['fc7'])
    net['conv6_2'] = ZeroPadding2D(padding=((1, 1), (1, 1)),
                                   name='conv6_padding')(net['conv6_1'])
    net['conv6_2'] = Conv2D(512, kernel_size=(3,3), strides=(2, 2),
                            activation='relu',
                            name='conv6_2')(net['conv6_2'])
    # Block 7: 10,10,512 -> 5,5,256
    net['conv7_1'] = Conv2D(128, kernel_size=(1,1), activation='relu',
                            padding='same',
                            name='conv7_1')(net['conv6_2'])
    net['conv7_2'] = ZeroPadding2D(padding=((1, 1), (1, 1)),
                                   name='conv7_padding')(net['conv7_1'])
    net['conv7_2'] = Conv2D(256, kernel_size=(3,3), strides=(2, 2),
                            activation='relu', padding='valid',
                            name='conv7_2')(net['conv7_2'])

    # Extra SSD blocks 8 and 9: stride-1 'valid' 3x3 convs shrink the map
    # 5x5 -> 3x3 -> 1x1 without explicit padding.
    net['conv8_1'] = Conv2D(128, kernel_size=(1,1), activation='relu',
                            padding='same',
                            name='conv8_1')(net['conv7_2'])
    net['conv8_2'] = Conv2D(256, kernel_size=(3,3), strides=(1, 1),
                            activation='relu', padding='valid',
                            name='conv8_2')(net['conv8_1'])
    net['conv9_1'] = Conv2D(128, kernel_size=(1,1), activation='relu',
                            padding='same',
                            name='conv9_1')(net['conv8_2'])
    net['conv9_2'] = Conv2D(256, kernel_size=(3,3), strides=(1, 1),
                            activation='relu', padding='valid',
                            name='conv9_2')(net['conv9_1'])
    return net

if __name__ == "__main__":
    # Smoke test: build the backbone on a 300x300x3 input and print every
    # tensor the net dict holds.
    from keras.layers import Input

    inputs = Input(shape=[300, 300, 3])
    backbone = VGG16(inputs)
    for tensor in backbone.values():
        print(tensor)
import keras.backend as K
from keras.layers import Activation
#from keras.layers import AtrousConvolution2D
from keras.layers import Conv2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import GlobalAveragePooling2D
from keras.layers import Input
from keras.layers import MaxPooling2D
from keras.layers import merge, concatenate
from keras.layers import Reshape
from keras.layers import ZeroPadding2D
from keras.models import Model
from nets.VGG16 import VGG16
from nets.ssd_layers import Normalize
from nets.ssd_layers import PriorBox


def _mbox_head(net, key, num_priors, num_classes, img_size,
               min_size, max_size, aspect_ratios):
    """Attach one SSD detection head to the feature layer `net[key]`.

    Adds the entries '<key>_mbox_loc', '<key>_mbox_loc_flat',
    '<key>_mbox_conf', '<key>_mbox_conf_flat' and '<key>_mbox_priorbox'
    to `net` in place.
    """
    feat = net[key]
    # num_priors anchors per cell, each with 4 offsets (x, y, w, h).
    net[key + '_mbox_loc'] = Conv2D(num_priors * 4, kernel_size=(3,3),
                                    padding='same',
                                    name=key + '_mbox_loc')(feat)
    net[key + '_mbox_loc_flat'] = Flatten(
        name=key + '_mbox_loc_flat')(net[key + '_mbox_loc'])
    # num_priors anchors per cell, one score per class (background included).
    net[key + '_mbox_conf'] = Conv2D(num_priors * num_classes, kernel_size=(3,3),
                                     padding='same',
                                     name=key + '_mbox_conf')(feat)
    net[key + '_mbox_conf_flat'] = Flatten(
        name=key + '_mbox_conf_flat')(net[key + '_mbox_conf'])
    priorbox = PriorBox(img_size, min_size, max_size=max_size,
                        aspect_ratios=aspect_ratios,
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name=key + '_mbox_priorbox')
    net[key + '_mbox_priorbox'] = priorbox(feat)


def SSD300(input_shape, num_classes=21):
    """Build the SSD300 detector on top of the VGG16 backbone.

    Args:
        input_shape: (height, width, channels) of the input, e.g. (300, 300, 3).
        num_classes: number of classes including background (default 21 = VOC).

    Returns:
        A keras Model whose output is (batch, 8732, 4 + num_classes + 8):
        4 box offsets + class scores + prior box x1y1x2y2 + 4 variances.
    """
    # 300,300,3
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])  # (width, height)

    # Backbone feature dict.
    net = VGG16(input_tensor)

    # conv4_3 is L2-normalized (learned scale, init 20) before prediction,
    # as in the SSD paper, because its activations are large.
    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])

    # One head per feature map:
    # (feature key, priors per cell, min_size, max_size, aspect_ratios).
    # Box counts: 38^2*4 + 19^2*6 + 10^2*6 + 5^2*6 + 3^2*4 + 1^2*4 = 8732.
    head_configs = [
        ('conv4_3_norm', 4, 30.0, 60.0, [2]),
        ('fc7', 6, 60.0, 111.0, [2, 3]),
        ('conv6_2', 6, 111.0, 162.0, [2, 3]),
        ('conv7_2', 6, 162.0, 213.0, [2, 3]),
        ('conv8_2', 4, 213.0, 264.0, [2]),
        ('conv9_2', 4, 264.0, 315.0, [2]),
    ]
    for key, num_priors, min_size, max_size, aspect_ratios in head_configs:
        _mbox_head(net, key, num_priors, num_classes, img_size,
                   min_size, max_size, aspect_ratios)

    # Stack the six heads along the box axis.
    keys = [cfg[0] for cfg in head_configs]
    net['mbox_loc'] = concatenate([net[k + '_mbox_loc_flat'] for k in keys],
                                  axis=1, name='mbox_loc')
    net['mbox_conf'] = concatenate([net[k + '_mbox_conf_flat'] for k in keys],
                                   axis=1, name='mbox_conf')
    net['mbox_priorbox'] = concatenate([net[k + '_mbox_priorbox'] for k in keys],
                                       axis=1, name='mbox_priorbox')

    # BUGFIX: tensors expose no 'int_shape' attribute (K.int_shape is a
    # function), so the old `elif hasattr(..., 'int_shape')` fallback could
    # leave num_boxes unbound. Use a plain else branch instead.
    if hasattr(net['mbox_loc'], '_keras_shape'):
        num_boxes = net['mbox_loc']._keras_shape[-1] // 4
    else:
        num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4

    # 8732,4
    net['mbox_loc'] = Reshape((num_boxes, 4),
                              name='mbox_loc_final')(net['mbox_loc'])
    # 8732,num_classes (softmax over classes, background included)
    net['mbox_conf'] = Reshape((num_boxes, num_classes),
                               name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax',
                                  name='mbox_conf_final')(net['mbox_conf'])

    # (None, 8732, 4 + num_classes + 8): offsets + scores + priors + variances.
    net['predictions'] = concatenate([net['mbox_loc'],
                               net['mbox_conf'],
                               net['mbox_priorbox']],
                               axis=2, name='predictions')
    # NOTE: the leftover debug loop that printed every net entry on each
    # model construction has been removed.
    model = Model(net['input'], net['predictions'])
    return model


'''
if __name__=='__main__':
    model = SSD300((300,300,3), num_classes=21)
    # model.summary()
    TensorShape([Dimension(None), Dimension(5776), Dimension(8)])
    TensorShape([Dimension(None), Dimension(2166), Dimension(8)])
    TensorShape([Dimension(None), Dimension(600), Dimension(8)])
    TensorShape([Dimension(None), Dimension(150), Dimension(8)])
    TensorShape([Dimension(None), Dimension(36), Dimension(8)])
    TensorShape([Dimension(None), Dimension(4), Dimension(8)])
'''

2 损失

2.1 结构

(图:SSD 损失计算结构示意图)


SSD 损失分为回归损失和分类损失。回归损失用Smooth L1计算。分类损失使用交叉熵损失计算。

2.2 代码

class MultiboxLoss(object):
    """SSD multibox loss: Smooth-L1 localization loss on positive priors
    plus softmax confidence loss with hard negative mining.

    NOTE(review): this listing uses TF1-style ops (`tf.to_float`, `tf.log`)
    but no `import tensorflow as tf` is visible in this chunk — confirm the
    import exists in the module this class actually lives in.
    """
    def __init__(self, num_classes, alpha=1.0, neg_pos_ratio=3.0,
                 background_label_id=0, negatives_for_hard=100.0):
        # Total number of classes, background included.
        self.num_classes = num_classes
        # Weight of the localization term relative to the confidence term.
        self.alpha = alpha
        # Mine at most neg_pos_ratio negatives per positive prior.
        self.neg_pos_ratio = neg_pos_ratio
        if background_label_id != 0:
            raise Exception('Only 0 as background label id is supported')
        self.background_label_id = background_label_id
        # Fallback negative count used when a batch has no positives at all.
        self.negatives_for_hard = negatives_for_hard

    def _l1_smooth_loss(self, y_true, y_pred):
        """Smooth-L1 loss summed over the last axis: quadratic for
        |error| < 1, linear beyond."""
        abs_loss = tf.abs(y_true - y_pred)
        sq_loss = 0.5 * (y_true - y_pred)**2
        l1_loss = tf.where(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5)
        return tf.reduce_sum(l1_loss, -1)

    def _softmax_loss(self, y_true, y_pred):
        """Cross-entropy over the last axis; predictions are clipped away
        from zero so the log stays finite."""
        y_pred = tf.maximum(y_pred, 1e-7)
        softmax_loss = -tf.reduce_sum(y_true * tf.log(y_pred),
                                      axis=-1)
        return softmax_loss

    def compute_loss(self, y_true, y_pred):
        """Return the total SSD loss for a batch.

        y_true / y_pred: (batch, num_priors, 4 + num_classes + 8), laid out
        as 4 encoded box offsets, num_classes scores, then 8 bookkeeping
        values; y_true[:, :, -8] is the positive-match indicator.
        """
        batch_size = tf.shape(y_true)[0]             # number of images in the batch
        num_boxes = tf.to_float(tf.shape(y_true)[1]) # priors per image (8732)

        # Per-prior classification loss:
        # batch_size,8732,(4+num_classes+8) -> batch_size,8732
        conf_loss = self._softmax_loss(y_true[:, :, 4:-8],
                                       y_pred[:, :, 4:-8])
        # Per-prior localization loss:
        # batch_size,8732,4 -> batch_size,8732
        loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
                                        y_pred[:, :, :4])

        # Number of positive priors in each image.
        num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
        # Localization loss over positive priors only, per image.
        pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8],
                                     axis=1)
        # Confidence loss over positive priors only, per image.
        pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8],
                                      axis=1)

        # Negatives to mine per image: neg_pos_ratio * positives, capped by
        # the number of remaining (non-positive) priors.
        num_neg = tf.minimum(self.neg_pos_ratio * num_pos,
                             num_boxes - num_pos)

        # Per-image boolean: does this image have any negatives to mine?
        pos_num_neg_mask = tf.greater(num_neg, 0)
        # 1.0 if at least one image has negatives, else 0.0.
        has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
        num_neg = tf.concat( axis=0,values=[num_neg,  # append the fallback count; it is non-zero only when no image has negatives
                                [(1 - has_min) * self.negatives_for_hard]])
        # Average negative count over the entries that are > 0.
        num_neg_batch = tf.reduce_mean(tf.boolean_mask(num_neg,
                                                      tf.greater(num_neg, 0)))
        num_neg_batch = tf.to_int32(num_neg_batch)

        # Class confidences live in [confs_start:confs_end], i.e. [5:-8]:
        # skip the 4 offsets and the background column.
        confs_start = 4 + self.background_label_id + 1
        confs_end = confs_start + self.num_classes - 1

        # Hard negative mining: take each prior's highest non-background
        # confidence, mask out positives, and keep the top-k as negatives.
        max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end],
                                  axis=2)
        _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
                                 k=num_neg_batch)

        # Flatten (image, prior) index pairs into indices over the whole
        # flattened batch so a single gather can fetch the mined losses.
        batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
        batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
        full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
                        tf.reshape(indices, [-1]))

        # full_indices = tf.concat(2, [tf.expand_dims(batch_idx, 2),
        #                              tf.expand_dims(indices, 2)])
        # neg_conf_loss = tf.gather_nd(conf_loss, full_indices)
        neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]),
                                  full_indices)
        neg_conf_loss = tf.reshape(neg_conf_loss,
                                   [batch_size, num_neg_batch])
        neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)

        # Total = (positive + mined-negative confidence loss) / num_pos
        #         + alpha * positive localization loss / num_pos;
        # zero num_pos entries are replaced by 1 to avoid division by zero.
        num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                            tf.ones_like(num_pos))
        total_loss = tf.reduce_sum(pos_conf_loss) + tf.reduce_sum(neg_conf_loss)
        total_loss /= tf.reduce_sum(num_pos)
        total_loss += tf.reduce_sum(self.alpha * pos_loc_loss) / tf.reduce_sum(num_pos)

        return total_loss

3 代码流程

3.1 数据处理

把数据分为训练集、验证集、测试集

'''voc2retinanet.py
1.0 设置xml地址和处理数据的存放地址
2.1 得到所有的xml文件
2.2 根据xml文件 和 数据集比例得到各类数据集的下标
3.1 数据集存放地址
3.2 把图片名写入指定文件中
3.3 关闭文件,释放空间
'''

3.2 训练

1. priors = pickle.load(open('model_data/prior_boxes_ssd300.pkl', 'rb'))  # 先验框 priors.shape(8732, 8(4prior+4variance)
2. bbox_util = BBoxUtility(NUM_CLASSES, priors).bbox_util.assign_boxes(y) # 生成标签
   2.1 assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8)) # assignment.shape (8732, 33) y.shape=(7, 24)
   2.2 encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])  # 找到框并编码[ num_priors , 4 + 1 ]
       (1) iou = self.iou(box)  # iou = iou(box[0]),iou.shape=(8732,) 根据iou 与真实框匹配的先验框的下标,把iou值放在encoded_box[:,-1]中对应位置下标
       (2) assigned_priors = self.priors[assign_mask] # 根据下标找出匹配的先验框
       (3) 计算重合度较高的先验框的中心与长宽
       (4) 编码
   2.3 best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0) #  取每个先验框对应iou最大的值 ,(8732,) 每个先验框对应真实框的坐标
   2.4  best_iou = encoded_boxes[:, :, -1].max(axis=0)  # encoded_boxes[:, :, -1].shape :(7, 8732) 
   2.5 best_iou_mask = best_iou > 0    # 取iou大于零的框的下标
   2.6 best_iou_idx = best_iou_idx[best_iou_mask]  # 取iou大于零的框 ; best_iou_idx.shape =  (64,)
   2.7 assign_num = len(best_iou_idx)  # 预测先验框的个数 ;  assign_num = 64
   2.8 encoded_boxes = encoded_boxes[:, best_iou_mask, :]     # encoded_boxes.shape = (7, 64, 5)
   2.9 assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx , np.arange(assign_num),:4] # 偏移
   2.10 assignment[:, 4][best_iou_mask] = 0   # 背 景
   2.11 assignment[:, -8][best_iou_mask] = 1  # 代表有物体
   2.12 assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]# 类 别
  
3. 划分训练集验证集比例
4. model = SSD300(input_shape, num_classes=NUM_CLASSES) # y_pre = model.output; y_pre.shape::[-1,8732,33]
5. 设置TensorBoard、ModelCheckpoint、ReduceLROnPlateau、EarlyStopping、BATCH_SIZE
6. gen.generate(True)  # 数据生成器
   6.1 shuffle(self.train_lines) # 打乱数据顺序
   6.2 img,y=self.get_random_data(annotation_line,self.image_size[0:2]) # y.shape=[4+cls],数据增强操作
      (1) resize image
      (2) place image
      (3) flip image or not
      (4) distort image
      (5) correct boxes
   6.3 真实框归一化、生成one_hot_label
   6.4 y = self.bbox_util.assign_boxes(y)    # 先根据IoU找出框,编码,再找出最符合条件的框, 制作成标签y_true[4+1+cls+8]
   6.5 返回preprocess_input(tmp_inp), tmp_targets 图片和标签
7. 损失函数 loss = MultiboxLoss(NUM_CLASSES, neg_pos_ratio=3.0).compute_loss
   7.1 conf_loss = self._softmax_loss(y_true[:, :, 4:-8],y_pred[:, :, 4:-8]) # 类别损失
   7.2 loc_loss = self._l1_smooth_loss(y_true[:, :, :4],y_pred[:, :, :4]) # 回归损失
   7.3 num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)  # 正样本数量
   7.4 计算难分类样本 full_indices
   7.5 计算conf_loss、loc_loss、neg_conf_loss 
   7.6 total_loss = tf.reduce_sum(pos_conf_loss) + tf.reduce_sum(neg_conf_loss)
8. 粗略训练
   8.1 model.compile()
   8.2 model.fit_generator()
9. 精细训练

3.3 预测

1. ssd = SSD()  # 加载模型
2. image = Image.open(img)  # 打开图片
3. r_image = ssd.detect_image(image)  # 预测
   3.1  crop_img, x_offset, y_offset = letterbox_image(image, (self.model_image_size[0], self.model_image_size[1]))  # 加入灰条
   3.2  photo = preprocess_input(np.reshape(photo, [1, self.model_image_size[0], self.model_image_size[1], 3]))  # 图片预处理,归一化
   3.3  preds = self.ssd_model.predict(photo)  # 预测[x_offset,y_offset,w,h,conf,cls+1]
        3.3.1 ssd_model = SSD300(input_shape, num_classes=21)
              (1) net = VGG16(input_tensor)
              (2) net['conv4_3']-->net['conv4_3_norm_mbox_loc_flat'] + net['conv4_3_norm_mbox_conf_flat'] + net['conv4_3_norm_mbox_priorbox']  # 由特征层得到框的预测偏移、物体概率、先验框
              (3) net['fc7']--> net['fc7_mbox_loc_flat']  + net['fc7_mbox_conf_flat']  + net['fc7_mbox_priorbox']
              (4) net['conv6_2']--> net['conv6_2_mbox_loc_flat'] +net['conv6_2_mbox_conf_flat']  + net['conv6_2_mbox_priorbox']
              (5) net['conv7_2']--> net['conv7_2_mbox_loc_flat'] +net['conv7_2_mbox_conf_flat']  + net['conv7_2_mbox_priorbox']
              (6) net['conv8_2']--> net['conv8_2_mbox_loc_flat'] +net['conv8_2_mbox_conf_flat']  + net['conv8_2_mbox_priorbox']
              (7) net['conv9_2']--> net['conv9_2_mbox_loc_flat'] +net['conv9_2_mbox_conf_flat']  + net['conv9_2_mbox_priorbox']
              (8) net['predictions'] = net['mbox_loc'],      # (None, 8732, 33)
                                       net['mbox_conf'],     # 8732 = 38**2*4+19**2*6+10**2*6+5**2*6+3**2*4+1**2*4
                                       net['mbox_priorbox']  # 33 = 4+21+8 = 预测偏移+类别(含背景)+先验框x1y1x2y2+variances
   3.4  results = self.bbox_util.detection_out(preds, confidence_threshold=self.confidence)  # 解码--> 筛选--> nms--> 选出top_k
        (1) predictions --> mbox_loc + mbox_conf + mbox_priorbox + variances
        (2) decode_bbox = self.decode_boxes(mbox_loc[i], mbox_priorbox[i],  variances[i])  # 解码
        (3) c_confs_m = c_confs > confidence_threshold # 筛选大于阈值的框
        (4) self.nms   # 进行iou的非极大抑制
        (5) argsort = np.argsort(results[-1][:, 1])[::-1] # 按照置信度进行排序
        (6) results[-1] = results[-1][:keep_top_k]  # 选出置信度最大的keep_top_k个
   3.5  筛选出其中得分高于confidence的框
   3.6  boxes = ssd_correct_boxes(top_ymin, top_xmin, top_ymax, top_xmax, np.array([self.model_image_size[0], self.model_image_size[1]]), image_shape) # 去掉灰条
   3.7  画图
4. r_image.show()    # 显示