特征提取网络分析

D:\tf2\Models\research\object_detection\meta_architectures\faster_rcnn_meta_arch.py
define a new FasterRCNNFeatureExtractor and pass it to our FasterRCNNMetaArch constructor as input.
定义一个FasterRCNNFeatureExtractor,并且把它作为输入到 FasterRCNNMetaArch constructor 。
A FasterRCNNFeatureExtractor must define a few functions

preprocess: Run any preprocessing of input values that is necessary prior to running the detector on an input image.
_extract_proposal_features: Extract first stage Region Proposal Network (RPN) features.提取第一阶段区域建议网络(RPN)特征。
_extract_box_classifier_features: Extract second stage Box Classifier features.提取第二阶段盒分类器特征。
restore_from_classification_checkpoint_fn: Load a checkpoint into the TensorFlow graph.

1、FasterRCNNFeatureExtractor

from abc import abstractmethod
from functools import partial
import tensorflow as tf

from object_detection.anchor_generators import grid_anchor_generator
from object_detection.core import balanced_positive_negative_sampler as sampler
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import box_predictor
from object_detection.core import losses
from object_detection.core import model
from object_detection.core import post_processing
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.utils import ops
from object_detection.utils import shape_utils

slim = tf.contrib.slim


class FasterRCNNFeatureExtractor(object):
  """Faster R-CNN Feature Extractor definition."""

  def __init__(self,
               is_training,
               first_stage_features_stride,
               batch_norm_trainable=False,
               reuse_weights=None,
               weight_decay=0.0):
    """Constructor.

    Args:
      is_training: A boolean indicating whether the training version of the
        computation graph should be constructed.
      first_stage_features_stride: Output stride of extracted RPN feature map.(提取的RPN feature map输出stride。)
      batch_norm_trainable: Whether to update batch norm parameters during
        training or not. When training with a relative large batch size
        (e.g. 8), it could be desirable to enable batch norm update.批归一化训练:是否在训练期间更新批模参数。在使用较大批处理进行训练时==8,
        启用批处理规范更新可能是可取的。
      reuse_weights: Whether to reuse variables. Default is None.
      weight_decay: float weight decay for feature extractor (default: 0.0).
    """
    self._is_training = is_training
    self._first_stage_features_stride = first_stage_features_stride
    self._train_batch_norm = (batch_norm_trainable and is_training)
    self._reuse_weights = reuse_weights
    self._weight_decay = weight_decay #权重衰减

  @abstractmethod
  def preprocess(self, resized_inputs):
    """Feature-extractor specific preprocessing (minus image resizing)."""
    pass

  def extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features.提取第一阶段RPN特征
  
    .该函数负责从预处理的图像中提取特征图。这些特征被区域提案网络(RPN)用来预测提案。

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float tensor
        representing a batch of images.
      scope: A scope name.

    Returns:
      rpn_feature_map: A tensor with shape [batch, height, width, depth] #rpn_feature_map是经过了Resnet101+FPN后结果
      activations: A dictionary mapping activation tensor names to tensors.
    """
    with tf.variable_scope(scope, values=[preprocessed_inputs]):
      return self._extract_proposal_features(preprocessed_inputs, scope)

  @abstractmethod
  def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features, to be overridden."""
    pass

  def extract_box_classifier_features(self, proposal_feature_maps, scope):
    """Extracts second stage box classifier features.提取第二阶段盒分类器特征,

    Args:
      proposal_feature_maps: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, crop_height, crop_width, depth]
        representing the feature map cropped to each proposal.代表裁剪到每个候选框的feature map
      scope: A scope name.候选特征图

    Returns:
      proposal_classifier_features: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, height, width, depth]
        representing box classifier features for each proposal.
    """
    with tf.variable_scope(
        scope, values=[proposal_feature_maps], reuse=tf.AUTO_REUSE):
      return self._extract_box_classifier_features(proposal_feature_maps, scope)

  @abstractmethod
  def _extract_box_classifier_features(self, proposal_feature_maps, scope):
    """Extracts second stage box classifier features, to be overridden."""
    pass

  def restore_from_classification_checkpoint_fn(
      self,
      first_stage_feature_extractor_scope,
      second_stage_feature_extractor_scope):
    """Returns a map of variables to load from a foreign checkpoint.返回要从外部检查点加载的变量的映射

    Args:
      first_stage_feature_extractor_scope: A scope name for the first stage
        feature extractor.第一阶段特性提取器作用域:第一阶段的作用域名称
      second_stage_feature_extractor_scope: A scope name for the second stage
        feature extractor.

    Returns:
      A dict mapping variable names (to load from a checkpoint) to variables in
      the model graph.
    """
    variables_to_restore = {}
    for variable in tf.global_variables():
      for scope_name in [first_stage_feature_extractor_scope,
                         second_stage_feature_extractor_scope]:
        if variable.op.name.startswith(scope_name):
          var_name = variable.op.name.replace(scope_name + '/', '')
          variables_to_restore[var_name] = variable
    return variables_to_restore

2、FasterRCNNMetaArch

predict
def predict(self, preprocessed_inputs, true_image_shapes):
    """
    If `number_of_stages` is 1, this function only returns first stage
    RPN predictions (un-postprocessed).  Otherwise it returns both
    first stage RPN predictions as well as second stage box classifier
    predictions.如果阶段数为1,该函数只返回第一阶段RPN预测(未经过后处理)。否则,它将返回第一阶段RPN预测以及第二阶段box分类器预测。

    Other remarks:
    + Anchor pruning vs. clipping(锚点的剪切与裁剪): following the recommendation of the Faste
    在训练阶段修建窗口外的锚点,测试时,裁剪锚点到图像窗口

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) rpn_box_predictor_features: A 4-D float32 tensor with shape
          [batch_size, height, width, depth] to be used for predicting proposal
          boxes and corresponding objectness scores.
        2) rpn_features_to_crop: A 4-D float32 tensor with shape
          [batch_size, height, width, depth] representing image features to crop
          using the proposal boxes predicted by the RPN.
        3) image_shape: a 1-D tensor of shape [4] representing the input
          image shape.
        4) rpn_box_encodings:  3-D float tensor of shape
          [batch_size, num_anchors, self._box_coder.code_size] containing
          predicted boxes.
        5) rpn_objectness_predictions_with_background: 3-D float tensor of shape
          [batch_size, num_anchors, 2] containing class
          predictions (logits) for each of the anchors.  Note that this
          tensor *includes* background class predictions (at class index 0).
        6) anchors: A 2-D tensor of shape [num_anchors, 4] representing anchors
          for the first stage RPN (in absolute coordinates).  Note that
          `num_anchors` can differ depending on whether the model is created in
          training or inference mode.

        (and if number_of_stages > 1):
        7) refined_box_encodings: a 3-D tensor with shape
          [total_num_proposals, num_classes, self._box_coder.code_size]
          representing predicted (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals. If using
          a shared box across classes the shape will instead be
          [total_num_proposals, 1, self._box_coder.code_size].
        8) class_predictions_with_background: a 3-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Note that this tensor *includes* background class predictions
          (at class index 0).
        9) num_proposals: An int32 tensor of shape [batch_size] representing the
          number of proposals generated by the RPN.  `num_proposals` allows us
          to keep track of which entries are to be treated as zero paddings and
          which are not since we always pad the number of proposals to be
          `self.max_num_proposals` for each image.
        10) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
          decoded proposal bounding boxes in absolute coordinates.
        11) mask_predictions: (optional) a 4-D tensor with shape
          [total_num_padded_proposals, num_classes, mask_height, mask_width]
          containing instance mask predictions.

    Raises:
      ValueError: If `predict` is called before `preprocess`.
    """
    (rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist,
     image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs)#对处理后的数据进行运算
    (rpn_box_encodings, rpn_objectness_predictions_with_background
    ) = self._predict_rpn_proposals(rpn_box_predictor_features)#输入为新卷积层的输出,然后分为双路卷积,进行边框的回归和分类

    # the image window at training time and clipping at inference time.
    clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
    if self._is_training:
      (rpn_box_encodings, rpn_objectness_predictions_with_background,
       anchors_boxlist) = self._remove_invalid_anchors_and_predictions(
           rpn_box_encodings, rpn_objectness_predictions_with_background,
           anchors_boxlist, clip_window)#判断是否在进行训练,并且移除落在图像外面得锚
    else:
      anchors_boxlist = box_list_ops.clip_to_window(
          anchors_boxlist, clip_window)

    self._anchors = anchors_boxlist
    prediction_dict = {
        'rpn_box_predictor_features': rpn_box_predictor_features,
        'rpn_features_to_crop': rpn_features_to_crop,
        'image_shape': image_shape,
        'rpn_box_encodings': rpn_box_encodings,
        'rpn_objectness_predictions_with_background':
        rpn_objectness_predictions_with_background,
        'anchors': self._anchors.get()
    }

    if self._number_of_stages >= 2:
      prediction_dict.update(self._predict_second_stage(
          rpn_box_encodings,
          rpn_objectness_predictions_with_background,
          rpn_features_to_crop,
          self._anchors.get(), image_shape, true_image_shapes))

    if self._number_of_stages == 3:
      prediction_dict = self._predict_third_stage(
          prediction_dict, true_image_shapes)

    return prediction_dict
_extract_rpn_feature_maps
def _extract_rpn_feature_maps(self, preprocessed_inputs):
    """Extracts RPN features.提取RPN特征
    This function extracts two feature maps: a feature map to be directly
    fed to a box predictor (to predict location and objectness scores for
    proposals) and a feature map from which to crop regions which will then
    be sent to the second stage box classifier.该函数提取两幅特征图:一幅特征图将直接输入盒预测器(用于预测位置和提案的客观得分),另一幅特征图将从该特征图发送到作物区域,然后将发送到第二阶段的盒分类器。

    Args:
      preprocessed_inputs: a [batch, height, width, channels] image tensor.

    Returns:
      rpn_box_predictor_features: A 4-D float32 tensor with shape
        [batch, height, width, depth] to be used for predicting proposal boxes
        and corresponding objectness scores.
      rpn_features_to_crop: A 4-D float32 tensor with shape
        [batch, height, width, depth] representing image features to crop using
        the proposals boxes.
      anchors: A BoxList representing anchors (for the RPN) in
        absolute coordinates.
      image_shape: A 1-D tensor representing the input image shape.
    """
    image_shape = tf.shape(preprocessed_inputs)#图处理数据图片的尺寸
    rpn_features_to_crop, _ = self._feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope=self.first_stage_feature_extractor_scope)#提取候选区域的特征图(ResNet+fpn的结果),取block3的输出。调用的是上一个模块的候选区域的特征图(RPN网络)
    feature_map_shape = tf.shape(rpn_features_to_crop)
    anchors = box_list_ops.concatenate(
        self._first_stage_anchor_generator.generate([(feature_map_shape[1],
                                                      feature_map_shape[2])]))
    with slim.arg_scope(self._first_stage_box_predictor_arg_scope_fn()):
      kernel_size = self._first_stage_box_predictor_kernel_size
      rpn_box_predictor_features = slim.conv2d(
          rpn_features_to_crop,
          self._first_stage_box_predictor_depth,
          kernel_size=[kernel_size, kernel_size],
          rate=self._first_stage_atrous_rate,
          activation_fn=tf.nn.relu6)#对blocks3得输出结果再进行卷积操作
    return (rpn_box_predictor_features, rpn_features_to_crop,
            anchors, image_shape)
_predict_rpn_proposals
def _predict_rpn_proposals(self, rpn_box_predictor_features):
    """添加框到RPN特征图提取候选区域
    Args:
      rpn_box_predictor_features: A 4-D float32 tensor with shape
        [batch, height, width, depth] to be used for predicting proposal boxes
        and corresponding objectness scores.
    Returns:
      box_encodings: 3-D float tensor of shape
        [batch_size, num_anchors, self._box_coder.code_size] containing
        predicted boxes.
      objectness_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_anchors, 2] containing class
        predictions (logits) for each of the anchors.  Note that this
        tensor *includes* background class predictions (at class index 0).
    """
    num_anchors_per_location = (
        self._first_stage_anchor_generator.num_anchors_per_location())
    if len(num_anchors_per_location) != 1:
      raise RuntimeError('anchor_generator is expected to generate anchors '
                         'corresponding to a single feature map.')
    box_predictions = self._first_stage_box_predictor.predict(
        [rpn_box_predictor_features],
        num_anchors_per_location,
        scope=self.first_stage_box_predictor_scope)

    box_encodings = tf.concat(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    objectness_predictions_with_background = tf.concat(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1)
    return (tf.squeeze(box_encodings, axis=2),
            objectness_predictions_with_background)
_predict_second_stage
def _predict_second_stage(self, rpn_box_encodings,
                            rpn_objectness_predictions_with_background,
                            rpn_features_to_crop,
                            anchors,
                            image_shape,
                            true_image_shapes):
    """Predicts the output tensors from second stage of Faster R-CNN.

    Args:
      rpn_box_encodings: 4-D float tensor of shape
        [batch_size, num_valid_anchors, self._box_coder.code_size] containing
        predicted boxes.
      rpn_objectness_predictions_with_background: 2-D float tensor of shape
        [batch_size, num_valid_anchors, 2] containing class
        predictions (logits) for each of the anchors.  Note that this
        tensor *includes* background class predictions (at class index 0).
      rpn_features_to_crop: A 4-D float32 tensor with shape
        [batch_size, height, width, depth] representing image features to crop
        using the proposal boxes predicted by the RPN.
      anchors: 2-D float tensor of shape
        [num_anchors, self._box_coder.code_size].
      image_shape: A 1D int32 tensors of size [4] containing the image shape.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) refined_box_encodings: a 3-D tensor with shape
          [total_num_proposals, num_classes, self._box_coder.code_size]
          representing predicted (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals. If using a
          shared box across classes the shape will instead be
          [total_num_proposals, 1, self._box_coder.code_size].
        2) class_predictions_with_background: a 3-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Note that this tensor *includes* background class predictions
          (at class index 0).
        3) num_proposals: An int32 tensor of shape [batch_size] representing the
          number of proposals generated by the RPN.  `num_proposals` allows us
          to keep track of which entries are to be treated as zero paddings and
          which are not since we always pad the number of proposals to be
          `self.max_num_proposals` for each image.
        4) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
          decoded proposal bounding boxes in absolute coordinates.
        5) proposal_boxes_normalized: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing decoded proposal
          bounding boxes in normalized coordinates. Can be used to override the
          boxes proposed by the RPN, thus enabling one to extract features and
          get box classification and prediction for externally selected areas
          of the image.
        6) box_classifier_features: a 4-D float32 tensor representing the
          features for each proposal.
    """
    image_shape_2d = self._image_batch_shape_2d(image_shape)
    proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn(
        rpn_box_encodings, rpn_objectness_predictions_with_background,
        anchors, image_shape_2d, true_image_shapes)#进行后处理

    flattened_proposal_feature_maps = (
        self._compute_second_stage_input_feature_maps(
            rpn_features_to_crop, proposal_boxes_normalized))

    box_classifier_features = (
        self._feature_extractor.extract_box_classifier_features(
            flattened_proposal_feature_maps,
            scope=self.second_stage_feature_extractor_scope))

    box_predictions = self._mask_rcnn_box_predictor.predict(
        [box_classifier_features],
        num_predictions_per_location=[1],
        scope=self.second_stage_box_predictor_scope,
        predict_boxes_and_classes=True)

    refined_box_encodings = tf.squeeze(
        box_predictions[box_predictor.BOX_ENCODINGS],
        axis=1, name='all_refined_box_encodings')
    class_predictions_with_background = tf.squeeze(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1, name='all_class_predictions_with_background')

    absolute_proposal_boxes = ops.normalized_to_image_coordinates(
        proposal_boxes_normalized, image_shape, self._parallel_iterations)

    prediction_dict = {
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background':
        class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
        'box_classifier_features': box_classifier_features,
        'proposal_boxes_normalized': proposal_boxes_normalized,
    }

    return prediction_dict
_predict_third_stage
def _predict_third_stage(self, prediction_dict, image_shapes):
    """Predicts non-box, non-class outputs using refined detections.

    For training, masks as predicted directly on the box_classifier_features,
    which are region-features from the initial anchor boxes.
    For inference, this happens after calling the post-processing stage, such
    that masks are only calculated for the top scored boxes.

    Args:
     prediction_dict: a dictionary holding "raw" prediction tensors:
        1) refined_box_encodings: a 3-D tensor with shape
          [total_num_proposals, num_classes, self._box_coder.code_size]
          representing predicted (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals. If using a
          shared box across classes the shape will instead be
          [total_num_proposals, 1, self._box_coder.code_size].
        2) class_predictions_with_background: a 3-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Note that this tensor *includes* background class predictions
          (at class index 0).
        3) num_proposals: An int32 tensor of shape [batch_size] representing the
          number of proposals generated by the RPN.  `num_proposals` allows us
          to keep track of which entries are to be treated as zero paddings and
          which are not since we always pad the number of proposals to be
          `self.max_num_proposals` for each image.
        4) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
          decoded proposal bounding boxes in absolute coordinates.
        5) box_classifier_features: a 4-D float32 tensor representing the
          features for each proposal.
      image_shapes: A 2-D int32 tensors of shape [batch_size, 3] containing
        shapes of images in the batch.

    Returns:
      prediction_dict: a dictionary that in addition to the input predictions
      does hold the following predictions as well:
        1) mask_predictions: a 4-D tensor with shape
          [batch_size, max_detection, mask_height, mask_width] containing
          instance mask predictions.
    """
    if self._is_training:
      curr_box_classifier_features = prediction_dict['box_classifier_features']
      detection_classes = prediction_dict['class_predictions_with_background']
      mask_predictions = self._mask_rcnn_box_predictor.predict(
          [curr_box_classifier_features],
          num_predictions_per_location=[1],
          scope=self.second_stage_box_predictor_scope,
          predict_boxes_and_classes=False,
          predict_auxiliary_outputs=True)
      prediction_dict['mask_predictions'] = tf.squeeze(mask_predictions[
          box_predictor.MASK_PREDICTIONS], axis=1)
    else:
      detections_dict = self._postprocess_box_classifier(
          prediction_dict['refined_box_encodings'],
          prediction_dict['class_predictions_with_background'],
          prediction_dict['proposal_boxes'],
          prediction_dict['num_proposals'],
          image_shapes)
      prediction_dict.update(detections_dict)
      detection_boxes = detections_dict[
          fields.DetectionResultFields.detection_boxes]
      detection_classes = detections_dict[
          fields.DetectionResultFields.detection_classes]
      rpn_features_to_crop = prediction_dict['rpn_features_to_crop']
      batch_size = tf.shape(detection_boxes)[0]
      max_detection = tf.shape(detection_boxes)[1]
      flattened_detected_feature_maps = (
          self._compute_second_stage_input_feature_maps(
              rpn_features_to_crop, detection_boxes))
      curr_box_classifier_features = (
          self._feature_extractor.extract_box_classifier_features(
              flattened_detected_feature_maps,
              scope=self.second_stage_feature_extractor_scope))

      mask_predictions = self._mask_rcnn_box_predictor.predict(
          [curr_box_classifier_features],
          num_predictions_per_location=[1],
          scope=self.second_stage_box_predictor_scope,
          predict_boxes_and_classes=False,
          predict_auxiliary_outputs=True)

      detection_masks = tf.squeeze(mask_predictions[
          box_predictor.MASK_PREDICTIONS], axis=1)

      _, num_classes, mask_height, mask_width = (
          detection_masks.get_shape().as_list())
      _, max_detection = detection_classes.get_shape().as_list()
      if num_classes > 1:
        detection_masks = self._gather_instance_masks(
            detection_masks, detection_classes)

      prediction_dict[fields.DetectionResultFields.detection_masks] = (
          tf.reshape(detection_masks,
                     [batch_size, max_detection, mask_height, mask_width]))

    return prediction_dict