Source code for mmdet3d.models.dense_heads.anchor_free_mono3d_head

import torch
from abc import abstractmethod
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn

from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from .base_mono3d_dense_head import BaseMono3DDenseHead


@HEADS.register_module()
class AnchorFreeMono3DHead(BaseMono3DDenseHead):
    """Anchor-free head for monocular 3D object detection.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        feat_channels (int): Number of hidden channels. Used in child classes.
        stacked_convs (int): Number of stacking convs of the head.
        strides (tuple): Downsample factor of each feature map.
        dcn_on_last_conv (bool): If true, use dcn in the last layer of towers.
            Default: False.
        conv_bias (bool | str): If specified as `auto`, it will be decided by
            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
            None, otherwise False. Default: "auto".
        background_label (int | None): Label ID of background, set as 0 for
            RPN and num_classes for other heads. It will automatically be set
            as num_classes if None is given.
        use_direction_classifier (bool): Whether to add a direction
            classifier.
        diff_rad_by_sin (bool): Whether to change the difference into sin
            difference for box regression loss.
        dir_offset (float): Offset of the direction classifier. Default: 0.
        loss_cls (dict): Config of classification loss.
        loss_bbox (dict): Config of localization loss.
        loss_dir (dict): Config of direction classifier loss.
        loss_attr (dict): Config of attribute classifier loss, which is only
            active when pred_attrs=True.
        bbox_code_size (int): Dimensions of predicted bounding boxes.
        pred_attrs (bool): Whether to predict attributes. Default: False.
        num_attrs (int): The number of attributes to be predicted. Default: 9.
        pred_velo (bool): Whether to predict velocity. Default: False.
        pred_bbox2d (bool): Whether to predict 2D boxes. Default: False.
        group_reg_dims (tuple[int]): The dimension of each regression target
            group. Default: (2, 1, 3, 1, 2).
        cls_branch (tuple[int]): Channels for classification branch.
            Default: (128, 64).
        reg_branch (tuple[tuple]): Channels for regression branch.
            Default: (
                (128, 64),  # offset
                (128, 64),  # depth
                (64, ),  # size
                (64, ),  # rot
                ()  # velo
            )
        dir_branch (tuple[int]): Channels for direction classification
            branch. Default: (64, ).
        attr_branch (tuple[int]): Channels for attribute classification
            branch. Default: (64, ).
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        train_cfg (dict): Training config of anchor head.
        test_cfg (dict): Testing config of anchor head.
""" # noqa: W605 _version = 1 def __init__( self, num_classes, in_channels, feat_channels=256, stacked_convs=4, strides=(4, 8, 16, 32, 64), dcn_on_last_conv=False, conv_bias='auto', background_label=None, use_direction_classifier=True, diff_rad_by_sin=True, dir_offset=0, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), bbox_code_size=9, # For nuscenes pred_attrs=False, num_attrs=9, # For nuscenes pred_velo=False, pred_bbox2d=False, group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo, cls_branch=(128, 64), reg_branch=( (128, 64), # offset (128, 64), # depth (64, ), # size (64, ), # rot () # velo ), dir_branch=(64, ), attr_branch=(64, ), conv_cfg=None, norm_cfg=None, train_cfg=None, test_cfg=None, init_cfg=None): super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.cls_out_channels = num_classes self.in_channels = in_channels self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.strides = strides self.dcn_on_last_conv = dcn_on_last_conv assert conv_bias == 'auto' or isinstance(conv_bias, bool) self.conv_bias = conv_bias self.use_direction_classifier = use_direction_classifier self.diff_rad_by_sin = diff_rad_by_sin self.dir_offset = dir_offset self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_dir = build_loss(loss_dir) self.bbox_code_size = bbox_code_size self.group_reg_dims = list(group_reg_dims) self.cls_branch = cls_branch self.reg_branch = reg_branch assert len(reg_branch) == len(group_reg_dims), 'The number of '\ 'element in reg_branch and group_reg_dims should be the same.' 
        self.pred_velo = pred_velo
        self.pred_bbox2d = pred_bbox2d
        self.out_channels = []
        for reg_branch_channels in reg_branch:
            if len(reg_branch_channels) > 0:
                self.out_channels.append(reg_branch_channels[-1])
            else:
                self.out_channels.append(-1)
        self.dir_branch = dir_branch
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.fp16_enabled = False
        self.background_label = (
            num_classes if background_label is None else background_label)
        # background_label should be either 0 or num_classes
        assert (self.background_label == 0
                or self.background_label == num_classes)
        self.pred_attrs = pred_attrs
        self.attr_background_label = -1
        self.num_attrs = num_attrs
        if self.pred_attrs:
            self.attr_background_label = num_attrs
            self.loss_attr = build_loss(loss_attr)
            self.attr_branch = attr_branch

        self._init_layers()
        if init_cfg is None:
            self.init_cfg = dict(
                type='Normal',
                layer='Conv2d',
                std=0.01,
                override=dict(
                    type='Normal',
                    name='conv_cls',
                    std=0.01,
                    bias_prob=0.01))

    def _init_layers(self):
        """Initialize layers of the head."""
        self._init_cls_convs()
        self._init_reg_convs()
        self._init_predictor()

    def _init_cls_convs(self):
        """Initialize classification conv layers of the head."""
        self.cls_convs = nn.ModuleList()
        for i in range(self.stacked_convs):
            chn = self.in_channels if i == 0 else self.feat_channels
            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
                conv_cfg = dict(type='DCNv2')
            else:
                conv_cfg = self.conv_cfg
            self.cls_convs.append(
                ConvModule(
                    chn,
                    self.feat_channels,
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=self.conv_bias))

    def _init_reg_convs(self):
        """Initialize bbox regression conv layers of the head."""
        self.reg_convs = nn.ModuleList()
        for i in range(self.stacked_convs):
            chn = self.in_channels if i == 0 else self.feat_channels
            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
                conv_cfg = dict(type='DCNv2')
            else:
                conv_cfg = self.conv_cfg
            self.reg_convs.append(
                ConvModule(
                    chn,
                    self.feat_channels,
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=self.conv_bias))

    def _init_branch(self, conv_channels=(64), conv_strides=(1)):
        """Initialize conv layers as a prediction branch."""
        # Note that ``(64)`` and ``(1)`` are plain ints, not tuples; the
        # isinstance check below normalizes that case.
        conv_before_pred = nn.ModuleList()
        if isinstance(conv_channels, int):
            conv_channels = [self.feat_channels] + [conv_channels]
            conv_strides = [conv_strides]
        else:
            conv_channels = [self.feat_channels] + list(conv_channels)
            conv_strides = list(conv_strides)
        for i in range(len(conv_strides)):
            conv_before_pred.append(
                ConvModule(
                    conv_channels[i],
                    conv_channels[i + 1],
                    3,
                    stride=conv_strides[i],
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=self.conv_bias))

        return conv_before_pred

    def _init_predictor(self):
        """Initialize predictor layers of the head."""
        self.conv_cls_prev = self._init_branch(
            conv_channels=self.cls_branch,
            conv_strides=(1, ) * len(self.cls_branch))
        self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
                                  1)
        self.conv_reg_prevs = nn.ModuleList()
        self.conv_regs = nn.ModuleList()
        for i in range(len(self.group_reg_dims)):
            reg_dim = self.group_reg_dims[i]
            reg_branch_channels = self.reg_branch[i]
            out_channel = self.out_channels[i]
            if len(reg_branch_channels) > 0:
                self.conv_reg_prevs.append(
                    self._init_branch(
                        conv_channels=reg_branch_channels,
                        conv_strides=(1, ) * len(reg_branch_channels)))
                self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1))
            else:
                self.conv_reg_prevs.append(None)
                self.conv_regs.append(
                    nn.Conv2d(self.feat_channels, reg_dim, 1))
        if self.use_direction_classifier:
            self.conv_dir_cls_prev = self._init_branch(
                conv_channels=self.dir_branch,
                conv_strides=(1, ) * len(self.dir_branch))
            self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1)
        if self.pred_attrs:
            self.conv_attr_prev = self._init_branch(
                conv_channels=self.attr_branch,
                conv_strides=(1, ) * len(self.attr_branch))
            self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs,
                                       1)
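
    # Under the default config, ``_init_layers`` therefore builds:
    #   cls_convs / reg_convs : 4 shared 3x3 ConvModules each (256 channels)
    #   conv_cls_prev + conv_cls : 256 -> 128 -> 64, then a 1x1 conv to
    #       num_classes logits
    #   conv_reg_prevs[i] + conv_regs[i] : one small tower plus a 1x1 conv
    #       per regression group in group_reg_dims (offset, depth, size,
    #       rot, velo)
    #   conv_dir_cls_prev + conv_dir_cls : optional 2-way direction classifier
    #   conv_attr_prev + conv_attr : optional num_attrs-way attribute head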
    def init_weights(self):
        super().init_weights()
        # bias_init_with_prob(0.01) = -log((1 - 0.01) / 0.01) ~= -4.595, the
        # usual focal-loss prior that starts every location at ~1% foreground
        # probability.
        bias_cls = bias_init_with_prob(0.01)
        if self.use_direction_classifier:
            normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
        if self.pred_attrs:
            normal_init(self.conv_attr, std=0.01, bias=bias_cls)
    def forward(self, feats):
        """Forward features from the upstream network.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each
                is a 4D-tensor.

        Returns:
            tuple: Usually contain classification scores, bbox predictions, \
                and direction class predictions.
                cls_scores (list[Tensor]): Box scores for each scale level,
                    each is a 4D-tensor, the channel number is
                    num_points * num_classes.
                bbox_preds (list[Tensor]): Box energies / deltas for each
                    scale level, each is a 4D-tensor, the channel number is
                    num_points * bbox_code_size.
                dir_cls_preds (list[Tensor]): Box scores for direction class
                    predictions on each scale level, each is a 4D-tensor,
                    the channel number is num_points * 2. (bin = 2)
                attr_preds (list[Tensor]): Attribute scores for each scale
                    level, each is a 4D-tensor, the channel number is
                    num_points * num_attrs.
        """
        # multi_apply runs forward_single once per scale level and transposes
        # the per-level 6-tuples into 6 lists; [:5] keeps everything up to the
        # cls_feats list and drops the reg_feats list.
        return multi_apply(self.forward_single, feats)[:5]
    def forward_single(self, x):
        """Forward features of a single scale level.

        Args:
            x (Tensor): FPN feature maps of the specified stride.

        Returns:
            tuple: Scores for each class, bbox predictions, direction class,
                and attributes, features after classification and regression
                conv layers; some models, like FCOS, need these features.
        """
        cls_feat = x
        reg_feat = x

        for cls_layer in self.cls_convs:
            cls_feat = cls_layer(cls_feat)
        # clone the cls_feat for reusing the feature map afterwards
        clone_cls_feat = cls_feat.clone()
        for conv_cls_prev_layer in self.conv_cls_prev:
            clone_cls_feat = conv_cls_prev_layer(clone_cls_feat)
        cls_score = self.conv_cls(clone_cls_feat)

        for reg_layer in self.reg_convs:
            reg_feat = reg_layer(reg_feat)
        bbox_pred = []
        for i in range(len(self.group_reg_dims)):
            # clone the reg_feat for reusing the feature map afterwards
            clone_reg_feat = reg_feat.clone()
            if len(self.reg_branch[i]) > 0:
                for conv_reg_prev_layer in self.conv_reg_prevs[i]:
                    clone_reg_feat = conv_reg_prev_layer(clone_reg_feat)
            bbox_pred.append(self.conv_regs[i](clone_reg_feat))
        bbox_pred = torch.cat(bbox_pred, dim=1)

        dir_cls_pred = None
        if self.use_direction_classifier:
            clone_reg_feat = reg_feat.clone()
            for conv_dir_cls_prev_layer in self.conv_dir_cls_prev:
                clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat)
            dir_cls_pred = self.conv_dir_cls(clone_reg_feat)

        attr_pred = None
        if self.pred_attrs:
            # clone the cls_feat for reusing the feature map afterwards
            clone_cls_feat = cls_feat.clone()
            for conv_attr_prev_layer in self.conv_attr_prev:
                clone_cls_feat = conv_attr_prev_layer(clone_cls_feat)
            attr_pred = self.conv_attr(clone_cls_feat)

        return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \
            reg_feat
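
    # Shape sketch for a single level under the default config (assuming a
    # batch of N images, a feature map of size H x W and pred_attrs=False):
    #   cls_score    : (N, num_classes, H, W)
    #   bbox_pred    : (N, sum(group_reg_dims), H, W) = (N, 9, H, W)
    #   dir_cls_pred : (N, 2, H, W)
    #   attr_pred    : None
    #   cls_feat / reg_feat : (N, feat_channels, H, W), returned so that
    #       subclasses (e.g. FCOS-style heads) can attach extra branches
    #       such as centerness.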
    @abstractmethod
    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
    def loss(self,
             cls_scores,
             bbox_preds,
             dir_cls_preds,
             attr_preds,
             gt_bboxes,
             gt_labels,
             gt_bboxes_3d,
             gt_labels_3d,
             centers2d,
             depths,
             attr_labels,
             img_metas,
             gt_bboxes_ignore=None):
        """Compute loss of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_points * num_classes.
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_points * bbox_code_size.
            dir_cls_preds (list[Tensor]): Box scores for direction class
                predictions on each scale level, each is a 4D-tensor,
                the channel number is num_points * 2. (bin = 2)
            attr_preds (list[Tensor]): Attribute scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_points * num_attrs.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box.
            gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes for each image
                with shape (num_gts, bbox_code_size).
            gt_labels_3d (list[Tensor]): 3D class indices of each box.
            centers2d (list[Tensor]): Projected 3D centers onto 2D images.
            depths (list[Tensor]): Depth of projected centers on 2D images.
            attr_labels (list[Tensor], optional): Attribute indices
                corresponding to each box.
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.
        """
        raise NotImplementedError
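
    # Concrete heads (e.g. FCOSMono3DHead) typically implement ``loss`` by
    # flattening the per-level maps to (num_points, C), matching them against
    # the targets from ``get_targets``, and feeding the pairs to
    # self.loss_cls / self.loss_bbox / self.loss_dir / self.loss_attr; when
    # ``diff_rad_by_sin`` is set, the rotation channel is compared via its
    # sine difference rather than raw radians.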
    @abstractmethod
    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
    def get_bboxes(self,
                   cls_scores,
                   bbox_preds,
                   dir_cls_preds,
                   attr_preds,
                   img_metas,
                   cfg=None,
                   rescale=None):
        """Transform network output for a batch into bbox predictions.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level with
                shape (N, num_points * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_points * bbox_code_size, H, W).
            dir_cls_preds (list[Tensor]): Box scores for direction class
                predictions on each scale level, each is a 4D-tensor,
                the channel number is num_points * 2. (bin = 2)
            attr_preds (list[Tensor]): Attribute scores for each scale level
                with shape (N, num_points * num_attrs, H, W).
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            cfg (mmcv.Config): Test / postprocessing configuration. If None,
                test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.
        """
        raise NotImplementedError
    @abstractmethod
    def get_targets(self, points, gt_bboxes_list, gt_labels_list,
                    gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
                    depths_list, attr_labels_list):
        """Compute regression, classification and centerness targets for
        points in multiple images.

        Args:
            points (list[Tensor]): Points of each fpn level, each has shape
                (num_points, 2).
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
                each has shape (num_gt, 4).
            gt_labels_list (list[Tensor]): Ground truth labels of each box,
                each has shape (num_gt,).
            gt_bboxes_3d_list (list[Tensor]): 3D ground truth bboxes of each
                image, each has shape (num_gt, bbox_code_size).
            gt_labels_3d_list (list[Tensor]): 3D ground truth labels of each
                box, each has shape (num_gt,).
            centers2d_list (list[Tensor]): Projected 3D centers onto 2D
                image, each has shape (num_gt, 2).
            depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
                image, each has shape (num_gt, 1).
            attr_labels_list (list[Tensor]): Attribute labels of each box,
                each has shape (num_gt,).
        """
        raise NotImplementedError
    def _get_points_single(self,
                           featmap_size,
                           stride,
                           dtype,
                           device,
                           flatten=False):
        """Get points of a single scale level."""
        h, w = featmap_size
        x_range = torch.arange(w, dtype=dtype, device=device)
        y_range = torch.arange(h, dtype=dtype, device=device)
        y, x = torch.meshgrid(y_range, x_range)
        if flatten:
            y = y.flatten()
            x = x.flatten()
        return y, x
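
    # For example, ``_get_points_single((2, 3), ...)`` yields, before
    # flattening:
    #   y = [[0, 0, 0],    x = [[0, 1, 2],
    #        [1, 1, 1]]         [0, 1, 2]]
    # Subclasses usually scale these indices by ``stride`` to obtain pixel
    # coordinates in the input image.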
    def get_points(self, featmap_sizes, dtype, device, flatten=False):
        """Get points according to feature map sizes.

        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            dtype (torch.dtype): Type of points.
            device (torch.device): Device of points.
            flatten (bool): Whether to flatten the points of each level.
                Default: False.

        Returns:
            list: Points of each feature map level.
        """
        mlvl_points = []
        for i in range(len(featmap_sizes)):
            mlvl_points.append(
                self._get_points_single(featmap_sizes[i], self.strides[i],
                                        dtype, device, flatten))
        return mlvl_points
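
# Illustrative usage sketch (an assumption, not part of the library). Because
# ``loss``, ``get_bboxes`` and ``get_targets`` are abstract, the head cannot
# be instantiated directly; a separate script would stub them out first, e.g.:
#
#     import torch
#     from mmdet3d.models.dense_heads.anchor_free_mono3d_head import \
#         AnchorFreeMono3DHead
#
#     class StubHead(AnchorFreeMono3DHead):
#         """Hypothetical stub so the abstract base can be built."""
#
#         def loss(self, *args, **kwargs):
#             raise NotImplementedError
#
#         def get_bboxes(self, *args, **kwargs):
#             raise NotImplementedError
#
#         def get_targets(self, *args, **kwargs):
#             raise NotImplementedError
#
#     head = StubHead(num_classes=10, in_channels=256)
#     # One random feature map per stride, as an FPN would produce for a
#     # 256 x 256 input.
#     feats = [torch.rand(2, 256, 256 // s, 256 // s) for s in head.strides]
#     cls_scores, bbox_preds, dir_preds, attr_preds, cls_feats = head(feats)
#     assert cls_scores[0].shape == (2, 10, 64, 64)
#     assert bbox_preds[0].shape == (2, 9, 64, 64)  # sum(group_reg_dims) == 9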