import torch
from abc import abstractmethod
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from .base_mono3d_dense_head import BaseMono3DDenseHead
@HEADS.register_module()
class AnchorFreeMono3DHead(BaseMono3DDenseHead):
"""Anchor-free head for monocular 3D object detection.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
feat_channels (int): Number of hidden channels. Used in child classes.
stacked_convs (int): Number of stacking convs of the head.
strides (tuple): Downsample factor of each feature map.
        dcn_on_last_conv (bool): If True, use DCN in the last layer of
            towers. Default: False.
conv_bias (bool | str): If specified as `auto`, it will be decided by
the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
None, otherwise False. Default: "auto".
        background_label (int | None): Label ID of background, set as 0 for
            RPN and num_classes for other heads. It will be automatically set
            to num_classes if None is given.
use_direction_classifier (bool): Whether to add a direction classifier.
        diff_rad_by_sin (bool): Whether to change the difference into sin
            difference for box regression loss.
        dir_offset (int | float): Rotation offset applied when classifying
            box direction. Default: 0.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classifier loss.
loss_attr (dict): Config of attribute classifier loss, which is only
active when pred_attrs=True.
bbox_code_size (int): Dimensions of predicted bounding boxes.
        pred_attrs (bool): Whether to predict attributes. Default: False.
        num_attrs (int): The number of attributes to be predicted. Default: 9.
        pred_velo (bool): Whether to predict velocity. Default: False.
        pred_bbox2d (bool): Whether to predict 2D boxes. Default: False.
group_reg_dims (tuple[int]): The dimension of each regression target
group. Default: (2, 1, 3, 1, 2).
cls_branch (tuple[int]): Channels for classification branch.
Default: (128, 64).
reg_branch (tuple[tuple]): Channels for regression branch.
Default: (
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch (tuple[int]): Channels for direction classification branch.
Default: (64, ).
        attr_branch (tuple[int]): Channels for attribute classification
            branch. Default: (64, ).
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
train_cfg (dict): Training config of anchor head.
test_cfg (dict): Testing config of anchor head.
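
    Example:
        A minimal construction and forward sketch (illustrative only;
        assumes the default loss types are available in the registry):

            >>> import torch
            >>> self = AnchorFreeMono3DHead(
            ...     num_classes=3, in_channels=32, feat_channels=32,
            ...     stacked_convs=1)
            >>> feats = [torch.rand(2, 32, 64 // s, 64 // s)
            ...          for s in (4, 8, 16, 32, 64)]
            >>> outs = self(feats)
            >>> len(outs)  # cls, bbox, dir, attr, cls_feat lists
            5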
""" # noqa: W605
_version = 1
def __init__(
self,
num_classes,
in_channels,
feat_channels=256,
stacked_convs=4,
strides=(4, 8, 16, 32, 64),
dcn_on_last_conv=False,
conv_bias='auto',
background_label=None,
use_direction_classifier=True,
diff_rad_by_sin=True,
dir_offset=0,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            bbox_code_size=9,  # For nuScenes
            pred_attrs=False,
            num_attrs=9,  # For nuScenes
pred_velo=False,
pred_bbox2d=False,
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch=(128, 64),
reg_branch=(
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch=(64, ),
attr_branch=(64, ),
conv_cfg=None,
norm_cfg=None,
train_cfg=None,
test_cfg=None,
init_cfg=None):
super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg)
self.num_classes = num_classes
self.cls_out_channels = num_classes
self.in_channels = in_channels
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.strides = strides
self.dcn_on_last_conv = dcn_on_last_conv
assert conv_bias == 'auto' or isinstance(conv_bias, bool)
self.conv_bias = conv_bias
self.use_direction_classifier = use_direction_classifier
self.diff_rad_by_sin = diff_rad_by_sin
self.dir_offset = dir_offset
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_dir = build_loss(loss_dir)
self.bbox_code_size = bbox_code_size
self.group_reg_dims = list(group_reg_dims)
self.cls_branch = cls_branch
self.reg_branch = reg_branch
        assert len(reg_branch) == len(group_reg_dims), 'The number of '\
            'elements in reg_branch and group_reg_dims should be the same.'
self.pred_velo = pred_velo
self.pred_bbox2d = pred_bbox2d
self.out_channels = []
for reg_branch_channels in reg_branch:
if len(reg_branch_channels) > 0:
self.out_channels.append(reg_branch_channels[-1])
else:
self.out_channels.append(-1)
self.dir_branch = dir_branch
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.fp16_enabled = False
self.background_label = (
num_classes if background_label is None else background_label)
# background_label should be either 0 or num_classes
assert (self.background_label == 0
or self.background_label == num_classes)
self.pred_attrs = pred_attrs
self.attr_background_label = -1
self.num_attrs = num_attrs
if self.pred_attrs:
self.attr_background_label = num_attrs
self.loss_attr = build_loss(loss_attr)
self.attr_branch = attr_branch
self._init_layers()
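        # Without an explicit init_cfg, fall back to the standard focal-loss
        # initialization: normal(std=0.01) for every conv, with the final
        # classification layer biased towards a foreground prior of 0.01.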
if init_cfg is None:
self.init_cfg = dict(
type='Normal',
layer='Conv2d',
std=0.01,
override=dict(
type='Normal', name='conv_cls', std=0.01, bias_prob=0.01))
def _init_layers(self):
"""Initialize layers of the head."""
self._init_cls_convs()
self._init_reg_convs()
self._init_predictor()
def _init_cls_convs(self):
"""Initialize classification conv layers of the head."""
self.cls_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
if self.dcn_on_last_conv and i == self.stacked_convs - 1:
conv_cfg = dict(type='DCNv2')
else:
conv_cfg = self.conv_cfg
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
def _init_reg_convs(self):
"""Initialize bbox regression conv layers of the head."""
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
if self.dcn_on_last_conv and i == self.stacked_convs - 1:
conv_cfg = dict(type='DCNv2')
else:
conv_cfg = self.conv_cfg
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
    def _init_branch(self, conv_channels=(64, ), conv_strides=(1, )):
"""Initialize conv layers as a prediction branch."""
conv_before_pred = nn.ModuleList()
if isinstance(conv_channels, int):
conv_channels = [self.feat_channels] + [conv_channels]
conv_strides = [conv_strides]
else:
conv_channels = [self.feat_channels] + list(conv_channels)
conv_strides = list(conv_strides)
for i in range(len(conv_strides)):
conv_before_pred.append(
ConvModule(
conv_channels[i],
conv_channels[i + 1],
3,
stride=conv_strides[i],
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
return conv_before_pred
def _init_predictor(self):
"""Initialize predictor layers of the head."""
self.conv_cls_prev = self._init_branch(
conv_channels=self.cls_branch,
conv_strides=(1, ) * len(self.cls_branch))
self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
1)
self.conv_reg_prevs = nn.ModuleList()
self.conv_regs = nn.ModuleList()
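        # Build one optional prev-branch plus a final 1x1 conv per regression
        # group; a group with an empty branch config predicts directly from
        # the shared regression features.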
for i in range(len(self.group_reg_dims)):
reg_dim = self.group_reg_dims[i]
reg_branch_channels = self.reg_branch[i]
out_channel = self.out_channels[i]
if len(reg_branch_channels) > 0:
self.conv_reg_prevs.append(
self._init_branch(
conv_channels=reg_branch_channels,
conv_strides=(1, ) * len(reg_branch_channels)))
self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1))
else:
self.conv_reg_prevs.append(None)
self.conv_regs.append(
nn.Conv2d(self.feat_channels, reg_dim, 1))
if self.use_direction_classifier:
self.conv_dir_cls_prev = self._init_branch(
conv_channels=self.dir_branch,
conv_strides=(1, ) * len(self.dir_branch))
self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1)
if self.pred_attrs:
self.conv_attr_prev = self._init_branch(
conv_channels=self.attr_branch,
conv_strides=(1, ) * len(self.attr_branch))
self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1)
    def init_weights(self):
        """Initialize weights of the head."""
        super().init_weights()
bias_cls = bias_init_with_prob(0.01)
if self.use_direction_classifier:
normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
if self.pred_attrs:
normal_init(self.conv_attr, std=0.01, bias=bias_cls)
    def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
        Returns:
            tuple: Usually contains classification scores, bbox predictions, \
                and direction class predictions.
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
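
        Example:
            An illustrative shape check (assumes a head built as in the
            class-level example, i.e. ``num_classes=3`` and the default
            ``group_reg_dims``)::

                >>> feats = [torch.rand(2, 32, 64 // s, 64 // s)
                ...          for s in (4, 8, 16, 32, 64)]
                >>> outs = self(feats)
                >>> outs[0][0].shape  # cls_scores at stride 4
                torch.Size([2, 3, 16, 16])
                >>> outs[1][0].shape  # bbox_preds: sum(group_reg_dims) chans
                torch.Size([2, 9, 16, 16])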
"""
return multi_apply(self.forward_single, feats)[:5]
    def forward_single(self, x):
        """Forward features of a single scale level.
Args:
x (Tensor): FPN feature maps of the specified stride.
        Returns:
            tuple: Scores for each class, bbox predictions, direction class
                predictions, and attribute predictions, together with the
                features after the classification and regression conv layers.
                Some models, such as FCOS, need these features.
"""
cls_feat = x
reg_feat = x
for cls_layer in self.cls_convs:
cls_feat = cls_layer(cls_feat)
# clone the cls_feat for reusing the feature map afterwards
clone_cls_feat = cls_feat.clone()
for conv_cls_prev_layer in self.conv_cls_prev:
clone_cls_feat = conv_cls_prev_layer(clone_cls_feat)
cls_score = self.conv_cls(clone_cls_feat)
for reg_layer in self.reg_convs:
reg_feat = reg_layer(reg_feat)
bbox_pred = []
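        # Predict each regression group (by default: offset, depth, size,
        # rot, velo) with its own branch, then concatenate along the channel
        # dimension to form the full bbox code.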
for i in range(len(self.group_reg_dims)):
# clone the reg_feat for reusing the feature map afterwards
clone_reg_feat = reg_feat.clone()
if len(self.reg_branch[i]) > 0:
for conv_reg_prev_layer in self.conv_reg_prevs[i]:
clone_reg_feat = conv_reg_prev_layer(clone_reg_feat)
bbox_pred.append(self.conv_regs[i](clone_reg_feat))
bbox_pred = torch.cat(bbox_pred, dim=1)
dir_cls_pred = None
if self.use_direction_classifier:
clone_reg_feat = reg_feat.clone()
for conv_dir_cls_prev_layer in self.conv_dir_cls_prev:
clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat)
dir_cls_pred = self.conv_dir_cls(clone_reg_feat)
attr_pred = None
if self.pred_attrs:
# clone the cls_feat for reusing the feature map afterwards
clone_cls_feat = cls_feat.clone()
for conv_attr_prev_layer in self.conv_attr_prev:
clone_cls_feat = conv_attr_prev_layer(clone_cls_feat)
attr_pred = self.conv_attr(clone_cls_feat)
return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \
reg_feat
    @abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box.
gt_bboxes_3d (list[Tensor]): 3D Ground truth bboxes for each
image with shape (num_gts, bbox_code_size).
gt_labels_3d (list[Tensor]): 3D class indices of each box.
centers2d (list[Tensor]): Projected 3D centers onto 2D images.
depths (list[Tensor]): Depth of projected centers on 2D images.
            attr_labels (list[Tensor], optional): Attribute indices
                corresponding to each box.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.
"""
raise NotImplementedError
    @abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * bbox_code_size, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
rescale (bool): If True, return boxes in original image space
"""
raise NotImplementedError
    @abstractmethod
def get_targets(self, points, gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
depths_list, attr_labels_list):
"""Compute regression, classification and centerss targets for points
in multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
image, each has shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
box, each has shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
each has shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
attr_labels_list (list[Tensor]): Attribute labels of each box,
each has shape (num_gt,).
"""
raise NotImplementedError
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points of a single scale level."""
h, w = featmap_size
x_range = torch.arange(w, dtype=dtype, device=device)
y_range = torch.arange(h, dtype=dtype, device=device)
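        # meshgrid defaults to 'ij' indexing: ``y`` varies along dim 0 and
        # ``x`` along dim 1 (newer PyTorch versions ask for an explicit
        # ``indexing='ij'``).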
y, x = torch.meshgrid(y_range, x_range)
if flatten:
y = y.flatten()
x = x.flatten()
return y, x
    def get_points(self, featmap_sizes, dtype, device, flatten=False):
"""Get points according to feature map sizes.
        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            dtype (torch.dtype): Type of points.
            device (torch.device): Device of points.
            flatten (bool): Whether to flatten the point coordinates.
                Default: False.

        Returns:
            list: Points of each feature map level.
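
        Example:
            An illustrative sketch on two dummy feature map sizes (only the
            first two default strides are consumed)::

                >>> mlvl_points = self.get_points([(4, 4), (2, 2)],
                ...                               torch.float32, 'cpu')
                >>> len(mlvl_points)
                2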
"""
mlvl_points = []
for i in range(len(featmap_sizes)):
mlvl_points.append(
self._get_points_single(featmap_sizes[i], self.strides[i],
dtype, device, flatten))
return mlvl_points