
Source code for mmdet3d.models.dense_heads.monoflex_head

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.cnn import xavier_init
from torch import nn as nn

from mmdet3d.core.utils import get_ellip_gaussian_2D
from mmdet3d.models.model_utils import EdgeFusionModule
from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
                                  get_keypoints, handle_proj_objs)
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
from mmdet.models.utils.gaussian_target import (get_local_maximum,
                                                get_topk_from_heatmap,
                                                transpose_and_gather_feat)
from ..builder import HEADS, build_loss
from .anchor_free_mono3d_head import AnchorFreeMono3DHead


@HEADS.register_module()
class MonoFlexHead(AnchorFreeMono3DHead):
    r"""MonoFlex head used in `MonoFlex <https://arxiv.org/abs/2104.02323>`_

    .. code-block:: none

                / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
        feature
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions
                |
                |                  |--- 1 x 1 conv --> ori cls
                | --> 3 x 3 conv --|
                |                  |--- 1 x 1 conv --> ori offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> depth
                |
                \ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty

    Args:
        use_edge_fusion (bool): Whether to use edge fusion module while
            feature extraction.
        edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion.
        edge_heatmap_ratio (float): Ratio of generating target heatmap.
        filter_outside_objs (bool, optional): Whether to filter the
            outside objects. Default: True.
        loss_cls (dict, optional): Config of classification loss.
            Default: loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0).
        loss_bbox (dict, optional): Config of localization loss.
            Default: loss_bbox=dict(type='IoULoss', loss_weight=0.1).
        loss_dir (dict, optional): Config of direction classification loss.
            Default: dict(type='MultiBinLoss', loss_weight=0.1).
        loss_keypoints (dict, optional): Config of keypoints loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_dims (dict, optional): Config of dimensions loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_offsets2d (dict, optional): Config of offsets2d loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_direct_depth (dict, optional): Config of directly regressed
            depth loss. Default: dict(type='L1Loss', loss_weight=0.1).
        loss_keypoints_depth (dict, optional): Config of keypoints decoded
            depth loss. Default: dict(type='L1Loss', loss_weight=0.1).
        loss_combined_depth (dict, optional): Config of combined depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_attr (dict, optional): Config of attribute classification loss.
            In MonoFlex, Default: None.
        bbox_coder (dict, optional): Bbox coder for encoding and decoding
            boxes. Default: dict(type='MonoFlexCoder', code_size=7).
        norm_cfg (dict, optional): Dictionary to construct and config norm
            layer. Default: norm_cfg=dict(type='BN').
        init_cfg (dict): Initialization config dict. Default: None.
    """  # noqa: E501

    def __init__(self,
                 num_classes,
                 in_channels,
                 use_edge_fusion,
                 edge_fusion_inds,
                 edge_heatmap_ratio,
                 filter_outside_objs=True,
                 loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
                 loss_bbox=dict(type='IoULoss', loss_weight=0.1),
                 loss_dir=dict(type='MultiBinLoss', loss_weight=0.1),
                 loss_keypoints=dict(type='L1Loss', loss_weight=0.1),
                 loss_dims=dict(type='L1Loss', loss_weight=0.1),
                 loss_offsets2d=dict(type='L1Loss', loss_weight=0.1),
                 loss_direct_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_combined_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_attr=None,
                 bbox_coder=dict(type='MonoFlexCoder', code_size=7),
                 norm_cfg=dict(type='BN'),
                 init_cfg=None,
                 init_bias=-2.19,
                 **kwargs):
        self.use_edge_fusion = use_edge_fusion
        self.edge_fusion_inds = edge_fusion_inds
        super().__init__(
            num_classes,
            in_channels,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_dir=loss_dir,
            loss_attr=loss_attr,
            norm_cfg=norm_cfg,
            init_cfg=init_cfg,
            **kwargs)
        self.filter_outside_objs = filter_outside_objs
        self.edge_heatmap_ratio = edge_heatmap_ratio
        self.init_bias = init_bias
        self.loss_dir = build_loss(loss_dir)
        self.loss_keypoints = build_loss(loss_keypoints)
        self.loss_dims = build_loss(loss_dims)
        self.loss_offsets2d = build_loss(loss_offsets2d)
        self.loss_direct_depth = build_loss(loss_direct_depth)
        self.loss_keypoints_depth = build_loss(loss_keypoints_depth)
        self.loss_combined_depth = build_loss(loss_combined_depth)
        self.bbox_coder = build_bbox_coder(bbox_coder)

    def _init_edge_module(self):
        """Initialize edge fusion module for feature extraction."""
        self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256)
        for i in range(len(self.edge_fusion_inds)):
            reg_inds, out_inds = self.edge_fusion_inds[i]
            out_channels = self.group_reg_dims[reg_inds][out_inds]
            fusion_layer = EdgeFusionModule(out_channels, 256)
            layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}'
            self.add_module(layer_name, fusion_layer)

    def init_weights(self):
        """Initialize weights."""
        super().init_weights()
        self.conv_cls.bias.data.fill_(self.init_bias)
        xavier_init(self.conv_regs[4][0], gain=0.01)
        xavier_init(self.conv_regs[7][0], gain=0.01)
        for m in self.conv_regs.modules():
            if isinstance(m, nn.Conv2d):
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _init_predictor(self):
        """Initialize predictor layers of the head."""
        self.conv_cls_prev = self._init_branch(
            conv_channels=self.cls_branch,
            conv_strides=(1, ) * len(self.cls_branch))
        self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
                                  1)
        # init regression head
        self.conv_reg_prevs = nn.ModuleList()
        # init output head
        self.conv_regs = nn.ModuleList()
        # group_reg_dims:
        # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
        for i in range(len(self.group_reg_dims)):
            reg_dims = self.group_reg_dims[i]
            reg_branch_channels = self.reg_branch[i]
            out_channel = self.out_channels[i]
            reg_list = nn.ModuleList()
            if len(reg_branch_channels) > 0:
                self.conv_reg_prevs.append(
                    self._init_branch(
                        conv_channels=reg_branch_channels,
                        conv_strides=(1, ) * len(reg_branch_channels)))
                for reg_dim in reg_dims:
                    reg_list.append(nn.Conv2d(out_channel, reg_dim, 1))
                self.conv_regs.append(reg_list)
            else:
                self.conv_reg_prevs.append(None)
                for reg_dim in reg_dims:
                    reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1))
                self.conv_regs.append(reg_list)

    def _init_layers(self):
        """Initialize layers of the head."""
        self._init_predictor()
        if self.use_edge_fusion:
            self._init_edge_module()
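
    # Layout sketch (illustrative, assuming the default ``group_reg_dims``
    # above and a non-empty ``reg_branch`` for every group):
    #
    #     self.conv_regs = ModuleList([
    #         ModuleList([Conv2d(C, 4, 1)]),                   # 2d bbox
    #         ModuleList([Conv2d(C, 2, 1)]),                   # 2d offsets
    #         ModuleList([Conv2d(C, 20, 1)]),                  # keypoints
    #         ...
    #         ModuleList([Conv2d(C, 8, 1), Conv2d(C, 8, 1)]),  # ori cls/offsets
    #         ModuleList([Conv2d(C, 1, 1)]),                   # depth
    #         ModuleList([Conv2d(C, 1, 1)]),                   # depth uncert.
    #     ])
    #
    # where ``C`` is ``self.out_channels[i]``; the branch meanings follow the
    # diagram in the class docstring.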

    def forward_train(self, x, input_metas, gt_bboxes, gt_labels,
                      gt_bboxes_3d, gt_labels_3d, centers2d, depths,
                      attr_labels, gt_bboxes_ignore, proposal_cfg, **kwargs):
        """
        Args:
            x (list[Tensor]): Features from FPN.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
                shape (num_gts, 4).
            gt_labels (list[Tensor]): Ground truth labels of each box,
                shape (num_gts,).
            gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
                shape (num_gts, self.bbox_code_size).
            gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
                shape (num_gts,).
            centers2d (list[Tensor]): Projected 3D center of each box,
                shape (num_gts, 2).
            depths (list[Tensor]): Depth of projected 3D center of each box,
                shape (num_gts,).
            attr_labels (list[Tensor]): Attribute labels of each box,
                shape (num_gts,).
            gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
                ignored, shape (num_ignored_gts, 4).
            proposal_cfg (mmcv.Config): Test / postprocessing configuration,
                if None, test_cfg would be used.

        Returns:
            tuple:
                losses (dict[str, Tensor]): A dictionary of loss components.
                proposal_list (list[Tensor]): Proposals of each image.
        """
        outs = self(x, input_metas)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
                                  attr_labels, input_metas)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
                                  gt_labels_3d, centers2d, depths,
                                  attr_labels, input_metas)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        if proposal_cfg is None:
            return losses
        else:
            proposal_list = self.get_bboxes(
                *outs, input_metas, cfg=proposal_cfg)
            return losses, proposal_list

    def forward(self, feats, input_metas):
        """Forward features from the upstream network.

        Args:
            feats (list[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple:
                cls_scores (list[Tensor]): Box scores for each scale level,
                    each is a 4D-tensor, the channel number is
                    num_points * num_classes.
                bbox_preds (list[Tensor]): Box energies / deltas for each
                    scale level, each is a 4D-tensor, the channel number is
                    num_points * bbox_code_size.
        """
        mlvl_input_metas = [input_metas for i in range(len(feats))]
        return multi_apply(self.forward_single, feats, mlvl_input_metas)
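
    # Return-structure note (illustrative): ``multi_apply`` zips the
    # per-level outputs of ``forward_single``, so for a single-level feature
    # list ``feats = [x]`` the call returns ``(cls_scores, bbox_preds)`` with
    # ``len(cls_scores) == len(bbox_preds) == 1``, matching the assertions in
    # ``get_bboxes`` and ``loss`` below.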

    def forward_single(self, x, input_metas):
        """Forward features of a single scale level.

        Args:
            x (Tensor): Feature maps from a specific FPN feature level.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple: Scores for each class, bbox predictions.
        """
        img_h, img_w = input_metas[0]['pad_shape'][:2]
        batch_size, _, feat_h, feat_w = x.shape
        downsample_ratio = img_h / feat_h

        for conv_cls_prev_layer in self.conv_cls_prev:
            cls_feat = conv_cls_prev_layer(x)
        out_cls = self.conv_cls(cls_feat)

        if self.use_edge_fusion:
            # calculate the edge indices for the batch data
            edge_indices_list = get_edge_indices(
                input_metas, downsample_ratio, device=x.device)
            edge_lens = [
                edge_indices.shape[0] for edge_indices in edge_indices_list
            ]
            max_edge_len = max(edge_lens)
            edge_indices = x.new_zeros((batch_size, max_edge_len, 2),
                                       dtype=torch.long)
            for i in range(batch_size):
                edge_indices[i, :edge_lens[i]] = edge_indices_list[i]
            # cls feature map edge fusion
            out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices,
                                         edge_lens, feat_h, feat_w)

        bbox_pred = []
        for i in range(len(self.group_reg_dims)):
            reg_feat = x.clone()
            # feature regression head
            if len(self.reg_branch[i]) > 0:
                for conv_reg_prev_layer in self.conv_reg_prevs[i]:
                    reg_feat = conv_reg_prev_layer(reg_feat)
            for j, conv_reg in enumerate(self.conv_regs[i]):
                out_reg = conv_reg(reg_feat)
                # Use Edge Fusion Module
                if self.use_edge_fusion and (i, j) in self.edge_fusion_inds:
                    # reg feature map edge fusion
                    out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format(
                        i, j))(reg_feat, out_reg, edge_indices, edge_lens,
                               feat_h, feat_w)
                bbox_pred.append(out_reg)
        bbox_pred = torch.cat(bbox_pred, dim=1)
        cls_score = out_cls.sigmoid()  # turn to 0-1
        cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)

        return cls_score, bbox_pred
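
    # Shape sketch for ``forward_single`` (illustrative): with the default
    # ``group_reg_dims`` the per-group channels sum to
    # 4 + 2 + 20 + 3 + 3 + (8 + 8) + 1 + 1 = 50, so
    #
    #     x:         (B, feat_channels, H, W)
    #     cls_score: (B, num_classes, H, W), clamped to [1e-4, 1 - 1e-4]
    #     bbox_pred: (B, 50, H, W), groups concatenated along the channel
    #                dim in ``group_reg_dims`` order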

    def get_bboxes(self, cls_scores, bbox_preds, input_metas):
        """Generate bboxes from bbox head predictions.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
            bbox_preds (list[Tensor]): Box regression for each scale.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
                Each item in result_list is a 4-tuple.
        """
        assert len(cls_scores) == len(bbox_preds) == 1
        cam2imgs = torch.stack([
            cls_scores[0].new_tensor(input_meta['cam2img'])
            for input_meta in input_metas
        ])
        batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
            cls_scores[0],
            bbox_preds[0],
            input_metas,
            cam2imgs=cam2imgs,
            topk=100,
            kernel=3)

        result_list = []
        for img_id in range(len(input_metas)):
            bboxes = batch_bboxes[img_id]
            scores = batch_scores[img_id]
            labels = batch_topk_labels[img_id]

            keep_idx = scores > 0.25
            bboxes = bboxes[keep_idx]
            scores = scores[keep_idx]
            labels = labels[keep_idx]

            bboxes = input_metas[img_id]['box_type_3d'](
                bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
            attrs = None
            result_list.append((bboxes, scores, labels, attrs))

        return result_list

    def decode_heatmap(self,
                       cls_score,
                       reg_pred,
                       input_metas,
                       cam2imgs,
                       topk=100,
                       kernel=3):
        """Transform outputs into raw bbox predictions.

        Args:
            cls_score (Tensor): Center predict heatmap,
                shape (B, num_classes, H, W).
            reg_pred (Tensor): Box regression map.
                shape (B, channel, H, W).
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            cam2imgs (Tensor): Camera intrinsic matrix.
                shape (N, 4, 4)
            topk (int, optional): Get top k center keypoints from heatmap.
                Default 100.
            kernel (int, optional): Max pooling kernel for extracting local
                maximum pixels. Default 3.

        Returns:
            tuple[torch.Tensor]: Decoded output of MonoFlexHead, containing
                the following Tensors:
              - batch_bboxes (Tensor): Coords of each 3D box.
                    shape (B, k, 7)
              - batch_scores (Tensor): Scores of each 3D box.
                    shape (B, k)
              - batch_topk_labels (Tensor): Categories of each 3D box.
                    shape (B, k)
        """
        img_h, img_w = input_metas[0]['pad_shape'][:2]
        batch_size, _, feat_h, feat_w = cls_score.shape
        downsample_ratio = img_h / feat_h

        center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)

        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
            center_heatmap_pred, k=topk)
        batch_scores, batch_index, batch_topk_labels = batch_dets

        regression = transpose_and_gather_feat(reg_pred, batch_index)
        regression = regression.view(-1, 8)

        pred_base_centers2d = torch.cat(
            [topk_xs.view(-1, 1), topk_ys.view(-1, 1).float()], dim=1)
        preds = self.bbox_coder.decode(regression, batch_topk_labels,
                                       downsample_ratio, cam2imgs)
        pred_locations = self.bbox_coder.decode_location(
            pred_base_centers2d, preds['offsets2d'], preds['combined_depth'],
            cam2imgs, downsample_ratio)
        pred_yaws = self.bbox_coder.decode_orientation(
            preds['orientations']).unsqueeze(-1)
        pred_dims = preds['dimensions']
        batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws),
                                 dim=1)
        batch_bboxes = batch_bboxes.view(batch_size, -1,
                                         self.bbox_code_size)
        return batch_bboxes, batch_scores, batch_topk_labels
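
    # Decoding sketch (illustrative): ``get_topk_from_heatmap`` returns the
    # ``topk`` strongest peaks per image, so for batch size B
    #
    #     batch_scores, batch_index, batch_topk_labels: (B, topk)
    #     topk_ys, topk_xs:                             (B, topk)
    #
    # After gathering, ``regression`` holds one row of regression channels
    # per kept peak, which ``MonoFlexCoder.decode`` splits into its named
    # outputs (offsets2d, combined_depth, orientations, ...).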

    def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask,
                        batch_indices, input_metas, downsample_ratio):
        """Prepare predictions for computing loss.

        Args:
            pred_reg (Tensor): Box regression map.
                shape (B, channel, H, W).
            labels3d (Tensor): Labels of each 3D box.
                shape (B * max_objs, )
            centers2d (Tensor): Coords of each projected 3D box center on
                image. shape (N, 2)
            reg_mask (Tensor): Indexes of the existence of the 3D box.
                shape (B * max_objs, )
            batch_indices (Tensor): Batch indices of the 3D box.
                shape (N, )
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            downsample_ratio (int): The stride of feature map.

        Returns:
            dict: The predictions for computing loss.
        """
        batch, channel = pred_reg.shape[0], pred_reg.shape[1]
        w = pred_reg.shape[3]
        cam2imgs = torch.stack([
            centers2d.new_tensor(input_meta['cam2img'])
            for input_meta in input_metas
        ])
        # (batch_size, 4, 4) -> (N, 4, 4)
        cam2imgs = cam2imgs[batch_indices, :, :]
        centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
        centers2d_inds = centers2d_inds.view(batch, -1)
        pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
        pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
        preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
                                       downsample_ratio, cam2imgs)

        return preds
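
    # Index arithmetic note (illustrative): the flattening above maps an
    # integer base center ``(x, y)`` on a feature map of width ``w`` to the
    # flat position ``y * w + x``, e.g. ``(x=100, y=50)`` with ``w=320``
    # gives ``50 * 320 + 100 == 16100``, which
    # ``transpose_and_gather_feat`` uses to pick that pixel's regression
    # vector.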

    def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
                    gt_labels_3d_list, centers2d_list, depths_list,
                    feat_shape, img_shape, input_metas):
        """Get training targets for batch images.

        Args:
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
                image, shape (num_gt, 4).
            gt_labels_list (list[Tensor]): Ground truth labels of each
                box, shape (num_gt,).
            gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D
                Ground truth bboxes of each image,
                shape (num_gt, bbox_code_size).
            gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of
                each box, shape (num_gt,).
            centers2d_list (list[Tensor]): Projected 3D centers onto 2D
                image, shape (num_gt, 2).
            depths_list (list[Tensor]): Depth of projected 3D centers onto
                2D image, each has shape (num_gt, 1).
            feat_shape (tuple[int]): Feature map shape with value,
                shape (B, _, H, W).
            img_shape (tuple[int]): Image shape in [h, w] format.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple[Tensor, dict]: The Tensor value is the targets of center
                heatmap, the dict has components below:
              - base_centers2d_target (Tensor): Coords of each projected
                    3D box center on image. shape (B * max_objs, 2),
                    [dtype: int]
              - labels3d (Tensor): Labels of each 3D box.
                    shape (N, )
              - reg_mask (Tensor): Mask of the existence of the 3D box.
                    shape (B * max_objs, )
              - batch_indices (Tensor): Batch id of the 3D box.
                    shape (N, )
              - depth_target (Tensor): Depth target of each 3D box.
                    shape (N, )
              - keypoints2d_target (Tensor): Keypoints of each projected
                    3D box on image. shape (N, 10, 2)
              - keypoints_mask (Tensor): Keypoints mask of each projected
                    3D box on image. shape (N, 10)
              - keypoints_depth_mask (Tensor): Mask of the depths decoded
                    from keypoints of each 3D box. shape (N, 3)
              - orientations_target (Tensor): Orientation (encoded local
                    yaw) target of each 3D box. shape (N, )
              - offsets2d_target (Tensor): Offsets target of each projected
                    3D box. shape (N, 2)
              - dimensions_target (Tensor): Dimensions target of each 3D
                    box. shape (N, 3)
              - downsample_ratio (int): The stride of feature map.
        """
        img_h, img_w = img_shape[:2]
        batch_size, _, feat_h, feat_w = feat_shape

        width_ratio = float(feat_w / img_w)  # 1/4
        height_ratio = float(feat_h / img_h)  # 1/4

        assert width_ratio == height_ratio

        # Whether to filter the objects which are not in FOV.
        if self.filter_outside_objs:
            filter_outside_objs(gt_bboxes_list, gt_labels_list,
                                gt_bboxes_3d_list, gt_labels_3d_list,
                                centers2d_list, input_metas)

        # transform centers2d to base centers2d for regression and
        # heatmap generation.
        # centers2d = int(base_centers2d) + offsets2d
        base_centers2d_list, offsets2d_list, trunc_mask_list = \
            handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas)

        keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \
            get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas)

        center_heatmap_target = gt_bboxes_list[-1].new_zeros(
            [batch_size, self.num_classes, feat_h, feat_w])

        for batch_id in range(batch_size):
            # project gt_bboxes from input image to feat map
            gt_bboxes = gt_bboxes_list[batch_id] * width_ratio
            gt_labels = gt_labels_list[batch_id]

            # project base centers2d from input image to feat map
            gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio
            trunc_masks = trunc_mask_list[batch_id]

            for j, base_center2d in enumerate(gt_base_centers2d):
                if trunc_masks[j]:
                    # for outside objects, generate ellipse heatmap
                    base_center2d_x_int, base_center2d_y_int = \
                        base_center2d.int()
                    scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0],
                                      gt_bboxes[j][2] - base_center2d_x_int)
                    scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1],
                                      gt_bboxes[j][3] - base_center2d_y_int)
                    radius_x = scale_box_w * self.edge_heatmap_ratio
                    radius_y = scale_box_h * self.edge_heatmap_ratio
                    radius_x, radius_y = max(0, int(radius_x)), max(
                        0, int(radius_y))
                    assert min(radius_x, radius_y) == 0
                    ind = gt_labels[j]
                    get_ellip_gaussian_2D(
                        center_heatmap_target[batch_id, ind],
                        [base_center2d_x_int, base_center2d_y_int], radius_x,
                        radius_y)
                else:
                    base_center2d_x_int, base_center2d_y_int = \
                        base_center2d.int()
                    scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1])
                    scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0])
                    radius = gaussian_radius([scale_box_h, scale_box_w],
                                             min_overlap=0.7)
                    radius = max(0, int(radius))
                    ind = gt_labels[j]
                    gen_gaussian_target(
                        center_heatmap_target[batch_id, ind],
                        [base_center2d_x_int, base_center2d_y_int], radius)

        avg_factor = max(1, center_heatmap_target.eq(1).sum())
        num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list]
        max_objs = max(num_ctrs)
        batch_indices = [
            centers2d_list[0].new_full((num_ctrs[i], ), i)
            for i in range(batch_size)
        ]
        batch_indices = torch.cat(batch_indices, dim=0)
        reg_mask = torch.zeros(
            (batch_size, max_objs),
            dtype=torch.bool).to(base_centers2d_list[0].device)
        gt_bboxes_3d = input_metas[0]['box_type_3d'].cat(gt_bboxes_3d_list)
        gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device)

        # encode original local yaw to multibin format
        orientations_target = self.bbox_coder.encode(gt_bboxes_3d)

        batch_base_centers2d = base_centers2d_list[0].new_zeros(
            (batch_size, max_objs, 2))

        for i in range(batch_size):
            reg_mask[i, :num_ctrs[i]] = 1
            batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i]

        flatten_reg_mask = reg_mask.flatten()

        # transform base centers2d from input scale to output scale
        batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio

        dimensions_target = gt_bboxes_3d.tensor[:, 3:6]
        labels_3d = torch.cat(gt_labels_3d_list)
        keypoints2d_target = torch.cat(keypoints2d_list)
        keypoints_mask = torch.cat(keypoints_mask_list)
        keypoints_depth_mask = torch.cat(keypoints_depth_mask_list)
        offsets2d_target = torch.cat(offsets2d_list)
        bboxes2d = torch.cat(gt_bboxes_list)

        # transform FCOS style bbox into [x1, y1, x2, y2] format.
        bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
                                    dim=-1)
        depths = torch.cat(depths_list)

        target_labels = dict(
            base_centers2d_target=batch_base_centers2d.int(),
            labels3d=labels_3d,
            reg_mask=flatten_reg_mask,
            batch_indices=batch_indices,
            bboxes2d_target=bboxes2d_target,
            depth_target=depths,
            keypoints2d_target=keypoints2d_target,
            keypoints_mask=keypoints_mask,
            keypoints_depth_mask=keypoints_depth_mask,
            orientations_target=orientations_target,
            offsets2d_target=offsets2d_target,
            dimensions_target=dimensions_target,
            downsample_ratio=1 / width_ratio)

        return center_heatmap_target, avg_factor, target_labels
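
    # Worked example for the base-center decomposition above (illustrative
    # numbers): a projected center ``centers2d = (102.7, 54.3)`` that stays
    # inside the image keeps ``base_centers2d = (102.7, 54.3)`` with
    # ``offsets2d = centers2d - int(base_centers2d) = (0.7, 0.3)``; for a
    # truncated object, ``handle_proj_objs`` instead moves the base center
    # onto the image border so the (possibly large) residual goes into
    # ``offsets2d``, and the heatmap branch above switches from the circular
    # ``gen_gaussian_target`` to ``get_ellip_gaussian_2D``.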

    def loss(self,
             cls_scores,
             bbox_preds,
             gt_bboxes,
             gt_labels,
             gt_bboxes_3d,
             gt_labels_3d,
             centers2d,
             depths,
             attr_labels,
             input_metas,
             gt_bboxes_ignore=None):
        """Compute loss of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
                shape (num_gt, 4).
            bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
                number is bbox_code_size. shape (B, 7, H, W).
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each
                box. shape (num_gts, ).
            gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes
                ground truth. it is the flipped gt_bboxes
            gt_labels_3d (list[Tensor]): Same as gt_labels.
            centers2d (list[Tensor]): 2D centers on the image.
                shape (num_gts, 2).
            depths (list[Tensor]): Depth ground truth.
                shape (num_gts, ).
            attr_labels (list[Tensor]): Attributes indices of each box.
                In KITTI it's None.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.
                Default: None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert len(cls_scores) == len(bbox_preds) == 1
        assert attr_labels is None
        assert gt_bboxes_ignore is None
        center2d_heatmap = cls_scores[0]
        pred_reg = bbox_preds[0]

        center2d_heatmap_target, avg_factor, target_labels = \
            self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
                             gt_labels_3d, centers2d, depths,
                             center2d_heatmap.shape,
                             input_metas[0]['pad_shape'],
                             input_metas)

        preds = self.get_predictions(
            pred_reg=pred_reg,
            labels3d=target_labels['labels3d'],
            centers2d=target_labels['base_centers2d_target'],
            reg_mask=target_labels['reg_mask'],
            batch_indices=target_labels['batch_indices'],
            input_metas=input_metas,
            downsample_ratio=target_labels['downsample_ratio'])

        # heatmap loss
        loss_cls = self.loss_cls(
            center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)

        # bbox2d regression loss
        loss_bbox = self.loss_bbox(preds['bboxes2d'],
                                   target_labels['bboxes2d_target'])

        # keypoints loss, the keypoints in predictions and target are all
        # local coordinates. Check that the mask dtype is bool, not int or
        # float, to ensure the indexing below is boolean indexing.
        keypoints2d_mask = target_labels['keypoints_mask']
        loss_keypoints = self.loss_keypoints(
            preds['keypoints2d'][keypoints2d_mask],
            target_labels['keypoints2d_target'][keypoints2d_mask])

        # orientations loss
        loss_dir = self.loss_dir(preds['orientations'],
                                 target_labels['orientations_target'])

        # dimensions loss
        loss_dims = self.loss_dims(preds['dimensions'],
                                   target_labels['dimensions_target'])

        # offsets for center heatmap
        loss_offsets2d = self.loss_offsets2d(
            preds['offsets2d'], target_labels['offsets2d_target'])

        # directly regressed depth loss with direct depth uncertainty loss
        direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
        loss_weight_1 = self.loss_direct_depth.loss_weight
        loss_direct_depth = self.loss_direct_depth(
            preds['direct_depth'], target_labels['depth_target'],
            direct_depth_weights)
        loss_uncertainty_1 = \
            preds['direct_depth_uncertainty'] * loss_weight_1
        loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()

        # keypoints decoded depth loss with keypoints depth uncertainty loss
        depth_mask = target_labels['keypoints_depth_mask']
        depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(
            1, 3)
        valid_keypoints_depth_uncertainty = preds[
            'keypoints_depth_uncertainty'][depth_mask]
        valid_keypoints_depth_weights = torch.exp(
            -valid_keypoints_depth_uncertainty)
        loss_keypoints_depth = self.loss_keypoints_depth(
            preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
            valid_keypoints_depth_weights)
        loss_weight_2 = self.loss_keypoints_depth.loss_weight
        loss_uncertainty_2 = \
            valid_keypoints_depth_uncertainty * loss_weight_2
        loss_keypoints_depth = loss_keypoints_depth + \
            loss_uncertainty_2.mean()

        # combined depth loss for optimizing the uncertainty
        loss_combined_depth = self.loss_combined_depth(
            preds['combined_depth'], target_labels['depth_target'])

        loss_dict = dict(
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_keypoints=loss_keypoints,
            loss_dir=loss_dir,
            loss_dims=loss_dims,
            loss_offsets2d=loss_offsets2d,
            loss_direct_depth=loss_direct_depth,
            loss_keypoints_depth=loss_keypoints_depth,
            loss_combined_depth=loss_combined_depth)

        return loss_dict
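
    # Loss-form note (illustrative): both uncertainty-aware depth terms
    # above follow the aleatoric-uncertainty pattern
    #
    #     loss = w * |d_pred - d_gt| * exp(-sigma) + w * sigma
    #
    # where ``sigma`` is the predicted uncertainty and ``w`` the configured
    # loss weight: ``exp(-sigma)`` enters as the element-wise weight of the
    # L1 loss and ``w * sigma`` is added back via
    # ``loss_uncertainty_*.mean()``, which keeps the network from inflating
    # ``sigma`` to zero out the regression term.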