Source code for mmpose.models.necks.hybrid_encoder

# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule, ModuleList
from torch import Tensor

from mmpose.models.utils import (DetrTransformerEncoder, RepVGGBlock,
                                 SinePositionalEncoding)
from mmpose.registry import MODELS
from mmpose.utils.typing import ConfigType, OptConfigType


class CSPRepLayer(BaseModule):
    """CSPRepLayer, a layer that combines Cross Stage Partial Networks with
    RepVGG Blocks.

    Args:
        in_channels (int): Number of input channels to the layer.
        out_channels (int): Number of output channels from the layer.
        num_blocks (int): The number of RepVGG blocks to be used in the layer.
            Defaults to 3.
        widen_factor (float): Expansion factor for intermediate channels.
            Determines the hidden channel size based on out_channels.
            Defaults to 1.0.
        norm_cfg (dict): Configuration for normalization layers.
            Defaults to Batch Normalization with trainable parameters.
        act_cfg (dict): Configuration for activation layers.
            Defaults to SiLU (Swish) with in-place operation.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 num_blocks: int = 3,
                 widen_factor: float = 1.0,
                 norm_cfg: OptConfigType = dict(type='BN', requires_grad=True),
                 act_cfg: OptConfigType = dict(type='SiLU', inplace=True)):
        super(CSPRepLayer, self).__init__()
        hidden_channels = int(out_channels * widen_factor)
        self.conv1 = ConvModule(
            in_channels,
            hidden_channels,
            kernel_size=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv2 = ConvModule(
            in_channels,
            hidden_channels,
            kernel_size=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        self.bottlenecks = nn.Sequential(*[
            RepVGGBlock(hidden_channels, hidden_channels, act_cfg=act_cfg)
            for _ in range(num_blocks)
        ])
        if hidden_channels != out_channels:
            self.conv3 = ConvModule(
                hidden_channels,
                out_channels,
                kernel_size=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
        else:
            self.conv3 = nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Forward function.

        Args:
            x (Tensor): The input tensor.

        Returns:
            Tensor: The output tensor.
        """
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        return self.conv3(x_1 + x_2)
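

# A minimal usage sketch of CSPRepLayer (illustrative values, assumed for
# demonstration only): both branches use 1x1 convs and stride-1 RepVGG
# blocks, so the spatial size is preserved while the channel count changes
# from `in_channels` to `out_channels`.
def _csp_rep_layer_example() -> Tensor:
    layer = CSPRepLayer(in_channels=512, out_channels=256, num_blocks=3)
    x = torch.randn(1, 512, 16, 16)
    out = layer(x)  # expected shape: (1, 256, 16, 16)
    return out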


@MODELS.register_module()
class HybridEncoder(BaseModule):
    """Hybrid encoder neck introduced in `RT-DETR` by Lyu et al (2023),
    combining transformer encoders with a Feature Pyramid Network (FPN)
    and a Path Aggregation Network (PAN).

    Args:
        encoder_cfg (ConfigType): Configuration for the transformer encoder.
        projector (OptConfigType, optional): Configuration for an optional
            projector module. Defaults to None.
        num_encoder_layers (int, optional): Number of encoder layers.
            Defaults to 1.
        in_channels (List[int], optional): Input channels of feature maps.
            Defaults to [512, 1024, 2048].
        feat_strides (List[int], optional): Strides of feature maps.
            Defaults to [8, 16, 32].
        hidden_dim (int, optional): Hidden dimension of the MLP.
            Defaults to 256.
        use_encoder_idx (List[int], optional): Indices of encoder layers
            to use. Defaults to [2].
        pe_temperature (int, optional): Positional encoding temperature.
            Defaults to 10000.
        widen_factor (float, optional): Expansion factor for CSPRepLayer.
            Defaults to 1.0.
        deepen_factor (float, optional): Depth multiplier for CSPRepLayer.
            Defaults to 1.0.
        spe_learnable (bool, optional): Whether positional encoding is
            learnable. Defaults to False.
        output_indices (Optional[List[int]], optional): Indices of output
            layers. Defaults to None.
        norm_cfg (OptConfigType, optional): Configuration for normalization
            layers. Defaults to Batch Normalization.
        act_cfg (OptConfigType, optional): Configuration for activation
            layers. Defaults to SiLU (Swish) with in-place operation.

    .. _`RT-DETR`: https://arxiv.org/abs/2304.08069
    """

    def __init__(self,
                 encoder_cfg: ConfigType = dict(),
                 projector: OptConfigType = None,
                 num_encoder_layers: int = 1,
                 in_channels: List[int] = [512, 1024, 2048],
                 feat_strides: List[int] = [8, 16, 32],
                 hidden_dim: int = 256,
                 use_encoder_idx: List[int] = [2],
                 pe_temperature: int = 10000,
                 widen_factor: float = 1.0,
                 deepen_factor: float = 1.0,
                 spe_learnable: bool = False,
                 output_indices: Optional[List[int]] = None,
                 norm_cfg: OptConfigType = dict(type='BN', requires_grad=True),
                 act_cfg: OptConfigType = dict(type='SiLU', inplace=True)):
        super(HybridEncoder, self).__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.pe_temperature = pe_temperature
        self.output_indices = output_indices

        # channel projection
        self.input_proj = ModuleList()
        for in_channel in in_channels:
            self.input_proj.append(
                ConvModule(
                    in_channel,
                    hidden_dim,
                    kernel_size=1,
                    padding=0,
                    norm_cfg=norm_cfg,
                    act_cfg=None))

        # encoder transformer
        if len(use_encoder_idx) > 0:
            pos_enc_dim = self.hidden_dim // 2
            self.encoder = ModuleList([
                DetrTransformerEncoder(num_encoder_layers, encoder_cfg)
                for _ in range(len(use_encoder_idx))
            ])
            self.sincos_pos_enc = SinePositionalEncoding(
                pos_enc_dim,
                learnable=spe_learnable,
                temperature=self.pe_temperature,
                spatial_dim=2)

        # top-down fpn
        lateral_convs = list()
        fpn_blocks = list()
        for idx in range(len(in_channels) - 1, 0, -1):
            lateral_convs.append(
                ConvModule(
                    hidden_dim,
                    hidden_dim,
                    1,
                    1,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            fpn_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    round(3 * deepen_factor),
                    act_cfg=act_cfg,
                    widen_factor=widen_factor))
        self.lateral_convs = ModuleList(lateral_convs)
        self.fpn_blocks = ModuleList(fpn_blocks)

        # bottom-up pan
        downsample_convs = list()
        pan_blocks = list()
        for idx in range(len(in_channels) - 1):
            downsample_convs.append(
                ConvModule(
                    hidden_dim,
                    hidden_dim,
                    3,
                    stride=2,
                    padding=1,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            pan_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    round(3 * deepen_factor),
                    act_cfg=act_cfg,
                    widen_factor=widen_factor))
        self.downsample_convs = ModuleList(downsample_convs)
        self.pan_blocks = ModuleList(pan_blocks)

        if projector is not None:
            self.projector = MODELS.build(projector)
        else:
            self.projector = None
    def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
        """Forward function."""
        assert len(inputs) == len(self.in_channels)
        proj_feats = [
            self.input_proj[i](inputs[i]) for i in range(len(inputs))
        ]

        # encoder
        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                h, w = proj_feats[enc_ind].shape[2:]
                # flatten [B, C, H, W] to [B, HxW, C]
                src_flatten = proj_feats[enc_ind].flatten(2).permute(
                    0, 2, 1).contiguous()

                if torch.onnx.is_in_onnx_export():
                    pos_enc = getattr(self, f'pos_enc_{i}')
                else:
                    pos_enc = self.sincos_pos_enc(size=(h, w))
                    pos_enc = pos_enc.transpose(-1, -2).reshape(1, h * w, -1)
                memory = self.encoder[i](
                    src_flatten, query_pos=pos_enc, key_padding_mask=None)

                proj_feats[enc_ind] = memory.permute(
                    0, 2, 1).contiguous().view([-1, self.hidden_dim, h, w])

        # top-down fpn
        inner_outs = [proj_feats[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = proj_feats[idx - 1]
            feat_high = self.lateral_convs[len(self.in_channels) - 1 - idx](
                feat_high)
            inner_outs[0] = feat_high

            upsample_feat = F.interpolate(
                feat_high, scale_factor=2., mode='nearest')
            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                torch.cat([upsample_feat, feat_low], axis=1))
            inner_outs.insert(0, inner_out)

        # bottom-up pan
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]

            downsample_feat = self.downsample_convs[idx](feat_low)  # Conv
            out = self.pan_blocks[idx](  # CSPRepLayer
                torch.cat([downsample_feat, feat_high], axis=1))
            outs.append(out)

        if self.output_indices is not None:
            outs = [outs[i] for i in self.output_indices]

        if self.projector is not None:
            outs = self.projector(outs)

        return tuple(outs)
    def switch_to_deploy(self, test_cfg):
        """Switch to deploy mode."""
        if getattr(self, 'deploy', False):
            return

        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                h, w = test_cfg['input_size']
                h = int(h / 2**(3 + enc_ind))
                w = int(w / 2**(3 + enc_ind))
                pos_enc = self.sincos_pos_enc(size=(h, w))
                pos_enc = pos_enc.transpose(-1, -2).reshape(1, h * w, -1)
                self.register_buffer(f'pos_enc_{i}', pos_enc)

        self.deploy = True
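

# A minimal usage sketch of HybridEncoder with its default channel layout.
# The `encoder_cfg` below mirrors the style used in RTMO configs and is an
# assumption for illustration; any layer config accepted by
# DetrTransformerEncoder works here.
def _hybrid_encoder_example() -> Tuple[Tensor, ...]:
    encoder_cfg = dict(
        self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
        ffn_cfg=dict(
            embed_dims=256,
            feedforward_channels=1024,
            ffn_drop=0.0,
            act_cfg=dict(type='GELU')))
    neck = HybridEncoder(
        encoder_cfg=encoder_cfg,
        in_channels=[512, 1024, 2048],
        feat_strides=[8, 16, 32],
        hidden_dim=256,
        use_encoder_idx=[2])

    # Dummy backbone features for a 256x256 image at strides 8, 16 and 32.
    feats = (
        torch.randn(1, 512, 32, 32),
        torch.randn(1, 1024, 16, 16),
        torch.randn(1, 2048, 8, 8),
    )
    outs = neck(feats)
    # Expected: three maps with `hidden_dim` channels at the input strides,
    # i.e. (1, 256, 32, 32), (1, 256, 16, 16) and (1, 256, 8, 8).
    # For ONNX export, the positional encoding can be pre-computed for a
    # fixed input size, e.g.:
    # neck.switch_to_deploy(dict(input_size=(256, 256)))
    return outs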