Shortcuts

Source code for mmpose.codecs.motionbert_label

# Copyright (c) OpenMMLab. All rights reserved.

from copy import deepcopy
from typing import Optional, Tuple

import numpy as np

from mmpose.registry import KEYPOINT_CODECS
from .base import BaseKeypointCodec
from .utils import camera_to_image_coord


@KEYPOINT_CODECS.register_module()
class MotionBERTLabel(BaseKeypointCodec):
    r"""Generate keypoint and label coordinates for `MotionBERT`_ by Zhu et
    al (2022).

    Note:

        - instance number: N
        - keypoint number: K
        - keypoint dimension: D
        - pose-lifting target dimension: C

    Args:
        num_keypoints (int): The number of keypoints in the dataset.
        root_index (int): Root keypoint index in the pose. Default: 0.
        remove_root (bool): If true, remove the root keypoint from the pose.
            Default: ``False``.
        save_index (bool): If true, store the root position separated from the
            original pose, only takes effect if ``remove_root`` is ``True``.
            Default: ``False``.
        concat_vis (bool): If true, concat the visibility item of keypoints.
            Default: ``False``.
        rootrel (bool): If true, the root keypoint will be set to the
            coordinate origin. Default: ``False``.
        mode (str): Indicating whether the current mode is 'train' or 'test'.
            Default: ``'test'``.
    """

    # Extra keys (beyond keypoints/keypoints_visible) that the dataset
    # pipeline must pass through to ``encode``.
    auxiliary_encode_keys = {
        'lifting_target', 'lifting_target_visible', 'camera_param', 'factor'
    }

    # Mapping from encoded dict keys to per-instance data fields.
    instance_mapping_table = dict(
        lifting_target='lifting_target',
        lifting_target_visible='lifting_target_visible',
    )

    # Mapping from encoded dict keys to label fields consumed by the head.
    label_mapping_table = dict(
        trajectory_weights='trajectory_weights',
        lifting_target_label='lifting_target_label',
        lifting_target_weight='lifting_target_weight')

    def __init__(self,
                 num_keypoints: int,
                 root_index: int = 0,
                 remove_root: bool = False,
                 save_index: bool = False,
                 concat_vis: bool = False,
                 rootrel: bool = False,
                 mode: str = 'test'):
        super().__init__()
        self.num_keypoints = num_keypoints
        self.root_index = root_index
        self.remove_root = remove_root
        self.save_index = save_index
        self.concat_vis = concat_vis
        self.rootrel = rootrel
        # Mode is normalized to lowercase; only 'train'/'test' are supported
        # because encode() branches on it for target normalization.
        assert mode.lower() in {'train', 'test'
                                }, (f'Unsupported mode {mode}, '
                                    'mode should be one of ("train", "test").')
        self.mode = mode.lower()
    def encode(self,
               keypoints: np.ndarray,
               keypoints_visible: Optional[np.ndarray] = None,
               lifting_target: Optional[np.ndarray] = None,
               lifting_target_visible: Optional[np.ndarray] = None,
               camera_param: Optional[dict] = None,
               factor: Optional[np.ndarray] = None) -> dict:
        """Encoding keypoints from input image space to normalized space.

        Args:
            keypoints (np.ndarray): Keypoint coordinates in shape
                (B, T, K, D).
            keypoints_visible (np.ndarray, optional): Keypoint visibilities
                in shape (B, T, K).
            lifting_target (np.ndarray, optional): 3d target coordinate in
                shape (T, K, C).
            lifting_target_visible (np.ndarray, optional): Target coordinate
                in shape (T, K, ).
            camera_param (dict, optional): The camera parameter dictionary.
            factor (np.ndarray, optional): The factor mapping camera and
                image coordinate in shape (T, ).

        Returns:
            encoded (dict): Contains the following items:

                - keypoint_labels (np.ndarray): The processed keypoints in
                  shape like (N, K, D).
                - keypoint_labels_visible (np.ndarray): The processed
                  keypoints' weights in shape (N, K, ) or (N, K-1, ).
                - lifting_target_label: The processed target coordinate in
                  shape (K, C) or (K-1, C).
                - lifting_target_weight (np.ndarray): The target weights in
                  shape (K, ) or (K-1, ).
                - factor (np.ndarray): The factor mapping camera and image
                  coordinate in shape (T, 1).
        """
        if keypoints_visible is None:
            # NOTE(review): shape[:2] yields (B, T) while the docstring says
            # visibility is (B, T, K) -- confirm intended default shape
            # against callers.
            keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)

        # set initial value for `lifting_target_weight`
        if lifting_target_visible is None:
            lifting_target_visible = np.ones(
                lifting_target.shape[:-1], dtype=np.float32)
            lifting_target_weight = lifting_target_visible
        else:
            # Binarize visibility into hard 0/1 training weights.
            valid = lifting_target_visible > 0.5
            lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32)

        if camera_param is None:
            camera_param = dict()

        encoded = dict()

        assert lifting_target is not None
        lifting_target_label = lifting_target.copy()
        keypoint_labels = keypoints.copy()

        assert keypoint_labels.ndim in {
            2, 3
        }, (f'Keypoint labels should have 2 or 3 dimensions, '
            f'but got {keypoint_labels.ndim}.')
        if keypoint_labels.ndim == 2:
            # Promote a single frame (K, D) to a batch of one (1, K, D).
            keypoint_labels = keypoint_labels[None, ...]

        # Normalize the 2D keypoint coordinate with image width and height:
        # x -> x/w*2 - 1, y -> y/w*2 - h/w, mapping pixels into roughly
        # [-1, 1] with the aspect ratio preserved via the shared divisor w.
        _camera_param = deepcopy(camera_param)
        assert 'w' in _camera_param and 'h' in _camera_param, (
            'Camera parameters should contain "w" and "h".')
        w, h = _camera_param['w'], _camera_param['h']
        keypoint_labels[
            ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w]

        # convert target to image coordinate
        # Fallback scale factor of 4 per frame, overridden below when full
        # intrinsics ('f', 'c') are available.
        T = keypoint_labels.shape[0]
        factor_ = np.array([4] * T, dtype=np.float32).reshape(T, )
        if 'f' in _camera_param and 'c' in _camera_param:
            lifting_target_label, factor_ = camera_to_image_coord(
                self.root_index, lifting_target_label, _camera_param)
        if self.mode == 'train':
            # presumably converts image size from pixels to a metric-like
            # scale before normalizing the 3D target -- TODO confirm units.
            w, h = w / 1000, h / 1000
            lifting_target_label[
                ..., :2] = lifting_target_label[..., :2] / w * 2 - [1, h / w]
            lifting_target_label[..., 2] = lifting_target_label[..., 2] / w * 2
        # Make the 3D target root-relative.
        lifting_target_label[..., :, :] = lifting_target_label[
            ..., :, :] - lifting_target_label[...,
                                              self.root_index:self.root_index +
                                              1, :]
        # A missing or zero-leading external factor falls back to the
        # computed one; a flat (T,) factor is lifted to (T, 1).
        if factor is None or factor[0] == 0:
            factor = factor_
        if factor.ndim == 1:
            factor = factor[:, None]
        if self.mode == 'test':
            lifting_target_label *= factor[..., None]

        if self.concat_vis:
            # Append visibility as an extra channel on the keypoint axis.
            keypoints_visible_ = keypoints_visible
            if keypoints_visible.ndim == 2:
                keypoints_visible_ = keypoints_visible[..., None]
            keypoint_labels = np.concatenate(
                (keypoint_labels, keypoints_visible_), axis=2)

        encoded['keypoint_labels'] = keypoint_labels
        encoded['keypoint_labels_visible'] = keypoints_visible
        encoded['lifting_target_label'] = lifting_target_label
        encoded['lifting_target_weight'] = lifting_target_weight
        encoded['lifting_target'] = lifting_target_label
        encoded['lifting_target_visible'] = lifting_target_visible
        encoded['factor'] = factor

        return encoded
[docs] def decode( self, encoded: np.ndarray, w: Optional[np.ndarray] = None, h: Optional[np.ndarray] = None, factor: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from normalized space to input image space. Args: encoded (np.ndarray): Coordinates in shape (N, K, C). w (np.ndarray, optional): The image widths in shape (N, ). Default: ``None``. h (np.ndarray, optional): The image heights in shape (N, ). Default: ``None``. factor (np.ndarray, optional): The factor for projection in shape (N, ). Default: ``None``. Returns: keypoints (np.ndarray): Decoded coordinates in shape (N, K, C). scores (np.ndarray): The keypoint scores in shape (N, K). """ keypoints = encoded.copy() scores = np.ones(keypoints.shape[:-1], dtype=np.float32) if self.rootrel: keypoints[..., 0, :] = 0 if w is not None and w.size > 0: assert w.shape == h.shape, (f'w and h should have the same shape, ' f'but got {w.shape} and {h.shape}.') assert w.shape[0] == keypoints.shape[0], ( f'w and h should have the same batch size, ' f'but got {w.shape[0]} and {keypoints.shape[0]}.') assert w.ndim in {1, 2}, (f'w and h should have 1 or 2 dimensions, ' f'but got {w.ndim}.') if w.ndim == 1: w = w[:, None] h = h[:, None] trans = np.append( np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :] keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2 keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2 if factor is not None and factor.size > 0: assert factor.shape[0] == keypoints.shape[0], ( f'factor should have the same batch size, ' f'but got {factor.shape[0]} and {keypoints.shape[0]}.') keypoints *= factor[..., None] keypoints[..., :, :] = keypoints[..., :, :] - keypoints[ ..., self.root_index:self.root_index + 1, :] keypoints /= 1000. return keypoints, scores
Read the Docs v: latest
Versions
latest
0.x
dev-1.x
Downloads
epub
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.