Shortcuts

Source code for mmpose.codecs.motionbert_label

# Copyright (c) OpenMMLab. All rights reserved.

from copy import deepcopy
from typing import Optional, Tuple

import numpy as np

from mmpose.registry import KEYPOINT_CODECS
from .base import BaseKeypointCodec
from .utils import camera_to_image_coord


@KEYPOINT_CODECS.register_module()
class MotionBERTLabel(BaseKeypointCodec):
    r"""Generate keypoint and label coordinates for `MotionBERT`_ by Zhu et
    al (2022).

    Note:

        - instance number: N
        - keypoint number: K
        - keypoint dimension: D
        - pose-lifting target dimension: C

    Args:
        num_keypoints (int): The number of keypoints in the dataset.
        root_index (int): Root keypoint index in the pose. Default: 0.
        remove_root (bool): If true, remove the root keypoint from the pose.
            Default: ``False``.
        save_index (bool): If true, store the root position separated from the
            original pose, only takes effect if ``remove_root`` is ``True``.
            Default: ``False``.
        concat_vis (bool): If true, concat the visibility item of keypoints.
            Default: ``False``.
        rootrel (bool): If true, the root keypoint will be set to the
            coordinate origin. Default: ``False``.
        mode (str): Indicating whether the current mode is 'train' or 'test'.
            Default: ``'test'``.
    """

    # Extra keys (beyond keypoints/keypoints_visible) that the dataset
    # pipeline must pass through to ``encode``.
    auxiliary_encode_keys = {
        'lifting_target', 'lifting_target_visible', 'camera_param', 'factor'
    }

    # Mapping from encoded dict keys to per-instance data fields.
    instance_mapping_table = dict(
        lifting_target='lifting_target',
        lifting_target_visible='lifting_target_visible',
    )

    # Mapping from encoded dict keys to label fields consumed by the head.
    label_mapping_table = dict(
        trajectory_weights='trajectory_weights',
        lifting_target_label='lifting_target_label',
        lifting_target_weight='lifting_target_weight')

    def __init__(self,
                 num_keypoints: int,
                 root_index: int = 0,
                 remove_root: bool = False,
                 save_index: bool = False,
                 concat_vis: bool = False,
                 rootrel: bool = False,
                 mode: str = 'test'):
        super().__init__()
        self.num_keypoints = num_keypoints
        self.root_index = root_index
        self.remove_root = remove_root
        self.save_index = save_index
        self.concat_vis = concat_vis
        self.rootrel = rootrel
        # Mode is normalized to lowercase; only 'train'/'test' are supported
        # because encode() branches on it for target normalization.
        assert mode.lower() in {'train', 'test'
                                }, (f'Unsupported mode {mode}, '
                                    'mode should be one of ("train", "test").')
        self.mode = mode.lower()
    def encode(self,
               keypoints: np.ndarray,
               keypoints_visible: Optional[np.ndarray] = None,
               lifting_target: Optional[np.ndarray] = None,
               lifting_target_visible: Optional[np.ndarray] = None,
               camera_param: Optional[dict] = None,
               factor: Optional[np.ndarray] = None) -> dict:
        """Encoding keypoints from input image space to normalized space.

        Args:
            keypoints (np.ndarray): Keypoint coordinates in shape
                (B, T, K, D).
            keypoints_visible (np.ndarray, optional): Keypoint visibilities
                in shape (B, T, K).
            lifting_target (np.ndarray, optional): 3d target coordinate in
                shape (T, K, C).
            lifting_target_visible (np.ndarray, optional): Target coordinate
                in shape (T, K, ).
            camera_param (dict, optional): The camera parameter dictionary.
            factor (np.ndarray, optional): The factor mapping camera and
                image coordinate in shape (T, ).

        Returns:
            encoded (dict): Contains the following items:

                - keypoint_labels (np.ndarray): The processed keypoints in
                  shape like (N, K, D).
                - keypoint_labels_visible (np.ndarray): The processed
                  keypoints' weights in shape (N, K, ) or (N, K-1, ).
                - lifting_target_label: The processed target coordinate in
                  shape (K, C) or (K-1, C).
                - lifting_target_weight (np.ndarray): The target weights in
                  shape (K, ) or (K-1, ).
                - factor (np.ndarray): The factor mapping camera and image
                  coordinate in shape (T, 1).
        """
        if keypoints_visible is None:
            # NOTE(review): shape[:2] yields (B, T) while the docstring says
            # visibility is (B, T, K) -- confirm intended default shape
            # against callers.
            keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)

        # set initial value for `lifting_target_weight`
        if lifting_target_visible is None:
            lifting_target_visible = np.ones(
                lifting_target.shape[:-1], dtype=np.float32)
            lifting_target_weight = lifting_target_visible
        else:
            # Binarize visibility into hard 0/1 training weights.
            valid = lifting_target_visible > 0.5
            lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32)

        if camera_param is None:
            camera_param = dict()

        encoded = dict()

        assert lifting_target is not None
        lifting_target_label = lifting_target.copy()
        keypoint_labels = keypoints.copy()

        assert keypoint_labels.ndim in {
            2, 3
        }, (f'Keypoint labels should have 2 or 3 dimensions, '
            f'but got {keypoint_labels.ndim}.')
        if keypoint_labels.ndim == 2:
            # Promote a single frame (K, D) to a batch of one (1, K, D).
            keypoint_labels = keypoint_labels[None, ...]

        # Normalize the 2D keypoint coordinate with image width and height:
        # x -> x/w*2 - 1, y -> y/w*2 - h/w, mapping pixels into roughly
        # [-1, 1] with the aspect ratio preserved via the shared divisor w.
        _camera_param = deepcopy(camera_param)
        assert 'w' in _camera_param and 'h' in _camera_param, (
            'Camera parameters should contain "w" and "h".')
        w, h = _camera_param['w'], _camera_param['h']
        keypoint_labels[
            ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w]

        # convert target to image coordinate
        # Fallback scale factor of 4 per frame, overridden below when full
        # intrinsics ('f', 'c') are available.
        T = keypoint_labels.shape[0]
        factor_ = np.array([4] * T, dtype=np.float32).reshape(T, )
        if 'f' in _camera_param and 'c' in _camera_param:
            lifting_target_label, factor_ = camera_to_image_coord(
                self.root_index, lifting_target_label, _camera_param)
        if self.mode == 'train':
            # presumably converts image size from pixels to a metric-like
            # scale before normalizing the 3D target -- TODO confirm units.
            w, h = w / 1000, h / 1000
            lifting_target_label[
                ..., :2] = lifting_target_label[..., :2] / w * 2 - [1, h / w]
            lifting_target_label[..., 2] = lifting_target_label[..., 2] / w * 2
        # Make the 3D target root-relative.
        lifting_target_label[..., :, :] = lifting_target_label[
            ..., :, :] - lifting_target_label[...,
                                              self.root_index:self.root_index +
                                              1, :]
        # A missing or zero-leading external factor falls back to the
        # computed one; a flat (T,) factor is lifted to (T, 1).
        if factor is None or factor[0] == 0:
            factor = factor_
        if factor.ndim == 1:
            factor = factor[:, None]
        if self.mode == 'test':
            lifting_target_label *= factor[..., None]

        if self.concat_vis:
            # Append visibility as an extra channel on the keypoint axis.
            keypoints_visible_ = keypoints_visible
            if keypoints_visible.ndim == 2:
                keypoints_visible_ = keypoints_visible[..., None]
            keypoint_labels = np.concatenate(
                (keypoint_labels, keypoints_visible_), axis=2)

        encoded['keypoint_labels'] = keypoint_labels
        encoded['keypoint_labels_visible'] = keypoints_visible
        encoded['lifting_target_label'] = lifting_target_label
        encoded['lifting_target_weight'] = lifting_target_weight
        encoded['lifting_target'] = lifting_target_label
        encoded['lifting_target_visible'] = lifting_target_visible
        encoded['factor'] = factor

        return encoded
[docs] def decode( self, encoded: np.ndarray, w: Optional[np.ndarray] = None, h: Optional[np.ndarray] = None, factor: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from normalized space to input image space. Args: encoded (np.ndarray): Coordinates in shape (N, K, C). w (np.ndarray, optional): The image widths in shape (N, ). Default: ``None``. h (np.ndarray, optional): The image heights in shape (N, ). Default: ``None``. factor (np.ndarray, optional): The factor for projection in shape (N, ). Default: ``None``. Returns: keypoints (np.ndarray): Decoded coordinates in shape (N, K, C). scores (np.ndarray): The keypoint scores in shape (N, K). """ keypoints = encoded.copy() scores = np.ones(keypoints.shape[:-1], dtype=np.float32) if self.rootrel: keypoints[..., 0, :] = 0 if w is not None and w.size > 0: assert w.shape == h.shape, (f'w and h should have the same shape, ' f'but got {w.shape} and {h.shape}.') assert w.shape[0] == keypoints.shape[0], ( f'w and h should have the same batch size, ' f'but got {w.shape[0]} and {keypoints.shape[0]}.') assert w.ndim in {1, 2}, (f'w and h should have 1 or 2 dimensions, ' f'but got {w.ndim}.') if w.ndim == 1: w = w[:, None] h = h[:, None] trans = np.append( np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :] keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2 keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2 if factor is not None and factor.size > 0: assert factor.shape[0] == keypoints.shape[0], ( f'factor should have the same batch size, ' f'but got {factor.shape[0]} and {keypoints.shape[0]}.') keypoints *= factor[..., None] keypoints[..., :, :] = keypoints[..., :, :] - keypoints[ ..., self.root_index:self.root_index + 1, :] keypoints /= 1000. return keypoints, scores
Read the Docs v: latest
Versions
latest
0.x
dev-1.x
Downloads
epub
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.