Source code for mmpose.codecs.edpose_label

# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import numpy as np

from mmpose.registry import KEYPOINT_CODECS
from mmpose.structures import bbox_cs2xyxy, bbox_xyxy2cs
from .base import BaseKeypointCodec


[docs]@KEYPOINT_CODECS.register_module()
class EDPoseLabel(BaseKeypointCodec):
    r"""Generate keypoint and label coordinates for `ED-Pose`_ by
    Yang J. et al (2023).

    Note:

        - instance number: N
        - keypoint number: K
        - keypoint dimension: D
        - image size: [w, h]

    Encoded:

        - keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D)
        - keypoints_visible (np.ndarray): Keypoint visibility in shape
                (N, K, D)
        - area (np.ndarray): Area in shape (N)
        - bbox (np.ndarray): Bbox in shape (N, 4)

    Args:
        num_select (int): The number of candidate instances
        num_keypoints (int): The Number of keypoints
    """

    auxiliary_encode_keys = {'area', 'bboxes', 'img_shape'}
    instance_mapping_table = dict(
        bbox='bboxes',
        keypoints='keypoints',
        keypoints_visible='keypoints_visible',
        area='areas',
    )

    def __init__(self, num_select: int = 100, num_keypoints: int = 17):
        super().__init__()

        self.num_select = num_select
        self.num_keypoints = num_keypoints

[docs]    def encode(
        self,
        img_shape,
        keypoints: np.ndarray,
        keypoints_visible: Optional[np.ndarray] = None,
        area: Optional[np.ndarray] = None,
        bboxes: Optional[np.ndarray] = None,
    ) -> dict:
        """Encoding keypoints, area and bbox from input image space to
        normalized space.

        Args:
            - img_shape (Sequence[int]): The shape of image in the format
                of (width, height).
            - keypoints (np.ndarray): Keypoint coordinates in
                shape (N, K, D).
            - keypoints_visible (np.ndarray): Keypoint visibility in shape
                (N, K)
            - area (np.ndarray):
            - bboxes (np.ndarray):

        Returns:
            encoded (dict): Contains the following items:

                - keypoint_labels (np.ndarray): The processed keypoints in
                    shape like (N, K, D).
                - keypoints_visible (np.ndarray): Keypoint visibility in shape
                    (N, K, D)
                - area_labels (np.ndarray): The processed target
                    area in shape (N).
                - bboxes_labels: The processed target bbox in
                    shape (N, 4).
        """
        w, h = img_shape

        if keypoints_visible is None:
            keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)

        if bboxes is not None:
            bboxes = np.concatenate(bbox_xyxy2cs(bboxes), axis=-1)
            bboxes = bboxes / np.array([w, h, w, h], dtype=np.float32)

        if area is not None:
            area = area / float(w * h)

        if keypoints is not None:
            keypoints = keypoints / np.array([w, h], dtype=np.float32)

        encoded = dict(
            keypoints=keypoints,
            area=area,
            bbox=bboxes,
            keypoints_visible=keypoints_visible)

        return encoded

[docs]    def decode(self, input_shapes: np.ndarray, pred_logits: np.ndarray,
               pred_boxes: np.ndarray, pred_keypoints: np.ndarray):
        """Select the final top-k keypoints, and decode the results from
        normalize size to origin input size.

        Args:
            input_shapes (Tensor): The size of input image resize.
            test_cfg (ConfigType): Config of testing.
            pred_logits (Tensor): The result of score.
            pred_boxes (Tensor): The result of bbox.
            pred_keypoints (Tensor): The result of keypoints.

        Returns:
            tuple: Decoded boxes, keypoints, and keypoint scores.
        """

        # Initialization
        num_keypoints = self.num_keypoints
        prob = pred_logits.reshape(-1)

        # Select top-k instances based on prediction scores
        topk_indexes = np.argsort(-prob)[:self.num_select]
        topk_values = np.take_along_axis(prob, topk_indexes, axis=0)
        scores = np.tile(topk_values[:, np.newaxis], [1, num_keypoints])

        # Decode bounding boxes
        topk_boxes = topk_indexes // pred_logits.shape[1]
        boxes = bbox_cs2xyxy(*np.split(pred_boxes, [2], axis=-1))
        boxes = np.take_along_axis(
            boxes, np.tile(topk_boxes[:, np.newaxis], [1, 4]), axis=0)

        # Convert from relative to absolute coordinates
        img_h, img_w = np.split(input_shapes, 2, axis=0)
        scale_fct = np.hstack([img_w, img_h, img_w, img_h])
        boxes = boxes * scale_fct[np.newaxis, :]

        # Decode keypoints
        topk_keypoints = topk_indexes // pred_logits.shape[1]
        keypoints = np.take_along_axis(
            pred_keypoints,
            np.tile(topk_keypoints[:, np.newaxis], [1, num_keypoints * 3]),
            axis=0)
        keypoints = keypoints[:, :(num_keypoints * 2)]
        keypoints = keypoints * np.tile(
            np.hstack([img_w, img_h]), [num_keypoints])[np.newaxis, :]
        keypoints = keypoints.reshape(-1, num_keypoints, 2)

        return boxes, keypoints, scores