# Source code for mmpose.codecs.annotation_processors

# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Tuple

import numpy as np

from mmpose.registry import KEYPOINT_CODECS
from .base import BaseKeypointCodec

# Large-magnitude sentinels written over the coordinates of invisible
# keypoints so that they can never win a min (INF) or max (NEG_INF)
# reduction when a bounding box is expanded to enclose its keypoints.
INF = 1e6
NEG_INF = -1e6


class BaseAnnotationProcessor(BaseKeypointCodec):
    """Shared parent class of the annotation processors in this module.

    It only supplies a do-nothing :meth:`decode`; subclasses are expected
    to provide their own ``encode``.
    """

    def decode(self, *args, **kwargs):
        """Accept any arguments, perform no work and return ``None``."""
        return None


@KEYPOINT_CODECS.register_module()
class YOLOXPoseAnnotationProcessor(BaseAnnotationProcessor):
    """Convert dataset annotations to the input format of YOLOX-Pose.

    This processor optionally expands bounding boxes so that they enclose
    every visible keypoint, and converts category IDs to labels by
    subtracting one.

    Args:
        expand_bbox (bool, optional): Whether to expand the bounding box
            to include all keypoints. Defaults to False.
        input_size (tuple, optional): The size of the input image for the
            model, formatted as (h, w). This argument is accepted because
            the codec needs it in deployment, but it is neither stored nor
            used by this class. Defaults to None.
    """

    # NOTE(review): the three tables below are presumably read by the base
    # codec / data pipeline to decide which annotation fields are passed to
    # ``encode`` and how encoded fields are renamed when packed -- confirm
    # against ``BaseKeypointCodec``.
    auxiliary_encode_keys = {'category_id', 'bbox'}
    label_mapping_table = dict(
        bbox='bboxes',
        bbox_labels='labels',
        keypoints='keypoints',
        keypoints_visible='keypoints_visible',
        area='areas',
    )
    instance_mapping_table = dict(
        bbox='bboxes',
        bbox_score='bbox_scores',
        keypoints='keypoints',
        keypoints_visible='keypoints_visible',
        # remove 'bbox_scales' in default instance_mapping_table to avoid
        # length mismatch during training with multiple datasets
    )

    def __init__(self,
                 expand_bbox: bool = False,
                 input_size: Optional[Tuple] = None):
        super().__init__()
        # ``input_size`` is intentionally ignored (see the class docstring).
        self.expand_bbox = expand_bbox

    def encode(self,
               keypoints: Optional[np.ndarray] = None,
               keypoints_visible: Optional[np.ndarray] = None,
               bbox: Optional[np.ndarray] = None,
               category_id: Optional[List[int]] = None
               ) -> Dict[str, np.ndarray]:
        """Encode keypoints, bounding boxes, and category IDs.

        The input arrays are never modified; an expanded box is written to
        a fresh copy.

        Args:
            keypoints (np.ndarray, optional): Keypoints array. The box
                expansion reduces over axis 1 and compares the result with
                the first / last two bbox columns. If None, the bounding
                box is returned unexpanded. Defaults to None.
            keypoints_visible (np.ndarray, optional): Visibility array for
                keypoints. Entries equal to 0 mark keypoints that are
                ignored during expansion. A 3-dimensional array is reduced
                to its first channel. If None, every keypoint is treated
                as visible. Defaults to None.
            bbox (np.ndarray, optional): Bounding box array whose first two
                columns are lower bounds and remaining columns are upper
                bounds. Defaults to None.
            category_id (List[int], optional): List of category IDs. Defaults
                to None.

        Returns:
            Dict[str, np.ndarray]: Encoded annotations. Contains ``'bbox'``
            when ``expand_bbox`` is enabled and ``bbox`` is given, and
            ``'bbox_labels'`` (``category_id - 1`` as ``int8``) when
            ``category_id`` is given.
        """
        results = {}

        if self.expand_bbox and bbox is not None:
            # Copy first so the caller's bbox array is left untouched.
            expanded = np.array(bbox)

            if keypoints is not None:
                kpts_min = keypoints.copy()
                kpts_max = keypoints.copy()

                if keypoints_visible is not None:
                    # Handle keypoints visibility
                    if keypoints_visible.ndim == 3:
                        keypoints_visible = keypoints_visible[..., 0]

                    # Neutralise invisible keypoints so they cannot
                    # influence either reduction below.
                    invisible = keypoints_visible == 0
                    kpts_min[invisible] = INF
                    kpts_max[invisible] = NEG_INF

                # ``initial`` keeps the reductions defined even when there
                # are no keypoints along axis 1; the sentinel then leaves
                # the box unchanged.
                expanded[..., :2] = np.minimum(
                    expanded[..., :2], kpts_min.min(axis=1, initial=INF))
                expanded[..., 2:] = np.maximum(
                    expanded[..., 2:], kpts_max.max(axis=1, initial=NEG_INF))

            results['bbox'] = expanded

        if category_id is not None:
            # Convert category IDs to labels.
            # NOTE(review): the int8 cast is kept for output-dtype
            # compatibility; category IDs above 127 wrap around silently.
            bbox_labels = np.array(category_id).astype(np.int8) - 1
            results['bbox_labels'] = bbox_labels

        return results