Source code for mmpose.datasets.datasets.hand.interhand2d_double_dataset
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import json
import os.path as osp
from typing import Callable, List, Optional, Sequence, Tuple, Union
import numpy as np
from mmengine.fileio import exists, get_local_path
from mmengine.utils import is_abs
from xtcocotools.coco import COCO
from mmpose.codecs.utils import camera_to_pixel
from mmpose.datasets.datasets import BaseCocoStyleDataset
from mmpose.registry import DATASETS
from mmpose.structures.bbox import bbox_xywh2xyxy
@DATASETS.register_module()
class InterHand2DDoubleDataset(BaseCocoStyleDataset):
"""InterHand2.6M dataset for 2d double hands.
"InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose
Estimation from a Single RGB Image", ECCV'2020.
More details can be found in the `paper
<https://arxiv.org/pdf/2008.09309.pdf>`__ .
    The dataset loads raw features and applies specified transforms
    to return a dict containing the image tensors and other information.
InterHand2.6M keypoint indexes::
0: 'r_thumb4',
1: 'r_thumb3',
2: 'r_thumb2',
3: 'r_thumb1',
4: 'r_index4',
5: 'r_index3',
6: 'r_index2',
7: 'r_index1',
8: 'r_middle4',
9: 'r_middle3',
10: 'r_middle2',
11: 'r_middle1',
12: 'r_ring4',
13: 'r_ring3',
14: 'r_ring2',
15: 'r_ring1',
16: 'r_pinky4',
17: 'r_pinky3',
18: 'r_pinky2',
19: 'r_pinky1',
20: 'r_wrist',
21: 'l_thumb4',
22: 'l_thumb3',
23: 'l_thumb2',
24: 'l_thumb1',
25: 'l_index4',
26: 'l_index3',
27: 'l_index2',
28: 'l_index1',
29: 'l_middle4',
30: 'l_middle3',
31: 'l_middle2',
32: 'l_middle1',
33: 'l_ring4',
34: 'l_ring3',
35: 'l_ring2',
36: 'l_ring1',
37: 'l_pinky4',
38: 'l_pinky3',
39: 'l_pinky2',
40: 'l_pinky1',
41: 'l_wrist'
Args:
ann_file (str): Annotation file path. Default: ''.
camera_param_file (str): Cameras' parameters file. Default: ''.
joint_file (str): Path to the joint file. Default: ''.
        use_gt_root_depth (bool): Whether to use the ground-truth depth of
            the wrist; if ``False``, the depth predicted by RootNet is read
            from ``rootnet_result_file``. Default: ``True``.
rootnet_result_file (str): Path to the wrist depth file.
Default: ``None``.
        data_mode (str): Specifies the mode of data samples: ``'topdown'`` or
            ``'bottomup'``. In ``'topdown'`` mode, each data sample contains
            one instance; while in ``'bottomup'`` mode, each data sample
            contains all instances in an image. Default: ``'topdown'``.
metainfo (dict, optional): Meta information for dataset, such as class
information. Default: ``None``.
data_root (str, optional): The root directory for ``data_prefix`` and
``ann_file``. Default: ``None``.
data_prefix (dict, optional): Prefix for training data.
Default: ``dict(img='')``.
        filter_cfg (dict, optional): Config for filtering data.
            Default: ``None``.
        indices (int or Sequence[int], optional): Support using only the
            first few data samples in the annotation file to facilitate
            training/testing on a smaller dataset. Default: ``None``, which
            means using all ``data_infos``.
        serialize_data (bool, optional): Whether to hold memory using
            serialized objects. When enabled, data loader workers can use
            shared RAM from the master process instead of making a copy.
            Default: ``True``.
pipeline (list, optional): Processing pipeline. Default: [].
test_mode (bool, optional): ``test_mode=True`` means in test phase.
Default: ``False``.
        lazy_init (bool, optional): Whether to load annotations during
            instantiation. In some cases, such as visualization, only the
            meta information of the dataset is needed, and it is not
            necessary to load the annotation file. ``BaseDataset`` can skip
            loading annotations to save time by setting ``lazy_init=True``.
            Default: ``False``.
        max_refetch (int, optional): The maximum number of extra cycles to
            fetch a valid image if ``BaseDataset.prepare_data`` gets a
            ``None`` image. Default: 1000.
sample_interval (int, optional): The sample interval of the dataset.
Default: 1.
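
    Example (an illustrative config sketch; the file paths below are
    placeholders, not paths shipped with this codebase)::

        dataset = dict(
            type='InterHand2DDoubleDataset',
            data_root='data/interhand2.6m',                # placeholder
            ann_file='annotations/anns.json',              # placeholder
            camera_param_file='annotations/cameras.json',  # placeholder
            joint_file='annotations/joints.json',          # placeholder
            use_gt_root_depth=True,
            data_mode='topdown',
            data_prefix=dict(img='images/'),               # placeholder
            pipeline=[])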
"""
METAINFO: dict = dict(from_file='configs/_base_/datasets/interhand3d.py')
def __init__(self,
ann_file: str = '',
camera_param_file: str = '',
joint_file: str = '',
use_gt_root_depth: bool = True,
rootnet_result_file: Optional[str] = None,
data_mode: str = 'topdown',
metainfo: Optional[dict] = None,
data_root: Optional[str] = None,
data_prefix: dict = dict(img=''),
filter_cfg: Optional[dict] = None,
indices: Optional[Union[int, Sequence[int]]] = None,
serialize_data: bool = True,
pipeline: List[Union[dict, Callable]] = [],
test_mode: bool = False,
lazy_init: bool = False,
max_refetch: int = 1000,
sample_interval: int = 1):
_ann_file = ann_file
if data_root is not None and not is_abs(_ann_file):
_ann_file = osp.join(data_root, _ann_file)
assert exists(_ann_file), 'Annotation file does not exist.'
self.ann_file = _ann_file
_camera_param_file = camera_param_file
if data_root is not None and not is_abs(_camera_param_file):
_camera_param_file = osp.join(data_root, _camera_param_file)
assert exists(_camera_param_file), 'Camera file does not exist.'
self.camera_param_file = _camera_param_file
_joint_file = joint_file
if data_root is not None and not is_abs(_joint_file):
_joint_file = osp.join(data_root, _joint_file)
assert exists(_joint_file), 'Joint file does not exist.'
self.joint_file = _joint_file
self.use_gt_root_depth = use_gt_root_depth
if not self.use_gt_root_depth:
assert rootnet_result_file is not None
_rootnet_result_file = rootnet_result_file
if data_root is not None and not is_abs(_rootnet_result_file):
_rootnet_result_file = osp.join(data_root,
_rootnet_result_file)
assert exists(
_rootnet_result_file), 'Rootnet result file does not exist.'
self.rootnet_result_file = _rootnet_result_file
super().__init__(
ann_file=ann_file,
metainfo=metainfo,
data_mode=data_mode,
data_root=data_root,
data_prefix=data_prefix,
filter_cfg=filter_cfg,
indices=indices,
serialize_data=serialize_data,
pipeline=pipeline,
test_mode=test_mode,
lazy_init=lazy_init,
max_refetch=max_refetch,
sample_interval=sample_interval)
def _load_annotations(self) -> Tuple[List[dict], List[dict]]:
"""Load data from annotations in COCO format."""
assert exists(self.ann_file), 'Annotation file does not exist'
with get_local_path(self.ann_file) as local_path:
self.coco = COCO(local_path)
# set the metainfo about categories, which is a list of dict
# and each dict contains the 'id', 'name', etc. about this category
if 'categories' in self.coco.dataset:
self._metainfo['CLASSES'] = self.coco.loadCats(
self.coco.getCatIds())
with get_local_path(self.camera_param_file) as local_path:
with open(local_path, 'r') as f:
self.cameras = json.load(f)
with get_local_path(self.joint_file) as local_path:
with open(local_path, 'r') as f:
self.joints = json.load(f)
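        # self.cameras is indexed as
        #   cameras[capture_id]['campos'|'camrot'|'focal'|'princpt'][camera]
        # and self.joints as joints[capture_id][frame_idx]['world_coord'],
        # matching the lookups in ``parse_data_info`` below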
instance_list = []
image_list = []
for idx, img_id in enumerate(self.coco.getImgIds()):
if idx % self.sample_interval != 0:
continue
img = self.coco.loadImgs(img_id)[0]
img.update({
'img_id':
img_id,
'img_path':
osp.join(self.data_prefix['img'], img['file_name']),
})
image_list.append(img)
ann_ids = self.coco.getAnnIds(imgIds=img_id)
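            # use only the first annotation of each image
            # (one two-hand instance per image)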
ann = self.coco.loadAnns(ann_ids)[0]
instance_info = self.parse_data_info(
dict(raw_ann_info=ann, raw_img_info=img))
# skip invalid instance annotation.
if not instance_info:
continue
instance_list.append(instance_info)
return instance_list, image_list
    def parse_data_info(self, raw_data_info: dict) -> Optional[dict]:
"""Parse raw COCO annotation of an instance.
Args:
raw_data_info (dict): Raw data information loaded from
``ann_file``. It should have following contents:
- ``'raw_ann_info'``: Raw annotation of an instance
- ``'raw_img_info'``: Raw information of the image that
contains the instance
Returns:
dict | None: Parsed instance annotation
"""
ann = raw_data_info['raw_ann_info']
img = raw_data_info['raw_img_info']
if not self.use_gt_root_depth:
rootnet_result = {}
            with get_local_path(self.rootnet_result_file) as local_path:
                with open(local_path, 'r') as f:
                    rootnet_annot = json.load(f)
for i in range(len(rootnet_annot)):
rootnet_result[str(
rootnet_annot[i]['annot_id'])] = rootnet_annot[i]
num_keypoints = self.metainfo['num_keypoints']
capture_id = str(img['capture'])
camera_name = img['camera']
frame_idx = str(img['frame_idx'])
camera_pos = np.array(
self.cameras[capture_id]['campos'][camera_name], dtype=np.float32)
camera_rot = np.array(
self.cameras[capture_id]['camrot'][camera_name], dtype=np.float32)
focal = np.array(
self.cameras[capture_id]['focal'][camera_name], dtype=np.float32)
principal_pt = np.array(
self.cameras[capture_id]['princpt'][camera_name], dtype=np.float32)
joint_world = np.array(
self.joints[capture_id][frame_idx]['world_coord'],
dtype=np.float32)
joint_valid = np.array(ann['joint_valid'], dtype=np.float32).flatten()
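        # transform joints from world coordinates to camera coordinates:
        # X_cam = R @ (X_world - campos)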
keypoints_cam = np.dot(
camera_rot,
joint_world.transpose(1, 0) -
camera_pos.reshape(3, 1)).transpose(1, 0)
if self.use_gt_root_depth:
bbox_xywh = np.array(ann['bbox'], dtype=np.float32).reshape(1, 4)
else:
rootnet_ann_data = rootnet_result[str(ann['id'])]
bbox_xywh = np.array(
rootnet_ann_data['bbox'], dtype=np.float32).reshape(1, 4)
bbox = bbox_xywh2xyxy(bbox_xywh)
# 41: 'l_wrist', left hand root
# 20: 'r_wrist', right hand root
# if root is not valid -> root-relative 3D pose is also not valid.
# Therefore, mark all joints as invalid
joint_valid[:20] *= joint_valid[20]
joint_valid[21:] *= joint_valid[41]
joints_3d_visible = np.minimum(1,
joint_valid.reshape(-1,
1)).reshape(1, -1)
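        # project camera-space keypoints onto the image plane with the
        # pinhole model: u = fx * x / z + cx, v = fy * y / z + cy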
keypoints_img = camera_to_pixel(
keypoints_cam,
focal[0],
focal[1],
principal_pt[0],
principal_pt[1],
shift=True)[..., :2]
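        # assemble a (1, K, 3) array: xy in pixel coordinates, z as depth
        # relative to the hand root (r_wrist/20 for the right hand,
        # l_wrist/41 for the left hand)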
joints_3d = np.zeros((keypoints_cam.shape[-2], 3),
dtype=np.float32).reshape(1, -1, 3)
joints_3d[..., :2] = keypoints_img
joints_3d[..., :21,
2] = keypoints_cam[..., :21, 2] - keypoints_cam[..., 20, 2]
joints_3d[..., 21:,
2] = keypoints_cam[..., 21:, 2] - keypoints_cam[..., 41, 2]
data_info = {
'img_id': ann['image_id'],
'img_path': img['img_path'],
'keypoints': joints_3d[:, :, :2],
'keypoints_visible': joints_3d_visible,
'hand_type': self.encode_handtype(ann['hand_type']),
'hand_type_valid': np.array([ann['hand_type_valid']]),
'dataset': self.metainfo['dataset_name'],
'bbox': bbox,
'bbox_score': np.ones(1, dtype=np.float32),
'num_keypoints': num_keypoints,
'iscrowd': ann.get('iscrowd', False),
'id': ann['id'],
# store the raw annotation of the instance
# it is useful for evaluation without providing ann_file
'raw_ann_info': copy.deepcopy(ann),
}
return data_info
@staticmethod
def encode_handtype(hand_type):
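        """Encode the hand type string ('right', 'left' or 'interacting')
        as a (1, 2) array of (right, left) flags."""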
if hand_type == 'right':
return np.array([[1, 0]], dtype=np.float32)
elif hand_type == 'left':
return np.array([[0, 1]], dtype=np.float32)
elif hand_type == 'interacting':
return np.array([[1, 1]], dtype=np.float32)
else:
            assert 0, f'Unsupported hand type: {hand_type}'