Source code for mmpose.datasets.datasets.body.posetrack18_video_dataset

# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Callable, List, Optional, Sequence, Union

import numpy as np
from mmengine.fileio import exists, get_local_path, load
from mmengine.utils import is_list_of
from xtcocotools.coco import COCO

from mmpose.registry import DATASETS
from mmpose.structures.bbox import bbox_xywh2xyxy
from ..base import BaseCocoStyleDataset


[docs]@DATASETS.register_module()
class PoseTrack18VideoDataset(BaseCocoStyleDataset):
    """PoseTrack18 dataset for video pose estimation.

    "Posetrack: A benchmark for human pose estimation and tracking", CVPR'2018.
    More details can be found in the `paper
    <https://arxiv.org/abs/1710.10000>`__ .

    PoseTrack2018 keypoints::

        0: 'nose',
        1: 'head_bottom',
        2: 'head_top',
        3: 'left_ear',
        4: 'right_ear',
        5: 'left_shoulder',
        6: 'right_shoulder',
        7: 'left_elbow',
        8: 'right_elbow',
        9: 'left_wrist',
        10: 'right_wrist',
        11: 'left_hip',
        12: 'right_hip',
        13: 'left_knee',
        14: 'right_knee',
        15: 'left_ankle',
        16: 'right_ankle'

    Args:
        ann_file (str): Annotation file path. Default: ''.
        bbox_file (str, optional): Detection result file path. If
            ``bbox_file`` is set, detected bboxes loaded from this file will
            be used instead of ground-truth bboxes. This setting is only for
            evaluation, i.e., ignored when ``test_mode`` is ``False``.
            Default: ``None``.
        data_mode (str): Specifies the mode of data samples: ``'topdown'`` or
            ``'bottomup'``. In ``'topdown'`` mode, each data sample contains
            one instance; while in ``'bottomup'`` mode, each data sample
            contains all instances in a image. Default: ``'topdown'``
        frame_weights (List[Union[int, float]] ): The weight of each frame
            for aggregation. The first weight is for the center frame, then on
            ascending order of frame indices. Note that the length of
            ``frame_weights`` should be consistent with the number of sampled
            frames. Default: [0.0, 1.0]
        frame_sampler_mode (str): Specifies the mode of frame sampler:
            ``'fixed'`` or ``'random'``. In ``'fixed'`` mode, each frame
            index relative to the center frame is fixed, specified by
            ``frame_indices``, while in ``'random'`` mode, each frame index
            relative to the center frame is sampled from ``frame_range``
            with certain randomness. Default: ``'random'``.
        frame_range (int | List[int], optional): The sampling range of
            supporting frames in the same video for center frame.
            Only valid when ``frame_sampler_mode`` is ``'random'``.
            Default: ``None``.
        num_sampled_frame(int, optional): The number of sampled frames, except
            the center frame. Only valid when ``frame_sampler_mode`` is
            ``'random'``. Default: 1.
        frame_indices (Sequence[int], optional): The sampled frame indices,
            including the center frame indicated by 0. Only valid when
            ``frame_sampler_mode`` is ``'fixed'``. Default: ``None``.
        ph_fill_len (int): The length of the placeholder to fill in the
            image filenames.  Default: 6
        metainfo (dict, optional): Meta information for dataset, such as class
            information. Default: ``None``.
        data_root (str, optional): The root directory for ``data_prefix`` and
            ``ann_file``. Default: ``None``.
        data_prefix (dict, optional): Prefix for training data. Default:
            ``dict(img='')``.
        filter_cfg (dict, optional): Config for filter data. Default: `None`.
        indices (int or Sequence[int], optional): Support using first few
            data in annotation file to facilitate training/testing on a smaller
            dataset. Default: ``None`` which means using all ``data_infos``.
        serialize_data (bool, optional): Whether to hold memory using
            serialized objects, when enabled, data loader workers can use
            shared RAM from master process instead of making a copy.
            Default: ``True``.
        pipeline (list, optional): Processing pipeline. Default: [].
        test_mode (bool, optional): ``test_mode=True`` means in test phase.
            Default: ``False``.
        lazy_init (bool, optional): Whether to load annotation during
            instantiation. In some cases, such as visualization, only the meta
            information of the dataset is needed, which is not necessary to
            load annotation file. ``Basedataset`` can skip load annotations to
            save time by set ``lazy_init=False``. Default: ``False``.
        max_refetch (int, optional): If ``Basedataset.prepare_data`` get a
            None img. The maximum extra number of cycles to get a valid
            image. Default: 1000.
    """

    METAINFO: dict = dict(from_file='configs/_base_/datasets/posetrack18.py')

    def __init__(self,
                 ann_file: str = '',
                 bbox_file: Optional[str] = None,
                 data_mode: str = 'topdown',
                 frame_weights: List[Union[int, float]] = [0.0, 1.0],
                 frame_sampler_mode: str = 'random',
                 frame_range: Optional[Union[int, List[int]]] = None,
                 num_sampled_frame: Optional[int] = None,
                 frame_indices: Optional[Sequence[int]] = None,
                 ph_fill_len: int = 6,
                 metainfo: Optional[dict] = None,
                 data_root: Optional[str] = None,
                 data_prefix: dict = dict(img=''),
                 filter_cfg: Optional[dict] = None,
                 indices: Optional[Union[int, Sequence[int]]] = None,
                 serialize_data: bool = True,
                 pipeline: List[Union[dict, Callable]] = [],
                 test_mode: bool = False,
                 lazy_init: bool = False,
                 max_refetch: int = 1000):
        assert sum(frame_weights) == 1, 'Invalid `frame_weights`: should sum'\
            f' to 1.0, but got {frame_weights}.'
        for weight in frame_weights:
            assert weight >= 0, 'frame_weight can not be a negative value.'
        self.frame_weights = np.array(frame_weights)

        if frame_sampler_mode not in {'fixed', 'random'}:
            raise ValueError(
                f'{self.__class__.__name__} got invalid frame_sampler_mode: '
                f'{frame_sampler_mode}. Should be `"fixed"` or `"random"`.')
        self.frame_sampler_mode = frame_sampler_mode

        if frame_sampler_mode == 'random':
            assert frame_range is not None, \
                '`frame_sampler_mode` is set as `random`, ' \
                'please specify the `frame_range`.'

            if isinstance(frame_range, int):
                assert frame_range >= 0, \
                    'frame_range can not be a negative value.'
                self.frame_range = [-frame_range, frame_range]

            elif isinstance(frame_range, Sequence):
                assert len(frame_range) == 2, 'The length must be 2.'
                assert frame_range[0] <= 0 and frame_range[
                    1] >= 0 and frame_range[1] > frame_range[
                        0], 'Invalid `frame_range`'
                for i in frame_range:
                    assert isinstance(i, int), 'Each element must be int.'
                self.frame_range = frame_range
            else:
                raise TypeError(
                    f'The type of `frame_range` must be int or Sequence, '
                    f'but got {type(frame_range)}.')

            assert num_sampled_frame is not None, \
                '`frame_sampler_mode` is set as `random`, please specify ' \
                '`num_sampled_frame`, e.g. the number of sampled frames.'

            assert len(frame_weights) == num_sampled_frame + 1, \
                f'the length of frame_weights({len(frame_weights)}) '\
                f'does not match the number of sampled adjacent '\
                f'frames({num_sampled_frame})'
            self.frame_indices = None
            self.num_sampled_frame = num_sampled_frame

        if frame_sampler_mode == 'fixed':
            assert frame_indices is not None, \
                '`frame_sampler_mode` is set as `fixed`, ' \
                'please specify the `frame_indices`.'
            assert len(frame_weights) == len(frame_indices), \
                f'the length of frame_weights({len(frame_weights)}) does not '\
                f'match the length of frame_indices({len(frame_indices)}).'
            frame_indices.sort()
            self.frame_indices = frame_indices
            self.frame_range = None
            self.num_sampled_frame = None

        self.ph_fill_len = ph_fill_len

        super().__init__(
            ann_file=ann_file,
            bbox_file=bbox_file,
            data_mode=data_mode,
            metainfo=metainfo,
            data_root=data_root,
            data_prefix=data_prefix,
            filter_cfg=filter_cfg,
            indices=indices,
            serialize_data=serialize_data,
            pipeline=pipeline,
            test_mode=test_mode,
            lazy_init=lazy_init,
            max_refetch=max_refetch)

[docs]    def parse_data_info(self, raw_data_info: dict) -> Optional[dict]:
        """Parse raw annotation of an instance.

        Args:
            raw_data_info (dict): Raw data information loaded from
                ``ann_file``. It should have following contents:

                - ``'raw_ann_info'``: Raw annotation of an instance
                - ``'raw_img_info'``: Raw information of the image that
                    contains the instance

        Returns:
            dict: Parsed instance annotation
        """

        ann = raw_data_info['raw_ann_info']
        img = raw_data_info['raw_img_info']

        # filter invalid instance
        if 'bbox' not in ann or 'keypoints' not in ann or max(
                ann['keypoints']) == 0:
            return None

        img_w, img_h = img['width'], img['height']
        # get the bbox of the center frame
        # get bbox in shape [1, 4], formatted as xywh
        x, y, w, h = ann['bbox']
        x1 = np.clip(x, 0, img_w - 1)
        y1 = np.clip(y, 0, img_h - 1)
        x2 = np.clip(x + w, 0, img_w - 1)
        y2 = np.clip(y + h, 0, img_h - 1)

        bbox = np.array([x1, y1, x2, y2], dtype=np.float32).reshape(1, 4)

        # get the keypoints of the center frame
        # keypoints in shape [1, K, 2] and keypoints_visible in [1, K]
        _keypoints = np.array(
            ann['keypoints'], dtype=np.float32).reshape(1, -1, 3)
        keypoints = _keypoints[..., :2]
        keypoints_visible = np.minimum(1, _keypoints[..., 2])

        # deal with multiple image paths
        img_paths: list = []
        # get the image path of the center frame
        center_img_path = osp.join(self.data_prefix['img'], img['file_name'])
        # append the center image path first
        img_paths.append(center_img_path)

        # select the frame indices
        if self.frame_sampler_mode == 'fixed':
            indices = self.frame_indices
        else:  # self.frame_sampler_mode == 'random':
            low, high = self.frame_range
            indices = np.random.randint(low, high + 1, self.num_sampled_frame)

        nframes = int(img['nframes'])
        file_name = img['file_name']
        ref_idx = int(osp.splitext(osp.basename(file_name))[0])

        for idx in indices:
            if self.test_mode and idx == 0:
                continue
            # the supporting frame index
            support_idx = ref_idx + idx
            # clip the frame index to make sure that it does not exceed
            # the boundings of frame indices
            support_idx = np.clip(support_idx, 0, nframes - 1)
            sup_img_path = osp.join(
                osp.dirname(center_img_path),
                str(support_idx).zfill(self.ph_fill_len) + '.jpg')

            img_paths.append(sup_img_path)

        data_info = {
            'img_id': int(img['frame_id']),
            'img_path': img_paths,
            'bbox': bbox,
            'bbox_score': np.ones(1, dtype=np.float32),
            'num_keypoints': ann['num_keypoints'],
            'keypoints': keypoints,
            'keypoints_visible': keypoints_visible,
            'frame_weights': self.frame_weights,
            'id': ann['id'],
        }

        return data_info

    def _load_detection_results(self) -> List[dict]:
        """Load data from detection results with dummy keypoint annotations."""
        assert exists(self.ann_file), (
            f'Annotation file `{self.ann_file}` does not exist')
        assert exists(
            self.bbox_file), (f'Bbox file `{self.bbox_file}` does not exist')

        # load detection results
        det_results = load(self.bbox_file)
        assert is_list_of(det_results, dict), (
            f'annotation file `{self.bbox_file}` should be a list of dicts, '
            f'but got type {type(det_results)}')

        # load coco annotations to build image id-to-name index
        with get_local_path(self.ann_file) as local_path:
            self.coco = COCO(local_path)

        # mapping image name to id
        name2id = {}
        # mapping image id to name
        id2name = {}
        for img_id, image in self.coco.imgs.items():
            file_name = image['file_name']
            id2name[img_id] = file_name
            name2id[file_name] = img_id

        num_keypoints = self.metainfo['num_keypoints']
        data_list = []
        id_ = 0
        for det in det_results:
            # remove non-human instances
            if det['category_id'] != 1:
                continue

            # get the predicted bbox and bbox_score
            bbox_xywh = np.array(
                det['bbox'][:4], dtype=np.float32).reshape(1, 4)
            bbox = bbox_xywh2xyxy(bbox_xywh)
            bbox_score = np.array(det['score'], dtype=np.float32).reshape(1)

            # use dummy keypoint location and visibility
            keypoints = np.zeros((1, num_keypoints, 2), dtype=np.float32)
            keypoints_visible = np.ones((1, num_keypoints), dtype=np.float32)

            # deal with different bbox file formats
            if 'nframes' in det:
                nframes = int(det['nframes'])
            else:
                if 'image_name' in det:
                    img_id = name2id[det['image_name']]
                else:
                    img_id = det['image_id']
                img_ann = self.coco.loadImgs(img_id)[0]
                nframes = int(img_ann['nframes'])

            # deal with multiple image paths
            img_paths: list = []
            if 'image_name' in det:
                image_name = det['image_name']
            else:
                image_name = id2name[det['image_id']]
            # get the image path of the center frame
            center_img_path = osp.join(self.data_prefix['img'], image_name)
            # append the center image path first
            img_paths.append(center_img_path)

            # "images/val/012834_mpii_test/000000.jpg" -->> "000000.jpg"
            center_image_name = image_name.split('/')[-1]
            ref_idx = int(center_image_name.replace('.jpg', ''))

            # select the frame indices
            if self.frame_sampler_mode == 'fixed':
                indices = self.frame_indices
            else:  # self.frame_sampler_mode == 'random':
                low, high = self.frame_range
                indices = np.random.randint(low, high + 1,
                                            self.num_sampled_frame)

            for idx in indices:
                if self.test_mode and idx == 0:
                    continue
                # the supporting frame index
                support_idx = ref_idx + idx
                # clip the frame index to make sure that it does not exceed
                # the boundings of frame indices
                support_idx = np.clip(support_idx, 0, nframes - 1)
                sup_img_path = center_img_path.replace(
                    center_image_name,
                    str(support_idx).zfill(self.ph_fill_len) + '.jpg')

                img_paths.append(sup_img_path)

            data_list.append({
                'img_id': det['image_id'],
                'img_path': img_paths,
                'frame_weights': self.frame_weights,
                'bbox': bbox,
                'bbox_score': bbox_score,
                'keypoints': keypoints,
                'keypoints_visible': keypoints_visible,
                'id': id_,
            })

            id_ += 1

        return data_list