Datasets ¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep High-Resolution Representation Learning for Human Pose Estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019}
}

Animal-Pose (ICCV'2019)

@inproceedings{Cao_2019_ICCV,
  author    = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing},
  title     = {Cross-Domain Adaptation for Animal Pose Estimation},
  booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
  month     = oct,
  year      = {2019}
}

Results on AnimalPose validation set (1117 instances)

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32	256x256	0.740	0.959	0.833	0.780	0.965	ckpt	log
pose_hrnet_w48	256x256	0.738	0.958	0.831	0.778	0.962	ckpt	log

Topdown Heatmap + Resnet on Animalpose¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

Animal-Pose (ICCV'2019)

@inproceedings{Cao_2019_ICCV,
  author    = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing},
  title     = {Cross-Domain Adaptation for Animal Pose Estimation},
  booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
  month     = oct,
  year      = {2019}
}

Results on AnimalPose validation set (1117 instances)

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnet_50	256x256	0.691	0.947	0.770	0.736	0.955	ckpt	log
pose_resnet_101	256x256	0.696	0.948	0.774	0.736	0.951	ckpt	log
pose_resnet_152	256x256	0.704	0.938	0.786	0.748	0.946	ckpt	log

Desert Locust (eLife’2019)¶

Topdown Heatmap + Resnet on Locust¶

Desert Locust (eLife'2019)

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

@article{graving2019deepposekit,
  author    = {Graving, Jacob M and Chae, Daniel and Naik, Hemal and Li, Liang and Koger, Benjamin and Costelloe, Blair R and Couzin, Iain D},
  title     = {{DeepPoseKit}, a Software Toolkit for Fast and Robust Animal Pose Estimation Using Deep Learning},
  journal   = {{eLife}},
  volume    = {8},
  pages     = {e47994},
  year      = {2019},
  publisher = {eLife Sciences Publications Limited}
}

Results on Desert Locust test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_resnet_50	160x160	1.000	0.900	2.27	ckpt	log
pose_resnet_101	160x160	1.000	0.907	2.03	ckpt	log
pose_resnet_152	160x160	1.000	0.925	1.49	ckpt	log

FreiHand (ICCV’2019)¶

Topdown Heatmap + Resnet on Freihand2d¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

FreiHand (ICCV'2019)

@inproceedings{zimmermann2019freihand,
  author    = {Zimmermann, Christian and Ceylan, Duygu and Yang, Jimei and Russell, Bryan and Argus, Max and Brox, Thomas},
  title     = {{FreiHAND}: A Dataset for Markerless Capture of Hand Pose and Shape from Single {RGB} Images},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
  pages     = {813--822},
  year      = {2019}
}

Results on FreiHand val & test set

Set	Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
test	pose_resnet_50	224x224	0.999	0.868	3.27	ckpt	log

COCO-WholeBody-Hand (ECCV’2020)¶

Topdown Heatmap + Mobilenetv2 + Coco + Wholebody + Hand on Coco_wholebody_hand¶

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted Residuals and Linear Bottlenecks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {4510--4520},
  year      = {2018}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_mobilenetv2	256x256	0.795	0.829	4.77	ckpt	log

Topdown Heatmap + Resnet + Coco + Wholebody + Hand on Coco_wholebody_hand¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_resnet_50	256x256	0.800	0.833	4.64	ckpt	log

Topdown Heatmap + Hrnetv2 + Dark + Coco + Wholebody + Hand on Coco_wholebody_hand¶

@article{WangSCJDZLMTWLX19,
  author  = {Wang, Jingdong and Sun, Ke and Cheng, Tianheng and
             Jiang, Borui and Deng, Chaorui and Zhao, Yang and Liu, Dong and Mu, Yadong and
             Tan, Mingkui and Wang, Xinggang and Liu, Wenyu and Xiao, Bin},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-Aware Coordinate Representation for Human Pose Estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {7093--7102},
  year      = {2020}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18_dark	256x256	0.814	0.840	4.37	ckpt	log

Topdown Heatmap + Hourglass + Coco + Wholebody + Hand on Coco_wholebody_hand¶

@inproceedings{newell2016stacked,
  author    = {Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  title     = {Stacked Hourglass Networks for Human Pose Estimation},
  booktitle = {European Conference on Computer Vision},
  pages     = {483--499},
  year      = {2016},
  publisher = {Springer}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hourglass_52	256x256	0.804	0.835	4.54	ckpt	log

Topdown Heatmap + Hrnetv2 + Coco + Wholebody + Hand on Coco_wholebody_hand¶

@article{WangSCJDZLMTWLX19,
  author  = {Wang, Jingdong and Sun, Ke and Cheng, Tianheng and
             Jiang, Borui and Deng, Chaorui and Zhao, Yang and Liu, Dong and Mu, Yadong and
             Tan, Mingkui and Wang, Xinggang and Liu, Wenyu and Xiao, Bin},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18	256x256	0.813	0.840	4.39	ckpt	log

Topdown Heatmap + Litehrnet + Coco + Wholebody + Hand on Coco_wholebody_hand¶

LiteHRNet (CVPR'2021)

@inproceedings{Yulitehrnet21,
  author    = {Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  title     = {{Lite-HRNet}: A Lightweight High-Resolution Network},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2021}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
LiteHRNet-18	256x256	0.795	0.830	4.77	ckpt	log

Topdown Heatmap + Scnet + Coco + Wholebody + Hand on Coco_wholebody_hand¶

@inproceedings{liu2020improving,
  author    = {Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  title     = {Improving Convolutional Networks with Self-Calibrated Convolutions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {10096--10105},
  year      = {2020}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_scnet_50	256x256	0.803	0.834	4.55	ckpt	log

Rtmpose + Rtmpose + Coco + Wholebody + Hand on Coco_wholebody_hand¶

@misc{lyu2022rtmdet,
  author        = {Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO-WholeBody-Hand val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
rtmpose_m	256x256	0.815	0.837	4.51	ckpt	log

AP-10K (NeurIPS’2021)¶

Topdown Heatmap + Resnet on Ap10k¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

@misc{yu2021ap10k,
  author        = {Yu, Hang and Xu, Yufei and Zhang, Jing and Zhao, Wei and Guan, Ziyu and Tao, Dacheng},
  title         = {{AP-10K}: A Benchmark for Animal Pose Estimation in the Wild},
  year          = {2021},
  eprint        = {2108.12617},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

Results on AP-10K validation set

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP^M	AP^L	ckpt	log
pose_resnet_50	256x256	0.680	0.926	0.738	0.552	0.687	ckpt	log
pose_resnet_101	256x256	0.681	0.921	0.751	0.545	0.690	ckpt	log

Topdown Heatmap + Hrnet on Ap10k¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep High-Resolution Representation Learning for Human Pose Estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019}
}

@misc{yu2021ap10k,
  author        = {Yu, Hang and Xu, Yufei and Zhang, Jing and Zhao, Wei and Guan, Ziyu and Tao, Dacheng},
  title         = {{AP-10K}: A Benchmark for Animal Pose Estimation in the Wild},
  year          = {2021},
  eprint        = {2108.12617},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

Results on AP-10K validation set

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP^M	AP^L	ckpt	log
pose_hrnet_w32	256x256	0.722	0.935	0.789	0.557	0.729	ckpt	log
pose_hrnet_w48	256x256	0.728	0.936	0.802	0.577	0.735	ckpt	log

Topdown Heatmap + Cspnext + Udp on Ap10k¶

@misc{lyu2022rtmdet,
  author        = {Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

@misc{yu2021ap10k,
  author        = {Yu, Hang and Xu, Yufei and Zhang, Jing and Zhao, Wei and Guan, Ziyu and Tao, Dacheng},
  title         = {{AP-10K}: A Benchmark for Animal Pose Estimation in the Wild},
  year          = {2021},
  eprint        = {2108.12617},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

Results on AP-10K validation set

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP^M	AP^L	ckpt	log
pose_cspnext_m	256x256	0.703	0.944	0.776	0.513	0.710	ckpt	log

Rtmpose + Rtmpose on Ap10k¶

@misc{yu2021ap10k,
  author        = {Yu, Hang and Xu, Yufei and Zhang, Jing and Zhao, Wei and Guan, Ziyu and Tao, Dacheng},
  title         = {{AP-10K}: A Benchmark for Animal Pose Estimation in the Wild},
  year          = {2021},
  eprint        = {2108.12617},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

Results on AP-10K validation set

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP^M	AP^L	ckpt	log
rtmpose-m	256x256	0.722	0.939	0.788	0.569	0.728	ckpt	log

Human-Art (CVPR’2023)¶

Topdown Heatmap + Vitpose on Humanart¶

To utilize ViTPose, you’ll need to have MMPreTrain. To install the required version, run the following command:

mim install 'mmpretrain>=1.0.0'

ViTPose (NeurIPS'2022)

@inproceedings{xu2022vitpose,
  author    = {Xu, Yufei and Zhang, Jing and Zhang, Qiming and Tao, Dacheng},
  title     = {{ViTPose}: Simple Vision Transformer Baselines for Human Pose Estimation},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2022}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

@inproceedings{ju2023humanart,
  author    = {Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
  title     = {{Human-Art}: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2023}
}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S-coco	256x192	0.228	0.371	0.229	0.298	0.467	ckpt	log
ViTPose-S-humanart-coco	256x192	0.381	0.532	0.405	0.448	0.602	ckpt	log
ViTPose-B-coco	256x192	0.270	0.423	0.272	0.340	0.510	ckpt	log
ViTPose-B-humanart-coco	256x192	0.410	0.549	0.434	0.475	0.615	ckpt	log
ViTPose-L-coco	256x192	0.342	0.498	0.357	0.413	0.577	ckpt	log
ViTPose-L-humanart-coco	256x192	0.459	0.592	0.487	0.525	0.656	ckpt	log
ViTPose-H-coco	256x192	0.377	0.541	0.391	0.447	0.615	ckpt	log
ViTPose-H-humanart-coco	256x192	0.468	0.594	0.498	0.534	0.655	ckpt	log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S-coco	256x192	0.507	0.758	0.531	0.551	0.780	ckpt	log
ViTPose-S-humanart-coco	256x192	0.738	0.905	0.802	0.768	0.911	ckpt	log
ViTPose-B-coco	256x192	0.555	0.782	0.590	0.599	0.809	ckpt	log
ViTPose-B-humanart-coco	256x192	0.759	0.905	0.823	0.790	0.917	ckpt	log
ViTPose-L-coco	256x192	0.637	0.838	0.689	0.677	0.859	ckpt	log
ViTPose-L-humanart-coco	256x192	0.789	0.916	0.845	0.819	0.929	ckpt	log
ViTPose-H-coco	256x192	0.665	0.860	0.715	0.701	0.871	ckpt	log
ViTPose-H-humanart-coco	256x192	0.800	0.926	0.855	0.828	0.933	ckpt	log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S-coco	256x192	0.739	0.903	0.816	0.792	0.942	ckpt	log
ViTPose-S-humanart-coco	256x192	0.737	0.902	0.811	0.792	0.942	ckpt	log
ViTPose-B-coco	256x192	0.757	0.905	0.829	0.810	0.946	ckpt	log
ViTPose-B-humanart-coco	256x192	0.758	0.906	0.829	0.812	0.946	ckpt	log
ViTPose-L-coco	256x192	0.782	0.914	0.850	0.834	0.952	ckpt	log
ViTPose-L-humanart-coco	256x192	0.782	0.914	0.849	0.835	0.953	ckpt	log
ViTPose-H-coco	256x192	0.788	0.917	0.855	0.839	0.954	ckpt	log
ViTPose-H-humanart-coco	256x192	0.788	0.914	0.853	0.841	0.956	ckpt	log

Topdown Heatmap + Hrnet on Humanart¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep High-Resolution Representation Learning for Human Pose Estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019}
}

@inproceedings{lin2014microsoft,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title     = {Microsoft {COCO}: Common Objects in Context},
  booktitle = {European Conference on Computer Vision},
  pages     = {740--755},
  year      = {2014},
  publisher = {Springer}
}

@inproceedings{ju2023humanart,
  author    = {Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
  title     = {{Human-Art}: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2023}
}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32-coco	256x192	0.252	0.397	0.255	0.321	0.485	ckpt	log
pose_hrnet_w32-humanart-coco	256x192	0.399	0.545	0.420	0.466	0.613	ckpt	log
pose_hrnet_w48-coco	256x192	0.271	0.413	0.277	0.339	0.499	ckpt	log
pose_hrnet_w48-humanart-coco	256x192	0.417	0.553	0.442	0.481	0.617	ckpt	log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32-coco	256x192	0.533	0.771	0.562	0.574	0.792	ckpt	log
pose_hrnet_w32-humanart-coco	256x192	0.754	0.906	0.812	0.783	0.916	ckpt	log
pose_hrnet_w48-coco	256x192	0.557	0.782	0.593	0.595	0.804	ckpt	log
pose_hrnet_w48-humanart-coco	256x192	0.769	0.906	0.825	0.796	0.919	ckpt	log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32-coco	256x192	0.749	0.906	0.821	0.804	0.945	ckpt	log
pose_hrnet_w32-humanart-coco	256x192	0.741	0.902	0.814	0.795	0.941	ckpt	log
pose_hrnet_w48-coco	256x192	0.756	0.908	0.826	0.809	0.945	ckpt	log
pose_hrnet_w48-humanart-coco	256x192	0.751	0.905	0.822	0.805	0.943	ckpt	log

Rtmpose + Rtmpose on Humanart¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  author    = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  title     = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year      = {2023},
  doi       = {10.48550/ARXIV.2303.07399},
  url       = {https://arxiv.org/abs/2303.07399},
  keywords  = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright = {Creative Commons Attribution 4.0 International}
}

@misc{lyu2022rtmdet,
  author        = {Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{lin2014microsoft,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title     = {Microsoft {COCO}: Common Objects in Context},
  booktitle = {European Conference on Computer Vision},
  pages     = {740--755},
  year      = {2014},
  publisher = {Springer}
}

@inproceedings{ju2023humanart,
  author    = {Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
  title     = {{Human-Art}: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2023}
}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-coco	256x192	0.161	0.283	0.154	0.221	0.373	ckpt	log
rtmpose-t-humanart-coco	256x192	0.249	0.395	0.256	0.323	0.485	ckpt	log
rtmpose-s-coco	256x192	0.199	0.328	0.198	0.261	0.418	ckpt	log
rtmpose-s-humanart-coco	256x192	0.311	0.462	0.323	0.381	0.540	ckpt	log
rtmpose-m-coco	256x192	0.239	0.372	0.243	0.302	0.455	ckpt	log
rtmpose-m-humanart-coco	256x192	0.355	0.503	0.377	0.417	0.568	ckpt	log
rtmpose-l-coco	256x192	0.260	0.393	0.267	0.323	0.472	ckpt	log
rtmpose-l-humanart-coco	256x192	0.378	0.521	0.399	0.442	0.584	ckpt	log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-coco	256x192	0.444	0.725	0.453	0.488	0.750	ckpt	log
rtmpose-t-humanart-coco	256x192	0.655	0.872	0.720	0.693	0.890	ckpt	log
rtmpose-s-coco	256x192	0.480	0.739	0.498	0.521	0.763	ckpt	log
rtmpose-s-humanart-coco	256x192	0.698	0.893	0.768	0.732	0.903	ckpt	log
rtmpose-m-coco	256x192	0.532	0.765	0.563	0.571	0.789	ckpt	log
rtmpose-m-humanart-coco	256x192	0.728	0.895	0.791	0.759	0.906	ckpt	log
rtmpose-l-coco	256x192	0.564	0.789	0.602	0.599	0.808	ckpt	log
rtmpose-l-humanart-coco	256x192	0.753	0.905	0.812	0.783	0.915	ckpt	log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-coco	256x192	0.682	0.883	0.759	0.736	0.920	ckpt	log
rtmpose-t-humanart-coco	256x192	0.665	0.875	0.739	0.721	0.916	ckpt	log
rtmpose-s-coco	256x192	0.716	0.892	0.789	0.768	0.929	ckpt	log
rtmpose-s-humanart-coco	256x192	0.706	0.888	0.780	0.759	0.928	ckpt	log
rtmpose-m-coco	256x192	0.746	0.899	0.817	0.795	0.935	ckpt	log
rtmpose-m-humanart-coco	256x192	0.725	0.892	0.795	0.775	0.929	ckpt	log
rtmpose-l-coco	256x192	0.758	0.906	0.826	0.806	0.942	ckpt	log
rtmpose-l-humanart-coco	256x192	0.748	0.901	0.816	0.796	0.938	ckpt	log

Results on COCO val2017 with ground-truth bounding box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-humanart-coco	256x192	0.679	0.895	0.755	0.710	0.907	ckpt	log
rtmpose-s-humanart-coco	256x192	0.725	0.916	0.798	0.753	0.925	ckpt	log
rtmpose-m-humanart-coco	256x192	0.744	0.916	0.818	0.770	0.930	ckpt	log
rtmpose-l-humanart-coco	256x192	0.770	0.927	0.840	0.794	0.939	ckpt	log

UBody (CVPR’2023)¶

Topdown Heatmap + Hrnet + Ubody-Coco-Wholebody on Ubody2d¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep High-Resolution Representation Learning for Human Pose Estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019}
}

UBody (CVPR'2023)

@inproceedings{lin2023one,
  author    = {Lin, Jing and Zeng, Ailing and Wang, Haoqian and Zhang, Lei and Li, Yu},
  title     = {One-Stage {3D} Whole-Body Mesh Recovery with Component Aware Transformer},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2023}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
pose_hrnet_w32	256x192	0.685	0.759	0.564	0.675	0.625	0.705	0.516	0.609	0.549	0.646	ckpt	log

InterHand2.6M (ECCV’2020)¶

Internet + Internet on Interhand3d¶

InterNet (ECCV'2020)

@inproceedings{Moon_2020_ECCV_InterHand2.6M,
  author    = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu},
  title     = {{InterHand2.6M}: A Dataset and Baseline for {3D} Interacting Hand Pose Estimation from a Single {RGB} Image},
  booktitle = {European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

InterHand2.6M (ECCV'2020)

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{Moon_2020_ECCV_InterHand2.6M,
  author    = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu},
  title     = {{InterHand2.6M}: A Dataset and Baseline for {3D} Interacting Hand Pose Estimation from a Single {RGB} Image},
  booktitle = {European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on InterHand2.6M val & test set

Train Set	Set	Arch	Input Size	MPJPE-single	MPJPE-interacting	MPJPE-all	MRRPE	APh	ckpt	log
All	test(H+M)	InterNet_resnet_50	256x256	9.69	13.72	11.86	29.27	0.99	ckpt	log
All	val(M)	InterNet_resnet_50	256x256	11.30	15.57	13.36	32.15	0.98	ckpt	log
All	test(H+M)	InterNet_resnet_50*	256x256	9.47	13.40	11.59	29.28	0.99	ckpt	log
All	val(M)	InterNet_resnet_50*	256x256	11.22	15.23	13.16	31.73	0.98	ckpt	log

Models with * are trained in MMPose 0.x. The checkpoints and logs are only for validation.

JHMDB (ICCV’2013)¶

Rtmo + Rtmo on Body7¶

@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  year          = {2023},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A Large-Scale Dataset for Going Deeper in Image Understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

@inproceedings{lin2014microsoft,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title     = {Microsoft {COCO}: Common Objects in Context},
  booktitle = {European Conference on Computer Vision},
  pages     = {740--755},
  year      = {2014},
  publisher = {Springer}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

@inproceedings{Jhuang:ICCV:2013,
  author    = {Jhuang, H. and Gall, J. and Zuffi, S. and Schmid, C. and Black, M. J.},
  title     = {Towards Understanding Action Recognition},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month     = dec,
  pages     = {3192--3199},
  year      = {2013}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2014}
}

@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A Benchmark for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}

@inproceedings{li2020pastanet,
  author    = {Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  title     = {{PaStaNet}: Toward Human Activity Knowledge Engine},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2020}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log	onnx
RTMO-t	640x640	0.574	0.803	0.613	0.611	0.836	ckpt	log	onnx
RTMO-s	640x640	0.686	0.879	0.744	0.723	0.908	ckpt	log	onnx
RTMO-m	640x640	0.726	0.899	0.790	0.763	0.926	ckpt	log	onnx
RTMO-l	640x640	0.748	0.911	0.813	0.786	0.939	ckpt	log	onnx

Topdown Heatmap + CPM on JHMDB¶

CPM (CVPR'2016)

@inproceedings{wei2016convolutional,
  author    = {Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  title     = {Convolutional Pose Machines},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {4724--4732},
  year      = {2016}
}

@inproceedings{Jhuang:ICCV:2013,
  author    = {Jhuang, H. and Gall, J. and Zuffi, S. and Schmid, C. and Black, M. J.},
  title     = {Towards Understanding Action Recognition},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month     = dec,
  pages     = {3192--3199},
  year      = {2013}
}

Results on Sub-JHMDB dataset

The models are pre-trained on the MPII dataset only. No test-time augmentation (multi-scale / rotation testing) is used.

Normalized by Person Size

Split	Arch	Input Size	Head	Sho	Elb	Wri	Hip	Knee	Ank	Mean	ckpt	log
Sub1	cpm	368x368	96.1	91.9	81.0	78.9	96.6	90.8	87.3	89.5	ckpt	log
Sub2	cpm	368x368	98.1	93.6	77.1	70.9	94.0	89.1	84.7	87.4	ckpt	log
Sub3	cpm	368x368	97.9	94.9	87.3	84.0	98.6	94.4	86.2	92.4	ckpt	log
Average	cpm	368x368	97.4	93.5	81.5	77.9	96.4	91.4	86.1	89.8	-	-

Normalized by Torso Size

Split	Arch	Input Size	Head	Sho	Elb	Wri	Hip	Knee	Ank	Mean	ckpt	log
Sub1	cpm	368x368	89.0	63.0	54.0	54.9	68.2	63.1	61.2	66.0	ckpt	log
Sub2	cpm	368x368	90.3	57.9	46.8	44.3	60.8	58.2	62.4	61.1	ckpt	log
Sub3	cpm	368x368	91.0	72.6	59.9	54.0	73.2	68.5	65.8	70.3	ckpt	log
Average	cpm	368x368	90.1	64.5	53.6	51.1	67.4	63.3	63.1	65.7	-	-

Topdown Heatmap + Resnet on JHMDB¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

% JHMDB reference: "Towards understanding action recognition" (ICCV 2013).
@inproceedings{Jhuang:ICCV:2013,
  author    = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  title     = {Towards understanding action recognition},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  pages     = {3192--3199},
  month     = dec,
  year      = {2013}
}

Results on Sub-JHMDB dataset

The models are pre-trained on the MPII dataset only. No test-time augmentation (multi-scale/rotation testing) is used.

Normalized by Person Size

Split	Arch	Input Size	Head	Sho	Elb	Wri	Hip	Knee	Ank	Mean	ckpt	log
Sub1	pose_resnet_50	256x256	99.1	98.0	93.8	91.3	99.4	96.5	92.8	96.1	ckpt	log
Sub2	pose_resnet_50	256x256	99.3	97.1	90.6	87.0	98.9	96.3	94.1	95.0	ckpt	log
Sub3	pose_resnet_50	256x256	99.0	97.9	94.0	91.6	99.7	98.0	94.7	96.7	ckpt	log
Average	pose_resnet_50	256x256	99.2	97.7	92.8	90.0	99.3	96.9	93.9	96.0	-	-
Sub1	pose_resnet_50 (2 Deconv.)	256x256	99.1	98.5	94.6	92.0	99.4	94.6	92.5	96.1	ckpt	log
Sub2	pose_resnet_50 (2 Deconv.)	256x256	99.3	97.8	91.0	87.0	99.1	96.5	93.8	95.2	ckpt	log
Sub3	pose_resnet_50 (2 Deconv.)	256x256	98.8	98.4	94.3	92.1	99.8	97.5	93.8	96.7	ckpt	log
Average	pose_resnet_50 (2 Deconv.)	256x256	99.1	98.2	93.3	90.4	99.4	96.2	93.4	96.0	-	-

Normalized by Torso Size

Split	Arch	Input Size	Head	Sho	Elb	Wri	Hip	Knee	Ank	Mean	ckpt	log
Sub1	pose_resnet_50	256x256	93.3	83.2	74.4	72.7	85.0	81.2	78.9	81.9	ckpt	log
Sub2	pose_resnet_50	256x256	94.1	74.9	64.5	62.5	77.9	71.9	78.6	75.5	ckpt	log
Sub3	pose_resnet_50	256x256	97.0	82.2	74.9	70.7	84.7	83.7	84.2	82.9	ckpt	log
Average	pose_resnet_50	256x256	94.8	80.1	71.3	68.6	82.5	78.9	80.6	80.1	-	-
Sub1	pose_resnet_50 (2 Deconv.)	256x256	92.4	80.6	73.2	70.5	82.3	75.4	75.0	79.2	ckpt	log
Sub2	pose_resnet_50 (2 Deconv.)	256x256	93.4	73.6	63.8	60.5	75.1	68.4	75.5	73.7	ckpt	log
Sub3	pose_resnet_50 (2 Deconv.)	256x256	96.1	81.2	72.6	67.9	83.6	80.9	81.5	81.2	ckpt	log
Average	pose_resnet_50 (2 Deconv.)	256x256	94.0	78.5	69.9	66.3	80.3	74.9	77.3	78.0	-	-

DeepFashion (CVPR’2016)¶

Topdown Heatmap + Resnet on Deepfashion¶

% Simple Baselines reference (ECCV 2018), cited for the ResNet top-down heatmap models.
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

% Deep residual learning reference, cited for the ResNet backbones.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

DeepFashion (CVPR'2016)

% DeepFashion reference (CVPR 2016).
@inproceedings{liuLQWTcvpr16DeepFashion,
  author    = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou},
  title     = {{DeepFashion}: Powering Robust Clothes Recognition and Retrieval with Rich Annotations},
  booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2016}
}

DeepFashion (ECCV'2016)

% Fashion Landmark Detection reference (ECCV 2016).
@inproceedings{liuYLWTeccv16FashionLandmark,
  author    = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou},
  title     = {Fashion Landmark Detection in the Wild},
  booktitle = {European Conference on Computer Vision (ECCV)},
  month     = oct,
  year      = {2016}
}

Results on DeepFashion val set

Set	Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
upper	pose_resnet_50	256x192	95.4	57.8	16.8	ckpt	log
lower	pose_resnet_50	256x192	96.5	74.4	10.5	ckpt	log
full	pose_resnet_50	256x192	97.7	66.4	12.7	ckpt	log

Note: Due to time constraints, we have only trained resnet50 models. We warmly welcome any contributions if you can successfully reproduce the results from the paper!

Topdown Heatmap + Hrnet on Deepfashion¶

% Deep high-resolution representation learning reference, cited for the HRNet models.
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}

% Unbiased Data Processing reference (CVPR 2020), cited for the *_udp model.
@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

DeepFashion (CVPR'2016)

% DeepFashion reference (CVPR 2016).
@inproceedings{liuLQWTcvpr16DeepFashion,
  author    = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou},
  title     = {{DeepFashion}: Powering Robust Clothes Recognition and Retrieval with Rich Annotations},
  booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2016}
}

DeepFashion (ECCV'2016)

% Fashion Landmark Detection reference (ECCV 2016).
@inproceedings{liuYLWTeccv16FashionLandmark,
  author    = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou},
  title     = {Fashion Landmark Detection in the Wild},
  booktitle = {European Conference on Computer Vision (ECCV)},
  month     = oct,
  year      = {2016}
}

Results on DeepFashion val set

Set	Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
upper	pose_hrnet_w48_udp	256x192	96.1	60.9	15.1	ckpt	log
lower	pose_hrnet_w48_udp	256x192	97.8	76.1	8.9	ckpt	log
full	pose_hrnet_w48_udp	256x192	98.3	67.3	11.7	ckpt	log

Note: Due to time constraints, we have only trained the pose_hrnet_w48_udp model listed above. We warmly welcome any contributions if you can successfully reproduce the results from the paper!

WFLW (CVPR’2018)¶

Topdown Regression + Resnet + Softwingloss on WFLW¶

% DeepPose reference, cited for the top-down regression models.
@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

% Deep residual learning reference, cited for the ResNet backbones.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

SoftWingloss (TIP'2021)

% SoftWingLoss reference (IEEE Transactions on Image Processing, 2021).
@article{lin2021structure,
  author    = {Lin, Chunze and Zhu, Beier and Wang, Quan and Liao, Renjie and Qian, Chen and Lu, Jiwen and Zhou, Jie},
  title     = {Structure-Coherent Deep Feature Learning for Robust Face Alignment},
  journal   = {IEEE Transactions on Image Processing},
  publisher = {IEEE},
  year      = {2021}
}

% "Look at boundary" reference (CVPR 2018), cited alongside the WFLW results.
@inproceedings{wu2018look,
  author    = {Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  title     = {Look at boundary: A boundary-aware face alignment algorithm},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2129--2138},
  year      = {2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Model	Input Size	NME	ckpt	log
ResNet-50+SoftWingLoss	256x256	4.44	ckpt	log

Topdown Regression + Resnet on WFLW¶

% DeepPose reference, cited for the top-down regression models.
@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

% Deep residual learning reference, cited for the ResNet backbones.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

% "Look at boundary" reference (CVPR 2018), cited alongside the WFLW results.
@inproceedings{wu2018look,
  author    = {Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  title     = {Look at boundary: A boundary-aware face alignment algorithm},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2129--2138},
  year      = {2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Model	Input Size	NME	ckpt	log
ResNet-50	256x256	4.88	ckpt	log

Topdown Regression + Resnet + Wingloss on WFLW¶

% DeepPose reference, cited for the top-down regression models.
@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

% Deep residual learning reference, cited for the ResNet backbones.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

Wingloss (CVPR'2018)

% Wing Loss reference (CVPR 2018).
@inproceedings{feng2018wing,
  author       = {Feng, Zhen-Hua and Kittler, Josef and Awais, Muhammad and Huber, Patrik and Wu, Xiao-Jun},
  title        = {Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural Networks},
  booktitle    = {Computer Vision and Pattern Recognition (CVPR), 2018 IEEE Conference on},
  pages        = {2235--2245},
  year         = {2018},
  organization = {IEEE}
}

% "Look at boundary" reference (CVPR 2018), cited alongside the WFLW results.
@inproceedings{wu2018look,
  author    = {Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  title     = {Look at boundary: A boundary-aware face alignment algorithm},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2129--2138},
  year      = {2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Model	Input Size	NME	ckpt	log
ResNet-50+WingLoss	256x256	4.67	ckpt	log

Topdown Heatmap + Hrnetv2 + Awing on WFLW¶

AdaptiveWingloss (ICCV'2019)

% Deep high-resolution representation learning for visual recognition, cited for the HRNetv2 models.
@article{WangSCJDZLMTWLX19,
  author  = {Wang, Jingdong and Sun, Ke and Cheng, Tianheng and
             Jiang, Borui and Deng, Chaorui and Zhao, Yang and Liu, Dong and Mu, Yadong and
             Tan, Mingkui and Wang, Xinggang and Liu, Wenyu and Xiao, Bin},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

% Adaptive Wing Loss reference (ICCV 2019).
@inproceedings{wang2019adaptive,
  author    = {Wang, Xinyao and Bo, Liefeng and Fuxin, Li},
  title     = {Adaptive wing loss for robust face alignment via heatmap regression},
  booktitle = {Proceedings of the IEEE/CVF international conference on computer vision},
  pages     = {6971--6981},
  year      = {2019}
}

% "Look at boundary" reference (CVPR 2018), cited alongside the WFLW results.
@inproceedings{wu2018look,
  author    = {Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  title     = {Look at boundary: A boundary-aware face alignment algorithm},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2129--2138},
  year      = {2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Arch	Input Size	NME_test	NME_pose	NME_illumination	NME_occlusion	NME_blur	NME_makeup	NME_expression	ckpt	log
pose_hrnetv2_w18_awing	256x256	4.02	6.94	3.97	4.78	4.59	3.87	4.28	ckpt	log

Topdown Heatmap + Hrnetv2 on WFLW¶

% Deep high-resolution representation learning for visual recognition, cited for the HRNetv2 models.
@article{WangSCJDZLMTWLX19,
  author  = {Wang, Jingdong and Sun, Ke and Cheng, Tianheng and
             Jiang, Borui and Deng, Chaorui and Zhao, Yang and Liu, Dong and Mu, Yadong and
             Tan, Mingkui and Wang, Xinggang and Liu, Wenyu and Xiao, Bin},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

% "Look at boundary" reference (CVPR 2018), cited alongside the WFLW results.
@inproceedings{wu2018look,
  author    = {Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  title     = {Look at boundary: A boundary-aware face alignment algorithm},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2129--2138},
  year      = {2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Arch	Input Size	NME_test	NME_pose	NME_illumination	NME_occlusion	NME_blur	NME_makeup	NME_expression	ckpt	log
pose_hrnetv2_w18	256x256	4.06	6.97	3.99	4.83	4.58	3.94	4.33	ckpt	log

Topdown Heatmap + Hrnetv2 + Dark on WFLW¶

% Deep high-resolution representation learning for visual recognition, cited for the HRNetv2 models.
@article{WangSCJDZLMTWLX19,
  author  = {Wang, Jingdong and Sun, Ke and Cheng, Tianheng and
             Jiang, Borui and Deng, Chaorui and Zhao, Yang and Liu, Dong and Mu, Yadong and
             Tan, Mingkui and Wang, Xinggang and Liu, Wenyu and Xiao, Bin},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

% Distribution-aware coordinate representation reference (CVPR 2020), cited for the *_dark model.
@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {7093--7102},
  year      = {2020}
}

% "Look at boundary" reference (CVPR 2018), cited alongside the WFLW results.
@inproceedings{wu2018look,
  author    = {Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  title     = {Look at boundary: A boundary-aware face alignment algorithm},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2129--2138},
  year      = {2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Arch	Input Size	NME_test	NME_pose	NME_illumination	NME_occlusion	NME_blur	NME_makeup	NME_expression	ckpt	log
pose_hrnetv2_w18_dark	256x256	3.98	6.98	3.96	4.78	4.56	3.89	4.29	ckpt	log

Rtmpose + Rtmpose on WFLW¶

% RTMDet reference (arXiv 2212.07784).
@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

% "Look at boundary" reference (CVPR 2018), cited alongside the WFLW results.
@inproceedings{wu2018look,
  author    = {Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  title     = {Look at boundary: A boundary-aware face alignment algorithm},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2129--2138},
  year      = {2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Arch	Input Size	NME	ckpt	log
pose_rtmpose_m	256x256	4.01	ckpt	log

AI Challenger (ArXiv’2017)¶

Topdown Heatmap + Hrnet + Aic on Coco¶

% Deep high-resolution representation learning reference, cited for the HRNet models.
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}

% Microsoft COCO dataset reference (ECCV 2014).
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

% AI Challenger dataset reference (arXiv 1711.06475).
@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

MMPose supports training models with combined datasets. coco-aic-merge and coco-aic-combine are two examples.

coco-aic-merge leverages AIC data with partial keypoints as auxiliary data to train a COCO model
coco-aic-combine constructs a combined dataset whose keypoints are the union of COCO and AIC keypoints to train a model that predicts keypoints of both datasets.

Evaluation results on COCO val2017 for models trained on the COCO dataset alone and on the combined datasets are shown below. These models are evaluated with a detector that has a human AP of 56.4 on the COCO val2017 dataset.

Train Set	Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
coco	pose_hrnet_w32	256x192	0.749	0.906	0.821	0.804	0.945	ckpt	log
coco-aic-merge	pose_hrnet_w32	256x192	0.756	0.907	0.828	0.809	0.944	ckpt	log
coco-aic-combine	pose_hrnet_w32	256x192	0.755	0.904	0.825	0.807	0.942	ckpt	log

Rtmo + Rtmo on Body7¶

% RTMO reference (arXiv 2312.07526).
@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2023}
}

% AI Challenger dataset reference (arXiv 1711.06475).
@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

% Microsoft COCO dataset reference (ECCV 2014).
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

% CrowdPose reference (arXiv 1812.00324).
@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

% JHMDB reference: "Towards understanding action recognition" (ICCV 2013).
@inproceedings{Jhuang:ICCV:2013,
  author    = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  title     = {Towards understanding action recognition},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  pages     = {3192--3199},
  month     = dec,
  year      = {2013}
}

% "2D Human Pose Estimation: New Benchmark and State of the Art Analysis" reference (CVPR 2014).
@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2014}
}

% PoseTrack benchmark reference (CVPR 2018).
@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A benchmark for human pose estimation and tracking},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}

% PaStaNet reference (CVPR 2020).
@inproceedings{li2020pastanet,
  author    = {Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  title     = {{PaStaNet}: Toward Human Activity Knowledge Engine},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log	onnx
RTMO-t	640x640	0.574	0.803	0.613	0.611	0.836	ckpt	log	onnx
RTMO-s	640x640	0.686	0.879	0.744	0.723	0.908	ckpt	log	onnx
RTMO-m	640x640	0.726	0.899	0.790	0.763	0.926	ckpt	log	onnx
RTMO-l	640x640	0.748	0.911	0.813	0.786	0.939	ckpt	log	onnx

Topdown Heatmap + Resnet on Aic¶

% Simple Baselines reference (ECCV 2018), cited for the ResNet top-down heatmap models.
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

% Deep residual learning reference, cited for the ResNet backbones.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

% AI Challenger dataset reference (arXiv 1711.06475).
@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

Results on AIC val set with ground-truth bounding boxes

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnet_101	256x192	0.294	0.736	0.172	0.337	0.762	ckpt	log

Topdown Heatmap + Hrnet on Aic¶

% Deep high-resolution representation learning reference, cited for the HRNet models.
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}

% AI Challenger dataset reference (arXiv 1711.06475).
@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

Results on AIC val set with ground-truth bounding boxes

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32	256x192	0.323	0.761	0.218	0.366	0.789	ckpt	log

PoseTrack18 (CVPR’2018)¶

Rtmo + Rtmo on Body7¶

% RTMO reference (arXiv 2312.07526).
@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2023}
}

% AI Challenger dataset reference (arXiv 1711.06475).
@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

% Microsoft COCO dataset reference (ECCV 2014).
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

% CrowdPose reference (arXiv 1812.00324).
@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

% JHMDB reference: "Towards understanding action recognition" (ICCV 2013).
@inproceedings{Jhuang:ICCV:2013,
  author    = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  title     = {Towards understanding action recognition},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  pages     = {3192--3199},
  month     = dec,
  year      = {2013}
}

% "2D Human Pose Estimation: New Benchmark and State of the Art Analysis" reference (CVPR 2014).
@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2014}
}

% PoseTrack benchmark reference (CVPR 2018).
@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A benchmark for human pose estimation and tracking},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}

% PaStaNet reference (CVPR 2020).
@inproceedings{li2020pastanet,
  author    = {Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  title     = {{PaStaNet}: Toward Human Activity Knowledge Engine},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log	onnx
RTMO-t	640x640	0.574	0.803	0.613	0.611	0.836	ckpt	log	onnx
RTMO-s	640x640	0.686	0.879	0.744	0.723	0.908	ckpt	log	onnx
RTMO-m	640x640	0.726	0.899	0.790	0.763	0.926	ckpt	log	onnx
RTMO-l	640x640	0.748	0.911	0.813	0.786	0.939	ckpt	log	onnx

Topdown Heatmap + Hrnet on Posetrack18¶

% Deep high-resolution representation learning reference, cited for the HRNet models.
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}

% PoseTrack benchmark reference (CVPR 2018).
@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A benchmark for human pose estimation and tracking},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}

Results on PoseTrack2018 val with ground-truth bounding boxes

Arch	Input Size	Head	Shou	Elb	Wri	Hip	Knee	Ankl	Total	ckpt	log
pose_hrnet_w32	256x192	86.2	89.0	84.5	79.2	82.3	82.5	78.7	83.4	ckpt	log
pose_hrnet_w32	384x288	87.1	89.0	85.1	80.2	80.6	82.8	79.6	83.7	ckpt	log
pose_hrnet_w48	256x192	88.3	90.2	86.0	81.0	80.7	83.3	80.6	84.6	ckpt	log
pose_hrnet_w48	384x288	87.8	90.0	86.2	81.3	81.0	83.4	80.9	84.6	ckpt	log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.

Results on PoseTrack2018 val with MMDetection pre-trained Cascade R-CNN (X-101-64x4d-FPN) human detector

Arch	Input Size	Head	Shou	Elb	Wri	Hip	Knee	Ankl	Total	ckpt	log
pose_hrnet_w32	256x192	78.0	82.9	79.5	73.8	76.9	76.6	70.2	76.9	ckpt	log
pose_hrnet_w32	384x288	79.9	83.6	80.4	74.5	74.8	76.1	70.5	77.3	ckpt	log
pose_hrnet_w48	256x192	80.1	83.4	80.6	74.8	74.3	76.8	70.5	77.4	ckpt	log
pose_hrnet_w48	384x288	80.2	83.8	80.9	75.2	74.7	76.7	71.7	77.8	ckpt	log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.

Topdown Heatmap + Resnet on Posetrack18¶

% Simple Baselines reference (ECCV 2018), cited for the ResNet top-down heatmap models.
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

% Deep residual learning reference, cited for the ResNet backbones.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

PoseTrack18 (CVPR'2018)

% PoseTrack benchmark reference (CVPR 2018).
@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A benchmark for human pose estimation and tracking},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}

Results on PoseTrack2018 val with ground-truth bounding boxes

Arch	Input Size	Head	Shou	Elb	Wri	Hip	Knee	Ankl	Total	ckpt	log
pose_resnet_50	256x192	86.5	87.7	82.5	75.8	80.1	78.8	74.2	81.2	ckpt	log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.

Human3.6M (TPAMI’2014)¶

Image Pose Lift + Simplebaseline3d on H36m¶

% SimpleBaseline3D reference (ICCV 2017).
@inproceedings{martinez_2017_3dbaseline,
  author    = {Martinez, Julieta and Hossain, Rayat and Romero, Javier and Little, James J.},
  title     = {A simple yet effective baseline for {3D} human pose estimation},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
  year      = {2017}
}

Human3.6M (TPAMI'2014)

% Human3.6M dataset reference (TPAMI 2014).
@article{h36m_pami,
  author    = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
  title     = {{Human3.6M}: Large Scale Datasets and Predictive Methods for {3D} Human Sensing in Natural Environments},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  publisher = {IEEE Computer Society},
  volume    = {36},
  number    = {7},
  pages     = {1325--1339},
  month     = jul,
  year      = {2014}
}

Results on Human3.6M dataset with ground truth 2D detections

Arch	MPJPE	P-MPJPE	ckpt	log
SimpleBaseline3D¹	43.4	34.3	ckpt	log

¹ Differing from the original paper, we didn’t apply the max-norm constraint because we found this led to better convergence and performance.

Motionbert + Motionbert on H36m¶

MotionBERT (2022)

 % MotionBERT reference (2022).
 @misc{Zhu_Ma_Liu_Liu_Wu_Wang_2022,
   author   = {Zhu, Wentao and Ma, Xiaoxuan and Liu, Zhaoyang and Liu, Libin and Wu, Wayne and Wang, Yizhou},
   title    = {Learning Human Motion Representations: A Unified Perspective},
   month    = oct,
   year     = {2022},
   language = {en-US}
 }

Human3.6M (TPAMI'2014)

% Human3.6M dataset reference (TPAMI 2014).
@article{h36m_pami,
  author    = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
  title     = {{Human3.6M}: Large Scale Datasets and Predictive Methods for {3D} Human Sensing in Natural Environments},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  publisher = {IEEE Computer Society},
  volume    = {36},
  number    = {7},
  pages     = {1325--1339},
  month     = jul,
  year      = {2014}
}

Results on Human3.6M dataset with ground truth 2D detections

Arch	MPJPE	average MPJPE	P-MPJPE	ckpt
MotionBERT*	34.5	34.6	27.1	ckpt
MotionBERT-finetuned*	26.9	26.8	21.0	ckpt

Results on Human3.6M dataset converted from the official repo¹ with ground truth 2D detections

Arch	MPJPE	average MPJPE	P-MPJPE	ckpt	log
MotionBERT*	39.8	39.2	33.4	ckpt	/
MotionBERT-finetuned*	37.7	37.2	32.2	ckpt	/

¹ By default, we test models with the Human3.6M dataset processed by MMPose. The official repo’s dataset includes more data and applies a different pre-processing technique. To reproduce the official repo’s results, please download the test annotation file, train annotation file and factors, place them under $MMPOSE/data/h36m/annotation_body3d/fps50, and test with the configs we provide.

Models with * are converted from the official repo. The config files of these models are only for validation. We do not guarantee the training accuracy of these config files, and we welcome contributions of your reproduction results.

Video Pose Lift + Videopose3d on H36m¶

VideoPose3D (CVPR'2019)

% VideoPose3D reference (CVPR 2019).
@inproceedings{pavllo20193d,
  author    = {Pavllo, Dario and Feichtenhofer, Christoph and Grangier, David and Auli, Michael},
  title     = {{3D} human pose estimation in video with temporal convolutions and semi-supervised training},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {7753--7762},
  year      = {2019}
}

Human3.6M (TPAMI'2014)

% Human3.6M dataset reference (TPAMI 2014).
@article{h36m_pami,
  author    = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
  title     = {{Human3.6M}: Large Scale Datasets and Predictive Methods for {3D} Human Sensing in Natural Environments},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  publisher = {IEEE Computer Society},
  volume    = {36},
  number    = {7},
  pages     = {1325--1339},
  month     = jul,
  year      = {2014}
}

Testing results on Human3.6M dataset with ground truth 2D detections, supervised training

Arch	Receptive Field	MPJPE	P-MPJPE	ckpt	log
VideoPose3D-supervised-27frm	27	40.1	30.1	ckpt	log
VideoPose3D-supervised-81frm	81	39.1	29.3	ckpt	log
VideoPose3D-supervised-243frm	243	37.6	28.3	ckpt	log

Testing results on Human3.6M dataset with CPN 2D detections¹, supervised training

Arch	Receptive Field	MPJPE	P-MPJPE	ckpt	log
VideoPose3D-supervised-CPN-1frm	1	53.0	41.3	ckpt	log
VideoPose3D-supervised-CPN-243frm	243	47.9	38.0	ckpt	log

Testing results on Human3.6M dataset with ground truth 2D detections, semi-supervised training

Training Data	Arch	Receptive Field	MPJPE	P-MPJPE	N-MPJPE	ckpt	log
10% S1	VideoPose3D-semi-supervised-27frm	27	57.2	42.4	54.2	ckpt	log

Testing results on Human3.6M dataset with CPN 2D detections¹, semi-supervised training

Training Data	Arch	Receptive Field	MPJPE	P-MPJPE	N-MPJPE	ckpt	log
10% S1	VideoPose3D-semi-supervised-CPN-27frm	27	67.3	50.4	63.6	ckpt	log

¹ CPN 2D detections are provided by the official repo. The reformatted version used in this repository can be downloaded from train_detection and test_detection.

300WLP (IEEE’2017)¶

Topdown Heatmap + Hrnetv2 on 300wlp¶

% Deep high-resolution representation learning for visual recognition, cited for the HRNetv2 models.
@article{WangSCJDZLMTWLX19,
  author  = {Wang, Jingdong and Sun, Ke and Cheng, Tianheng and
             Jiang, Borui and Deng, Chaorui and Zhao, Yang and Liu, Dong and Mu, Yadong and
             Tan, Mingkui and Wang, Xinggang and Liu, Wenyu and Xiao, Bin},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

300WLP (IEEE'2017)

@article{zhu2017face,
  author    = {Zhu, Xiangyu and Liu, Xiaoming and Lei, Zhen and Li, Stan Z},
  title     = {Face Alignment in Full Pose Range: A {3D} Total Solution},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  publisher = {IEEE},
  year      = {2017}
}

Results on 300W-LP dataset

The model is trained on 300W-LP train.

Arch	Input Size	NME_full	NME_test	ckpt	log
pose_hrnetv2_w18	256x256	0.0413	0.04125	ckpt	log

Grévy’s Zebra (Elife’2019)¶

Topdown Heatmap + Resnet on Zebra¶

Grévy’s Zebra (Elife'2019)

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@article{graving2019deepposekit,
  author    = {Graving, Jacob M and Chae, Daniel and Naik, Hemal and Li, Liang and Koger, Benjamin and Costelloe, Blair R and Couzin, Iain D},
  title     = {{DeepPoseKit}, a software toolkit for fast and robust animal pose estimation using deep learning},
  journal   = {eLife},
  volume    = {8},
  pages     = {e47994},
  publisher = {eLife Sciences Publications Limited},
  year      = {2019}
}

Results on Grévy’s Zebra test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_resnet_50	160x160	1.000	0.914	1.87	ckpt	log
pose_resnet_101	160x160	1.000	0.915	1.83	ckpt	log
pose_resnet_152	160x160	1.000	0.921	1.67	ckpt	log

CrowdPose (CVPR’2019)¶

Rtmo + Rtmo on Crowdpose¶

@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2023}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP (E)	AP (M)	AP (H)	ckpt	log
RTMO-s	640x640	0.673	0.882	0.729	0.737	0.682	0.591	ckpt	log
RTMO-m	640x640	0.711	0.897	0.771	0.774	0.719	0.634	ckpt	log
RTMO-l	640x640	0.732	0.907	0.793	0.792	0.741	0.653	ckpt	log
RTMO-l*	640x640	0.838	0.947	0.893	0.888	0.847	0.772	ckpt	log

* indicates the model is trained using a combined dataset composed of AI Challenger, COCO, CrowdPose, Halpe, MPII, PoseTrack18 and sub-JHMDB.

Topdown Heatmap + Resnet on Crowdpose¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP (E)	AP (M)	AP (H)	ckpt	log
pose_resnet_50	256x192	0.637	0.808	0.692	0.738	0.650	0.506	ckpt	log
pose_resnet_101	256x192	0.647	0.810	0.703	0.745	0.658	0.521	ckpt	log
pose_resnet_101	320x256	0.661	0.821	0.714	0.759	0.672	0.534	ckpt	log
pose_resnet_152	256x192	0.656	0.818	0.712	0.754	0.666	0.533	ckpt	log

Topdown Heatmap + Hrnet on Crowdpose¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP (E)	AP (M)	AP (H)	ckpt	log
pose_hrnet_w32	256x192	0.675	0.825	0.729	0.770	0.687	0.553	ckpt	log

Topdown Heatmap + Cspnext + Udp on Crowdpose¶

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@InProceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP (E)	AP (M)	AP (H)	ckpt	log
pose_cspnext_m	256x192	0.662	0.821	0.723	0.759	0.675	0.539	ckpt	log

Dekr + Hrnet on Crowdpose¶

DEKR (CVPR'2021)

@inproceedings{geng2021bottom,
  author    = {Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
  title     = {Bottom-up human pose estimation via disentangled keypoint regression},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2021},
  pages     = {14676--14686}
}

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test without multi-scale test

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP (E)	AP (M)	AP (H)	ckpt	log
HRNet-w32	512x512	0.663	0.857	0.714	0.740	0.671	0.576	ckpt	log
HRNet-w48	640x640	0.679	0.869	0.731	0.753	0.688	0.593	ckpt	log

Rtmpose + Rtmpose on Crowdpose¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  author    = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  title     = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year      = {2023},
  doi       = {10.48550/ARXIV.2303.07399},
  url       = {https://arxiv.org/abs/2303.07399},
  keywords  = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright = {Creative Commons Attribution 4.0 International}
}

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AP (E)	AP (M)	AP (H)	ckpt	log
rtmpose-m	256x192	0.706	0.841	0.765	0.799	0.719	0.582	ckpt	log

Rtmo + Rtmo on Body7¶

@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2023}
}

@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A Large-Scale Dataset for Going Deeper in Image Understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

@inproceedings{Jhuang:ICCV:2013,
  author    = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  title     = {Towards understanding action recognition},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  pages     = {3192--3199},
  month     = dec,
  year      = {2013}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2014}
}

@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A Benchmark for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}

@inproceedings{li2020pastanet,
  author    = {Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  title     = {{PaStaNet}: Toward Human Activity Knowledge Engine},
  booktitle = {CVPR},
  year      = {2020}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log	onnx
RTMO-t	640x640	0.574	0.803	0.613	0.611	0.836	ckpt	log	onnx
RTMO-s	640x640	0.686	0.879	0.744	0.723	0.908	ckpt	log	onnx
RTMO-m	640x640	0.726	0.899	0.790	0.763	0.926	ckpt	log	onnx
RTMO-l	640x640	0.748	0.911	0.813	0.786	0.939	ckpt	log	onnx

Halpe (CVPR’2020)¶

Rtmo + Rtmo on Body7¶

@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2023}
}

@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A Large-Scale Dataset for Going Deeper in Image Understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

@inproceedings{Jhuang:ICCV:2013,
  author    = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  title     = {Towards understanding action recognition},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  pages     = {3192--3199},
  month     = dec,
  year      = {2013}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2014}
}

@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A Benchmark for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}

@inproceedings{li2020pastanet,
  author    = {Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  title     = {{PaStaNet}: Toward Human Activity Knowledge Engine},
  booktitle = {CVPR},
  year      = {2020}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log	onnx
RTMO-t	640x640	0.574	0.803	0.613	0.611	0.836	ckpt	log	onnx
RTMO-s	640x640	0.686	0.879	0.744	0.723	0.908	ckpt	log	onnx
RTMO-m	640x640	0.726	0.899	0.790	0.763	0.926	ckpt	log	onnx
RTMO-l	640x640	0.748	0.911	0.813	0.786	0.939	ckpt	log	onnx

COCO-WholeBody-Face (ECCV’2020)¶

Topdown Heatmap + Mobilenetv2 + Coco + Wholebody + Face on Coco_wholebody_face¶

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted Residuals and Linear Bottlenecks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {4510--4520},
  year      = {2018}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody-Face val set

Arch	Input Size	NME	ckpt	log
pose_mobilenetv2	256x256	0.0611	ckpt	log

Topdown Heatmap + Hourglass + Coco + Wholebody + Face on Coco_wholebody_face¶

@inproceedings{newell2016stacked,
  author       = {Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  title        = {Stacked hourglass networks for human pose estimation},
  booktitle    = {European conference on computer vision},
  organization = {Springer},
  year         = {2016},
  pages        = {483--499}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody-Face val set

Arch	Input Size	NME	ckpt	log
pose_hourglass_52	256x256	0.0587	ckpt	log

Topdown Heatmap + Hrnetv2 + Coco + Wholebody + Face on Coco_wholebody_face¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and
             Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
             Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody-Face val set

Arch	Input Size	NME	ckpt	log
pose_hrnetv2_w18	256x256	0.0569	ckpt	log

Topdown Heatmap + Scnet + Coco + Wholebody + Face on Coco_wholebody_face¶

@inproceedings{liu2020improving,
  author    = {Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  title     = {Improving Convolutional Networks with Self-Calibrated Convolutions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {10096--10105}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody-Face val set

Arch	Input Size	NME	ckpt	log
pose_scnet_50	256x256	0.0567	ckpt	log

Topdown Heatmap + Hrnetv2 + Dark + Coco + Wholebody + Face on Coco_wholebody_face¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and
             Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
             Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {7093--7102}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody-Face val set

Arch	Input Size	NME	ckpt	log
pose_hrnetv2_w18_dark	256x256	0.0513	ckpt	log

Topdown Heatmap + Resnet + Coco + Wholebody + Face on Coco_wholebody_face¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody-Face val set

Arch	Input Size	NME	ckpt	log
pose_res50	256x256	0.0582	ckpt	log

Rtmpose + Rtmpose + Coco + Wholebody + Face on Coco_wholebody_face¶

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody-Face val set

Arch	Input Size	NME	ckpt	log
pose_rtmpose_m	256x256	0.0466	ckpt	log

COCO-WholeBody (ECCV’2020)¶

Topdown Heatmap + Vipnas + Dark on Coco-Wholebody¶

@inproceedings{xu2021vipnas,
  author    = {Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
  title     = {{ViPNAS}: Efficient Video Pose Estimation via Neural Architecture Search},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2021}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {7093--7102}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
S-ViPNAS-MobileNetV3_dark	256x192	0.632	0.710	0.530	0.660	0.672	0.771	0.404	0.519	0.508	0.607	ckpt	log
S-ViPNAS-Res50_dark	256x192	0.650	0.732	0.550	0.686	0.684	0.783	0.437	0.554	0.528	0.632	ckpt	log

Topdown Heatmap + Resnet on Coco-Wholebody¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
pose_resnet_50	256x192	0.652	0.738	0.615	0.749	0.606	0.715	0.460	0.584	0.521	0.633	ckpt	log
pose_resnet_50	384x288	0.666	0.747	0.634	0.763	0.731	0.811	0.536	0.646	0.574	0.670	ckpt	log
pose_resnet_101	256x192	0.669	0.753	0.637	0.766	0.611	0.722	0.463	0.589	0.531	0.645	ckpt	log
pose_resnet_101	384x288	0.692	0.770	0.680	0.799	0.746	0.820	0.548	0.657	0.597	0.693	ckpt	log
pose_resnet_152	256x192	0.682	0.764	0.661	0.787	0.623	0.728	0.481	0.607	0.548	0.661	ckpt	log
pose_resnet_152	384x288	0.704	0.780	0.693	0.813	0.751	0.824	0.559	0.666	0.610	0.705	ckpt	log

Topdown Heatmap + Hrnet on Coco-Wholebody¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
pose_hrnet_w32	256x192	0.678	0.755	0.543	0.661	0.630	0.708	0.467	0.566	0.536	0.636	ckpt	log
pose_hrnet_w32	384x288	0.700	0.772	0.585	0.691	0.726	0.783	0.515	0.603	0.586	0.673	ckpt	log
pose_hrnet_w48	256x192	0.701	0.776	0.675	0.787	0.656	0.743	0.535	0.639	0.579	0.681	ckpt	log
pose_hrnet_w48	384x288	0.722	0.791	0.696	0.801	0.776	0.834	0.587	0.678	0.632	0.717	ckpt	log

Topdown Heatmap + Vipnas on Coco-Wholebody¶

@inproceedings{xu2021vipnas,
  author    = {Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
  title     = {{ViPNAS}: Efficient Video Pose Estimation via Neural Architecture Search},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2021}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
S-ViPNAS-MobileNetV3	256x192	0.619	0.700	0.477	0.608	0.585	0.689	0.386	0.505	0.473	0.578	ckpt	log
S-ViPNAS-Res50	256x192	0.643	0.726	0.553	0.694	0.587	0.698	0.410	0.529	0.495	0.607	ckpt	log

Topdown Heatmap + Cspnext + Udp on Coco-Wholebody¶

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@InProceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
pose_cspnext_m_udp	256x192	0.687	0.735	0.680	0.763	0.697	0.755	0.460	0.543	0.567	0.641	ckpt	log

Topdown Heatmap + Hrnet + Dark on Coco-Wholebody¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {7093--7102}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
pose_hrnet_w32_dark	256x192	0.693	0.764	0.564	0.674	0.737	0.809	0.503	0.602	0.582	0.671	ckpt	log
pose_hrnet_w48_dark+	384x288	0.742	0.807	0.707	0.806	0.841	0.892	0.602	0.694	0.661	0.743	ckpt	log

Note: + means the model is first pre-trained on the original COCO dataset and then fine-tuned on the COCO-WholeBody dataset. We find that this leads to better performance.

Rtmpose + Rtmpose on Coco-Wholebody¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  author    = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  title     = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year      = {2023},
  doi       = {10.48550/ARXIV.2303.07399},
  url       = {https://arxiv.org/abs/2303.07399},
  keywords  = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright = {Creative Commons Attribution 4.0 International}
}

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
rtmpose-m	256x192	0.673	0.750	0.615	0.752	0.813	0.871	0.475	0.589	0.582	0.674	ckpt	log
rtmpose-l	256x192	0.695	0.769	0.658	0.785	0.833	0.887	0.519	0.628	0.611	0.700	ckpt	log
rtmpose-l	384x288	0.712	0.781	0.693	0.811	0.882	0.919	0.579	0.677	0.648	0.730	ckpt	log

Rtmpose + RTMW on Cocktail14¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  author    = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  title     = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year      = {2023},
  doi       = {10.48550/ARXIV.2303.07399},
  url       = {https://arxiv.org/abs/2303.07399},
  keywords  = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright = {Creative Commons Attribution 4.0 International}
}

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  year      = {2020},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}
}

Cocktail14 denotes a model trained on 14 public datasets:
- AI Challenger
- CrowdPose
- MPII
- sub-JHMDB
- Halpe
- PoseTrack18
- COCO-Wholebody
- UBody
- Human-Art
- WFLW
- 300W
- COFW
- LaPa
- InterHand

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	Body AP	Body AR	Foot AP	Foot AR	Face AP	Face AR	Hand AP	Hand AR	Whole AP	Whole AR	ckpt	log
rtmw-m	256x192	0.676	0.747	0.671	0.794	0.783	0.854	0.491	0.604	0.582	0.673	ckpt	-
rtmw-l	256x192	0.743	0.807	0.763	0.868	0.834	0.889	0.598	0.701	0.660	0.746	ckpt	-
rtmw-x	256x192	0.746	0.808	0.770	0.869	0.844	0.896	0.610	0.710	0.672	0.752	ckpt	-
rtmw-l	384x288	0.761	0.824	0.793	0.885	0.884	0.921	0.663	0.752	0.701	0.780	ckpt	-
rtmw-x	384x288	0.763	0.826	0.796	0.888	0.884	0.923	0.664	0.755	0.702	0.781	ckpt	-

Topdown Heatmap + Vitpose on Coco¶

To utilize ViTPose, you’ll need to have MMPreTrain. To install the required version, run the following command:

mim install 'mmpretrain>=1.0.0'

ViTPose (NeurIPS'2022)

@inproceedings{xu2022vitpose,
  author    = {Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao},
  title     = {{ViTPose}: Simple Vision Transformer Baselines for Human Pose Estimation},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2022}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S	256x192	0.739	0.903	0.816	0.792	0.942	ckpt	log
ViTPose-B	256x192	0.757	0.905	0.829	0.810	0.946	ckpt	log
ViTPose-L	256x192	0.782	0.914	0.850	0.834	0.952	ckpt	log
ViTPose-H	256x192	0.788	0.917	0.855	0.839	0.954	ckpt	log
ViTPose-H*	256x192	0.790	0.916	0.857	0.840	0.953	ckpt	-

Models with * are converted from the official repo. The config files of these models are only for validation.

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S	256x192	0.736	0.900	0.811	0.790	0.940	ckpt	log
ViTPose-B	256x192	0.756	0.906	0.826	0.809	0.946	ckpt	log
ViTPose-L	256x192	0.780	0.914	0.851	0.833	0.952	ckpt	log
ViTPose-H	256x192	0.789	0.916	0.856	0.839	0.953	ckpt	log

Topdown Heatmap + Vitpose on Humanart¶

To utilize ViTPose, you’ll need to have MMPreTrain. To install the required version, run the following command:

mim install 'mmpretrain>=1.0.0'

ViTPose (NeurIPS'2022)

@inproceedings{xu2022vitpose,
  author    = {Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao},
  title     = {{ViTPose}: Simple Vision Transformer Baselines for Human Pose Estimation},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2022}
}

@inproceedings{jin2020whole,
  author    = {Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  title     = {Whole-Body Human Pose Estimation in the Wild},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2020}
}

@inproceedings{ju2023humanart,
  author    = {Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
  title     = {{Human-Art}: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2023}
}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S-coco	256x192	0.228	0.371	0.229	0.298	0.467	ckpt	log
ViTPose-S-humanart-coco	256x192	0.381	0.532	0.405	0.448	0.602	ckpt	log
ViTPose-B-coco	256x192	0.270	0.423	0.272	0.340	0.510	ckpt	log
ViTPose-B-humanart-coco	256x192	0.410	0.549	0.434	0.475	0.615	ckpt	log
ViTPose-L-coco	256x192	0.342	0.498	0.357	0.413	0.577	ckpt	log
ViTPose-L-humanart-coco	256x192	0.459	0.592	0.487	0.525	0.656	ckpt	log
ViTPose-H-coco	256x192	0.377	0.541	0.391	0.447	0.615	ckpt	log
ViTPose-H-humanart-coco	256x192	0.468	0.594	0.498	0.534	0.655	ckpt	log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S-coco	256x192	0.507	0.758	0.531	0.551	0.780	ckpt	log
ViTPose-S-humanart-coco	256x192	0.738	0.905	0.802	0.768	0.911	ckpt	log
ViTPose-B-coco	256x192	0.555	0.782	0.590	0.599	0.809	ckpt	log
ViTPose-B-humanart-coco	256x192	0.759	0.905	0.823	0.790	0.917	ckpt	log
ViTPose-L-coco	256x192	0.637	0.838	0.689	0.677	0.859	ckpt	log
ViTPose-L-humanart-coco	256x192	0.789	0.916	0.845	0.819	0.929	ckpt	log
ViTPose-H-coco	256x192	0.665	0.860	0.715	0.701	0.871	ckpt	log
ViTPose-H-humanart-coco	256x192	0.800	0.926	0.855	0.828	0.933	ckpt	log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ViTPose-S-coco	256x192	0.739	0.903	0.816	0.792	0.942	ckpt	log
ViTPose-S-humanart-coco	256x192	0.737	0.902	0.811	0.792	0.942	ckpt	log
ViTPose-B-coco	256x192	0.757	0.905	0.829	0.810	0.946	ckpt	log
ViTPose-B-humanart-coco	256x192	0.758	0.906	0.829	0.812	0.946	ckpt	log
ViTPose-L-coco	256x192	0.782	0.914	0.850	0.834	0.952	ckpt	log
ViTPose-L-humanart-coco	256x192	0.782	0.914	0.849	0.835	0.953	ckpt	log
ViTPose-H-coco	256x192	0.788	0.917	0.855	0.839	0.954	ckpt	log
ViTPose-H-humanart-coco	256x192	0.788	0.914	0.853	0.841	0.956	ckpt	log

OneHand10K (TCSVT’2019)¶

Topdown Regression + Resnet on Onehand10k¶

@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@article{wang2018mask,
  author    = {Wang, Yangang and Peng, Cong and Liu, Yebin},
  title     = {Mask-pose cascaded {CNN} for {2D} hand pose estimation from single color image},
  journal   = {IEEE Transactions on Circuits and Systems for Video Technology},
  volume    = {29},
  number    = {11},
  pages     = {3258--3268},
  year      = {2018},
  publisher = {IEEE}
}

Results on OneHand10K val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
deeppose_resnet_50	256x256	0.990	0.485	34.21	ckpt	log

Topdown Heatmap + Resnet on Onehand10k¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@article{wang2018mask,
  author    = {Wang, Yangang and Peng, Cong and Liu, Yebin},
  title     = {Mask-pose cascaded {CNN} for {2D} hand pose estimation from single color image},
  journal   = {IEEE Transactions on Circuits and Systems for Video Technology},
  volume    = {29},
  number    = {11},
  pages     = {3258--3268},
  year      = {2018},
  publisher = {IEEE}
}

Results on OneHand10K val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_resnet_50	256x256	0.989	0.555	25.16	ckpt	log

Topdown Heatmap + Hrnetv2 + Udp on Onehand10k¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

@article{wang2018mask,
  author    = {Wang, Yangang and Peng, Cong and Liu, Yebin},
  title     = {Mask-pose cascaded {CNN} for {2D} hand pose estimation from single color image},
  journal   = {IEEE Transactions on Circuits and Systems for Video Technology},
  volume    = {29},
  number    = {11},
  pages     = {3258--3268},
  year      = {2018},
  publisher = {IEEE}
}

Results on OneHand10K val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18_udp	256x256	0.990	0.571	23.88	ckpt	log

Topdown Heatmap + Hrnetv2 on Onehand10k¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@article{wang2018mask,
  author    = {Wang, Yangang and Peng, Cong and Liu, Yebin},
  title     = {Mask-pose cascaded {CNN} for {2D} hand pose estimation from single color image},
  journal   = {IEEE Transactions on Circuits and Systems for Video Technology},
  volume    = {29},
  number    = {11},
  pages     = {3258--3268},
  year      = {2018},
  publisher = {IEEE}
}

Results on OneHand10K val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18	256x256	0.990	0.567	24.26	ckpt	log

Topdown Heatmap + Hrnetv2 + Dark on Onehand10k¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {7093--7102}
}

@article{wang2018mask,
  author    = {Wang, Yangang and Peng, Cong and Liu, Yebin},
  title     = {Mask-pose cascaded {CNN} for {2D} hand pose estimation from single color image},
  journal   = {IEEE Transactions on Circuits and Systems for Video Technology},
  volume    = {29},
  number    = {11},
  pages     = {3258--3268},
  year      = {2018},
  publisher = {IEEE}
}

Results on OneHand10K val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18_dark	256x256	0.990	0.572	23.96	ckpt	log

Topdown Heatmap + Mobilenetv2 on Onehand10k¶

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted residuals and linear bottlenecks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {4510--4520},
  year      = {2018}
}

@article{wang2018mask,
  author    = {Wang, Yangang and Peng, Cong and Liu, Yebin},
  title     = {Mask-pose cascaded {CNN} for {2D} hand pose estimation from single color image},
  journal   = {IEEE Transactions on Circuits and Systems for Video Technology},
  volume    = {29},
  number    = {11},
  pages     = {3258--3268},
  year      = {2018},
  publisher = {IEEE}
}

Results on OneHand10K val set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_mobilenet_v2	256x256	0.986	0.537	28.56	ckpt	log

COFW (ICCV’2013)¶

Topdown Heatmap + Hrnetv2 on Cofw¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

COFW (ICCV'2013)

@inproceedings{burgos2013robust,
  author    = {Burgos-Artizzu, Xavier P and Perona, Pietro and Doll{\'a}r, Piotr},
  title     = {Robust face landmark estimation under occlusion},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  year      = {2013},
  pages     = {1513--1520}
}

Results on COFW dataset

The model is trained on COFW train.

Arch	Input Size	NME	ckpt	log
pose_hrnetv2_w18	256x256	3.48	ckpt	log

AFLW (ICCVW’2011)¶

Topdown Heatmap + Hrnetv2 + Dark on Aflw¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {7093--7102}
}

AFLW (ICCVW'2011)

@inproceedings{koestinger2011annotated,
  author       = {Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst},
  title        = {Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization},
  booktitle    = {2011 IEEE international conference on computer vision workshops (ICCV workshops)},
  organization = {IEEE},
  year         = {2011},
  pages        = {2144--2151}
}

Results on AFLW dataset

The model is trained on AFLW train and evaluated on AFLW full and frontal.

Arch	Input Size	NME_full	NME_frontal	ckpt	log
pose_hrnetv2_w18_dark	256x256	1.35	1.19	ckpt	log

Topdown Heatmap + Hrnetv2 on Aflw¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

AFLW (ICCVW'2011)

@inproceedings{koestinger2011annotated,
  author       = {Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst},
  title        = {Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization},
  booktitle    = {2011 IEEE international conference on computer vision workshops (ICCV workshops)},
  organization = {IEEE},
  year         = {2011},
  pages        = {2144--2151}
}

Results on AFLW dataset

The model is trained on AFLW train and evaluated on AFLW full and frontal.

Arch	Input Size	NME_full	NME_frontal	ckpt	log
pose_hrnetv2_w18	256x256	1.41	1.27	ckpt	log

RHD (ICCV’2017)¶

Topdown Regression + Resnet on Rhd2d¶

@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@techreport{zb2017hand,
  author      = {Christian Zimmermann and Thomas Brox},
  title       = {Learning to Estimate {3D} Hand Pose from Single {RGB} Images},
  institution = {arXiv:1705.01389},
  year        = {2017},
  note        = {https://arxiv.org/abs/1705.01389},
  url         = {https://lmb.informatik.uni-freiburg.de/projects/hand3d/}
}

Results on RHD test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
deeppose_resnet_50	256x256	0.988	0.865	3.32	ckpt	log

Topdown Heatmap + Hrnetv2 + Dark on Rhd2d¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {7093--7102}
}

@techreport{zb2017hand,
  author      = {Christian Zimmermann and Thomas Brox},
  title       = {Learning to Estimate {3D} Hand Pose from Single {RGB} Images},
  institution = {arXiv:1705.01389},
  year        = {2017},
  note        = {https://arxiv.org/abs/1705.01389},
  url         = {https://lmb.informatik.uni-freiburg.de/projects/hand3d/}
}

Results on RHD test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18_dark	256x256	0.992	0.903	2.18	ckpt	log

Topdown Heatmap + Mobilenetv2 on Rhd2d¶

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted residuals and linear bottlenecks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {4510--4520},
  year      = {2018}
}

@techreport{zb2017hand,
  author      = {Christian Zimmermann and Thomas Brox},
  title       = {Learning to Estimate {3D} Hand Pose from Single {RGB} Images},
  institution = {arXiv:1705.01389},
  year        = {2017},
  note        = {https://arxiv.org/abs/1705.01389},
  url         = {https://lmb.informatik.uni-freiburg.de/projects/hand3d/}
}

Results on RHD test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_mobilenet_v2	256x256	0.985	0.883	2.79	ckpt	log

Topdown Heatmap + Hrnetv2 + Udp on Rhd2d¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

@techreport{zb2017hand,
  author      = {Christian Zimmermann and Thomas Brox},
  title       = {Learning to Estimate {3D} Hand Pose from Single {RGB} Images},
  institution = {arXiv:1705.01389},
  year        = {2017},
  note        = {https://arxiv.org/abs/1705.01389},
  url         = {https://lmb.informatik.uni-freiburg.de/projects/hand3d/}
}

Results on RHD test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18_udp	256x256	0.992	0.902	2.19	ckpt	log

Topdown Heatmap + Resnet on Rhd2d¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@techreport{zb2017hand,
  author      = {Christian Zimmermann and Thomas Brox},
  title       = {Learning to Estimate {3D} Hand Pose from Single {RGB} Images},
  institution = {arXiv:1705.01389},
  year        = {2017},
  note        = {https://arxiv.org/abs/1705.01389},
  url         = {https://lmb.informatik.uni-freiburg.de/projects/hand3d/}
}

Results on RHD test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_resnet50	256x256	0.991	0.898	2.32	ckpt	log

Topdown Heatmap + Hrnetv2 on Rhd2d¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

@techreport{zb2017hand,
  author      = {Christian Zimmermann and Thomas Brox},
  title       = {Learning to Estimate {3D} Hand Pose from Single {RGB} Images},
  institution = {arXiv:1705.01389},
  year        = {2017},
  note        = {https://arxiv.org/abs/1705.01389},
  url         = {https://lmb.informatik.uni-freiburg.de/projects/hand3d/}
}

Results on RHD test set

Arch	Input Size	PCK@0.2	AUC	EPE	ckpt	log
pose_hrnetv2_w18	256x256	0.992	0.902	2.21	ckpt	log

LaPa (AAAI’2020)¶

Rtmpose + Rtmpose on Lapa¶

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

LaPa (AAAI'2020)

@inproceedings{liu2020new,
  author    = {Liu, Yinglu and Shi, Hailin and Shen, Hao and Si, Yue and Wang, Xiaobo and Mei, Tao},
  title     = {A New Dataset and Boundary-Attention Semantic Segmentation for Face Parsing},
  booktitle = {AAAI},
  pages     = {11637--11644},
  year      = {2020}
}

Results on LaPa val set

Arch	Input Size	NME	ckpt	log
pose_rtmpose_m	256x256	1.29	ckpt	log

300W (IMAVIS’2016)¶

Topdown Heatmap + Hrnetv2 on 300w¶

@article{WangSCJDZLMTWLX19,
  author  = {Jingdong Wang and Ke Sun and Tianheng Cheng and Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  title   = {Deep High-Resolution Representation Learning for Visual Recognition},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2019}
}

300W (IMAVIS'2016)

@article{sagonas2016300,
  author    = {Sagonas, Christos and Antonakos, Epameinondas and Tzimiropoulos, Georgios and Zafeiriou, Stefanos and Pantic, Maja},
  title     = {300 faces in-the-wild challenge: Database and results},
  journal   = {Image and vision computing},
  publisher = {Elsevier},
  volume    = {47},
  year      = {2016},
  pages     = {3--18}
}

Results on 300W dataset

The model is trained on 300W train.

Arch	Input Size	NME_common	NME_challenge	NME_full	NME_test	ckpt	log
pose_hrnetv2_w18	256x256	2.92	5.64	3.45	4.10	ckpt	log

COCO (ECCV’2014)¶

Topdown Regression + Mobilenetv2 + Rle on Coco¶

@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

RLE (ICCV'2021)

@inproceedings{li2021human,
  author    = {Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  title     = {Human pose regression with residual log-likelihood estimation},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  year      = {2021},
  pages     = {11025--11034}
}

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted residuals and linear bottlenecks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {4510--4520},
  year      = {2018}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
deeppose_mobilenetv2_rle_pretrained	256x192	0.593	0.836	0.660	0.644	0.877	ckpt	log

Topdown Regression + Resnet on Coco¶

@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
deeppose_resnet_50	256x192	0.541	0.824	0.601	0.649	0.893	ckpt	log
deeppose_resnet_101	256x192	0.562	0.831	0.629	0.670	0.900	ckpt	log
deeppose_resnet_152	256x192	0.584	0.842	0.659	0.688	0.907	ckpt	log

Topdown Regression + Resnet + Rle on Coco¶

@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1653--1660},
  year      = {2014}
}

RLE (ICCV'2021)

@inproceedings{li2021human,
  author    = {Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  title     = {Human pose regression with residual log-likelihood estimation},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  year      = {2021},
  pages     = {11025--11034}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
deeppose_resnet_50_rle	256x192	0.706	0.888	0.776	0.753	0.924	ckpt	log
deeppose_resnet_50_rle_pretrained	256x192	0.719	0.891	0.788	0.764	0.925	ckpt	log
deeppose_resnet_101_rle	256x192	0.722	0.894	0.794	0.768	0.930	ckpt	log
deeppose_resnet_152_rle	256x192	0.731	0.897	0.805	0.777	0.933	ckpt	log
deeppose_resnet_152_rle	384x288	0.749	0.901	0.815	0.793	0.935	ckpt	log

Rtmo + Rtmo on Coco¶

@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  year          = {2023},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
RTMO-s	640x640	0.677	0.878	0.737	0.715	0.908	ckpt	log
RTMO-m	640x640	0.709	0.890	0.778	0.747	0.920	ckpt	log
RTMO-l	640x640	0.724	0.899	0.788	0.762	0.927	ckpt	log

Yoloxpose + Yoloxpose on Coco¶

YOLO-Pose (CVPRW'2022)

@inproceedings{maji2022yolo,
  author    = {Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak},
  title     = {{YOLO-Pose}: Enhancing {YOLO} for multi person pose estimation using object keypoint similarity loss},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {2637--2646},
  year      = {2022}
}

YOLOX

@article{ge2021yolox,
  author  = {Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
  title   = {{YOLOX}: Exceeding {YOLO} series in 2021},
  journal = {arXiv preprint arXiv:2107.08430},
  year    = {2021}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
yoloxpose_tiny	416x416	0.526	0.793	0.556	0.571	0.833	ckpt	log
yoloxpose_s	640x640	0.641	0.872	0.702	0.682	0.902	ckpt	log
yoloxpose_m	640x640	0.695	0.899	0.766	0.733	0.926	ckpt	log
yoloxpose_l	640x640	0.712	0.901	0.782	0.749	0.926	ckpt	log

Edpose + Edpose on Coco¶

ED-Pose (ICLR'2023)

@inproceedings{yang2023explicit,
  author    = {Jie Yang and Ailing Zeng and Shilong Liu and Feng Li and Ruimao Zhang and Lei Zhang},
  title     = {Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation},
  booktitle = {International Conference on Learning Representations},
  year      = {2023},
  url       = {https://openreview.net/forum?id=s4WVupnJjmX}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017.

Arch	BackBone	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
edpose_res50_coco	ResNet-50	0.716	0.897	0.783	0.793	0.943	ckpt	log

The checkpoint is converted from the official repo. Training of EDPose is not supported yet; it will be supported in a future update.

The above config follows Pure Python style. Please install mmengine>=0.8.2 to use this config.

Simcc + Vipnas on Coco¶

SimCC (ECCV'2022)

@misc{https://doi.org/10.48550/arxiv.2107.03332,
  author        = {Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
  title         = {{SimCC}: a Simple Coordinate Classification Perspective for Human Pose Estimation},
  year          = {2021},
  eprint        = {2107.03332},
  archivePrefix = {arXiv}
}

@inproceedings{xu2021vipnas,
  author    = {Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
  title     = {{ViPNAS}: Efficient Video Pose Estimation via Neural Architecture Search},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2021}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European conference on computer vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
simcc_S-ViPNAS-MobileNetV3	256x192	0.695	0.883	0.772	0.755	0.927	ckpt	log

Simcc + Mobilenetv2 on Coco¶

SimCC (ECCV'2022)

@misc{https://doi.org/10.48550/arxiv.2107.03332,
  author        = {Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
  title         = {{SimCC}: a Simple Coordinate Classification Perspective for Human Pose Estimation},
  year          = {2021},
  eprint        = {2107.03332},
  archivePrefix = {arXiv}
}

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted Residuals and Linear Bottlenecks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2018},
  pages     = {4510--4520}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
simcc_mobilenetv2_wo_deconv	256x192	0.620	0.855	0.697	0.678	0.902	ckpt	log

Simcc + Resnet on Coco¶

SimCC (ECCV'2022)

@misc{https://doi.org/10.48550/arxiv.2107.03332,
  author        = {Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
  title         = {{SimCC}: a Simple Coordinate Classification Perspective for Human Pose Estimation},
  year          = {2021},
  eprint        = {2107.03332},
  archivePrefix = {arXiv},
  doi           = {10.48550/arXiv.2107.03332}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
simcc_resnet_50	256x192	0.721	0.897	0.798	0.781	0.937	ckpt	log
simcc_resnet_50	384x288	0.735	0.899	0.800	0.790	0.939	ckpt	log

Topdown Heatmap + Litehrnet on Coco¶

LiteHRNet (CVPR'2021)

@inproceedings{Yulitehrnet21,
  author    = {Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  title     = {Lite-HRNet: A Lightweight High-Resolution Network},
  booktitle = {CVPR},
  year      = {2021}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
LiteHRNet-18	256x192	0.642	0.867	0.719	0.705	0.911	ckpt	log
LiteHRNet-18	384x288	0.676	0.876	0.746	0.735	0.919	ckpt	log
LiteHRNet-30	256x192	0.676	0.880	0.756	0.736	0.922	ckpt	log
LiteHRNet-30	384x288	0.700	0.883	0.776	0.758	0.926	ckpt	log

Topdown Heatmap + Hrnet + Augmentation on Coco¶

Albumentations (Information'2020)

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@article{buslaev2020albumentations,
  author    = {Buslaev, Alexander and Iglovikov, Vladimir I and Khvedchenya, Eugene and Parinov, Alex and Druzhinin, Mikhail and Kalinin, Alexandr A},
  title     = {Albumentations: fast and flexible image augmentations},
  journal   = {Information},
  publisher = {Multidisciplinary Digital Publishing Institute},
  year      = {2020},
  volume    = {11},
  number    = {2},
  pages     = {125}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
coarsedropout	256x192	0.753	0.908	0.822	0.805	0.944	ckpt	log
gridmask	256x192	0.752	0.906	0.825	0.804	0.943	ckpt	log
photometric	256x192	0.754	0.908	0.825	0.805	0.943	ckpt	log

Topdown Heatmap + Seresnet on Coco¶

SEResNet (CVPR'2018)

@inproceedings{hu2018squeeze,
  author    = {Hu, Jie and Shen, Li and Sun, Gang},
  title     = {Squeeze-and-excitation networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2018},
  pages     = {7132--7141}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_seresnet_50	256x192	0.729	0.903	0.807	0.784	0.941	ckpt	log
pose_seresnet_50	384x288	0.748	0.904	0.819	0.799	0.941	ckpt	log
pose_seresnet_101	256x192	0.734	0.905	0.814	0.790	0.941	ckpt	log
pose_seresnet_101	384x288	0.754	0.907	0.823	0.805	0.943	ckpt	log
pose_seresnet_152*	256x192	0.730	0.899	0.810	0.787	0.939	ckpt	log
pose_seresnet_152*	384x288	0.753	0.906	0.824	0.806	0.945	ckpt	log

Note that * means without imagenet pre-training.

Topdown Heatmap + Hrnet on Coco¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32	256x192	0.749	0.906	0.821	0.804	0.945	ckpt	log
pose_hrnet_w32	384x288	0.761	0.908	0.826	0.811	0.944	ckpt	log
pose_hrnet_w48	256x192	0.756	0.908	0.826	0.809	0.945	ckpt	log
pose_hrnet_w48	384x288	0.767	0.911	0.832	0.817	0.947	ckpt	log

Topdown Heatmap + Shufflenetv1 on Coco¶

ShufflenetV1 (CVPR'2018)

@inproceedings{zhang2018shufflenet,
  author    = {Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
  title     = {{ShuffleNet}: An Extremely Efficient Convolutional Neural Network for Mobile Devices},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2018},
  pages     = {6848--6856}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_shufflenetv1	256x192	0.587	0.849	0.654	0.654	0.896	ckpt	log
pose_shufflenetv1	384x288	0.626	0.862	0.696	0.687	0.903	ckpt	log

Topdown Heatmap + VGG on Coco¶

VGG (ICLR'2015)

@article{simonyan2014very,
  author  = {Simonyan, Karen and Zisserman, Andrew},
  title   = {Very deep convolutional networks for large-scale image recognition},
  journal = {arXiv preprint arXiv:1409.1556},
  year    = {2014}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
vgg	256x192	0.699	0.890	0.769	0.754	0.927	ckpt	log

Topdown Heatmap + Resnet + Fp16 on Coco¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

FP16 (ArXiv'2017)

@article{micikevicius2017mixed,
  author  = {Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
  title   = {Mixed precision training},
  journal = {arXiv preprint arXiv:1710.03740},
  year    = {2017}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnet_50_fp16	256x192	0.716	0.898	0.798	0.772	0.937	ckpt	log

Topdown Heatmap + Hrnet + Aic on Coco¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A Large-Scale Dataset for Going Deeper in Image Understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

MMPose supports training models with combined datasets. coco-aic-merge and coco-aic-combine are two examples.

coco-aic-merge leverages AIC data with partial keypoints as auxiliary data to train a COCO model
coco-aic-combine constructs a combined dataset whose keypoints are the union of COCO and AIC keypoints to train a model that predicts keypoints of both datasets.

Evaluation results on COCO val2017 of models trained solely on the COCO dataset and on the combined datasets are shown below. These models are evaluated with a detector having human AP of 56.4 on COCO val2017 dataset.

Train Set	Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
coco	pose_hrnet_w32	256x192	0.749	0.906	0.821	0.804	0.945	ckpt	log
coco-aic-merge	pose_hrnet_w32	256x192	0.756	0.907	0.828	0.809	0.944	ckpt	log
coco-aic-combine	pose_hrnet_w32	256x192	0.755	0.904	0.825	0.807	0.942	ckpt	log

Topdown Heatmap + Vipnas on Coco¶

@inproceedings{xu2021vipnas,
  author    = {Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
  title     = {{ViPNAS}: Efficient Video Pose Estimation via Neural Architecture Search},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2021}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
S-ViPNAS-MobileNetV3	256x192	0.700	0.887	0.783	0.758	0.929	ckpt	log
S-ViPNAS-Res50	256x192	0.711	0.894	0.787	0.769	0.934	ckpt	log

Topdown Heatmap + Shufflenetv2 on Coco¶

ShufflenetV2 (ECCV'2018)

@inproceedings{ma2018shufflenet,
  author    = {Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
  title     = {{ShuffleNet} {V2}: Practical Guidelines for Efficient {CNN} Architecture Design},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {116--131}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_shufflenetv2	256x192	0.602	0.857	0.672	0.668	0.902	ckpt	log
pose_shufflenetv2	384x288	0.638	0.866	0.707	0.699	0.910	ckpt	log

Topdown Heatmap + Resnet + Dark on Coco¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {7093--7102}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnet_50_dark	256x192	0.724	0.897	0.797	0.777	0.934	ckpt	log
pose_resnet_50_dark	384x288	0.735	0.902	0.801	0.786	0.938	ckpt	log
pose_resnet_101_dark	256x192	0.733	0.900	0.810	0.786	0.938	ckpt	log
pose_resnet_101_dark	384x288	0.749	0.905	0.818	0.799	0.940	ckpt	log
pose_resnet_152_dark	256x192	0.743	0.906	0.819	0.796	0.943	ckpt	log
pose_resnet_152_dark	384x288	0.755	0.907	0.825	0.805	0.943	ckpt	log

Topdown Heatmap + Hrnet + Udp on Coco¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32_udp	256x192	0.762	0.907	0.829	0.810	0.942	ckpt	log
pose_hrnet_w32_udp	384x288	0.768	0.909	0.832	0.815	0.945	ckpt	log
pose_hrnet_w48_udp	256x192	0.768	0.908	0.833	0.817	0.945	ckpt	log
pose_hrnet_w48_udp	384x288	0.773	0.911	0.836	0.821	0.946	ckpt	log
pose_hrnet_w32_udp_regress	256x192	0.759	0.907	0.827	0.813	0.943	ckpt	log

Note that UDP also adopts the unbiased encoding/decoding algorithm of DARK.

Topdown Heatmap + Cspnext + Udp on Coco¶

@misc{lyu2022rtmdet,
  author        = {Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
  title         = {RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2022}
}

@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_cspnext_t_udp	256x192	0.665	0.874	0.723	0.723	0.917	ckpt	log
pose_cspnext_s_udp	256x192	0.697	0.886	0.776	0.753	0.929	ckpt	log
pose_cspnext_m_udp	256x192	0.732	0.896	0.806	0.785	0.937	ckpt	log
pose_cspnext_l_udp	256x192	0.750	0.904	0.822	0.800	0.941	ckpt	log
pose_cspnext_t_udp_aic_coco	256x192	0.655	0.884	0.731	0.689	0.890	ckpt	log
pose_cspnext_s_udp_aic_coco	256x192	0.700	0.905	0.783	0.733	0.918	ckpt	log
pose_cspnext_m_udp_aic_coco	256x192	0.748	0.925	0.818	0.777	0.933	ckpt	log
pose_cspnext_l_udp_aic_coco	256x192	0.772	0.936	0.839	0.799	0.943	ckpt	log

Note that UDP also adopts the unbiased encoding/decoding algorithm of DARK.

Flip test and detector are not used in the results of aic-coco training.

Topdown Heatmap + Alexnet on Coco¶

AlexNet (NeurIPS'2012)

@inproceedings{krizhevsky2012imagenet,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} Classification with Deep Convolutional Neural Networks},
  booktitle = {Advances in neural information processing systems},
  year      = {2012},
  pages     = {1097--1105}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_alexnet	256x192	0.448	0.767	0.461	0.521	0.829	ckpt	log

Topdown Heatmap + Mobilenetv2 on Coco¶

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted Residuals and Linear Bottlenecks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2018},
  pages     = {4510--4520}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_mobilenetv2	256x192	0.648	0.874	0.725	0.709	0.918	ckpt	log
pose_mobilenetv2	384x288	0.677	0.882	0.746	0.734	0.920	ckpt	log

Topdown Heatmap + Resnet on Coco¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  year      = {2018},
  pages     = {466--481}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2016},
  pages     = {770--778}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnet_50	256x192	0.718	0.898	0.796	0.774	0.934	ckpt	log
pose_resnet_50	384x288	0.731	0.900	0.799	0.782	0.937	ckpt	log
pose_resnet_101	256x192	0.728	0.904	0.809	0.783	0.942	ckpt	log
pose_resnet_101	384x288	0.749	0.906	0.817	0.799	0.941	ckpt	log
pose_resnet_152	256x192	0.736	0.904	0.818	0.791	0.942	ckpt	log
pose_resnet_152	384x288	0.750	0.908	0.821	0.800	0.942	ckpt	log

The following model is equipped with a visibility prediction head and has been trained using COCO and AIC datasets.

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnet_50	256x192	0.729	0.900	0.807	0.783	0.938	ckpt	log

Topdown Heatmap + Hrnet + Fp16 on Coco¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

FP16 (ArXiv'2017)

@article{micikevicius2017mixed,
  author  = {Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
  title   = {Mixed precision training},
  journal = {arXiv preprint arXiv:1710.03740},
  year    = {2017}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32_fp16	256x192	0.749	0.907	0.822	0.802	0.946	ckpt	log

Topdown Heatmap + MSPN on Coco¶

MSPN (ArXiv'2019)

@article{li2019rethinking,
  author  = {Li, Wenbo and Wang, Zhicheng and Yin, Binyi and Peng, Qixiang and Du, Yuming and Xiao, Tianzi and Yu, Gang and Lu, Hongtao and Wei, Yichen and Sun, Jian},
  title   = {Rethinking on Multi-Stage Networks for Human Pose Estimation},
  journal = {arXiv preprint arXiv:1901.00148},
  year    = {2019}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
mspn_50	256x192	0.723	0.895	0.794	0.788	0.934	ckpt	log
2xmspn_50	256x192	0.754	0.903	0.826	0.816	0.942	ckpt	log
3xmspn_50	256x192	0.758	0.904	0.830	0.821	0.943	ckpt	log
4xmspn_50	256x192	0.765	0.906	0.835	0.826	0.943	ckpt	log

Topdown Heatmap + CPM on Coco¶

CPM (CVPR'2016)

@inproceedings{wei2016convolutional,
  author    = {Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  title     = {Convolutional pose machines},
  booktitle = {Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
  year      = {2016},
  pages     = {4724--4732}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
cpm	256x192	0.627	0.862	0.709	0.689	0.906	ckpt	log
cpm	384x288	0.652	0.865	0.730	0.710	0.907	ckpt	log

Topdown Heatmap + RSN on Coco¶

RSN (ECCV'2020)

@misc{cai2020learning,
  author        = {Yuanhao Cai and Zhicheng Wang and Zhengxiong Luo and Binyi Yin and Angang Du and Haoqian Wang and Xinyu Zhou and Erjin Zhou and Xiangyu Zhang and Jian Sun},
  title         = {Learning Delicate Local Representations for Multi-Person Pose Estimation},
  eprint        = {2003.04030},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2020}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rsn_18	256x192	0.704	0.887	0.781	0.773	0.927	ckpt	log
rsn_50	256x192	0.724	0.894	0.799	0.790	0.935	ckpt	log
2xrsn_50	256x192	0.748	0.900	0.821	0.810	0.939	ckpt	log
3xrsn_50	256x192	0.750	0.900	0.824	0.814	0.941	ckpt	log

Topdown Heatmap + Resnest on Coco¶

ResNeSt (ArXiv'2020)

@article{zhang2020resnest,
  author  = {Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander},
  title   = {ResNeSt: Split-Attention Networks},
  journal = {arXiv preprint arXiv:2004.08955},
  year    = {2020}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnest_50	256x192	0.720	0.899	0.800	0.775	0.939	ckpt	log
pose_resnest_50	384x288	0.737	0.900	0.811	0.789	0.937	ckpt	log
pose_resnest_101	256x192	0.725	0.900	0.807	0.781	0.939	ckpt	log
pose_resnest_101	384x288	0.745	0.905	0.818	0.798	0.942	ckpt	log
pose_resnest_200	256x192	0.731	0.905	0.812	0.787	0.943	ckpt	log
pose_resnest_200	384x288	0.753	0.907	0.827	0.805	0.943	ckpt	log
pose_resnest_269	256x192	0.737	0.907	0.819	0.792	0.943	ckpt	log
pose_resnest_269	384x288	0.754	0.908	0.828	0.805	0.943	ckpt	log

Topdown Heatmap + Hourglass on Coco¶

@inproceedings{newell2016stacked,
  author       = {Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  title        = {Stacked hourglass networks for human pose estimation},
  booktitle    = {European conference on computer vision},
  organization = {Springer},
  year         = {2016},
  pages        = {483--499}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hourglass_52	256x256	0.726	0.896	0.799	0.780	0.934	ckpt	log
pose_hourglass_52	384x384	0.746	0.900	0.812	0.797	0.939	ckpt	log

Topdown Heatmap + Resnetv1d on Coco¶

ResNetV1D (CVPR'2019)

@inproceedings{he2019bag,
  author    = {He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
  title     = {Bag of tricks for image classification with convolutional neural networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2019},
  pages     = {558--567}
}

@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle    = {European conference on computer vision},
  year         = {2014},
  pages        = {740--755},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnetv1d_50	256x192	0.722	0.897	0.796	0.777	0.936	ckpt	log
pose_resnetv1d_50	384x288	0.730	0.899	0.800	0.782	0.935	ckpt	log
pose_resnetv1d_101	256x192	0.732	0.901	0.808	0.785	0.940	ckpt	log
pose_resnetv1d_101	384x288	0.748	0.906	0.817	0.798	0.941	ckpt	log
pose_resnetv1d_152	256x192	0.737	0.904	0.814	0.790	0.940	ckpt	log
pose_resnetv1d_152	384x288	0.751	0.907	0.821	0.801	0.942	ckpt	log

Topdown Heatmap + Hrnet + Dark on Coco¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {5693--5703}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {7093--7102},
  year      = {2020}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32_dark	256x192	0.757	0.907	0.825	0.807	0.943	ckpt	log
pose_hrnet_w32_dark	384x288	0.766	0.907	0.829	0.815	0.943	ckpt	log
pose_hrnet_w48_dark	256x192	0.764	0.907	0.831	0.814	0.942	ckpt	log
pose_hrnet_w48_dark	384x288	0.772	0.911	0.833	0.821	0.948	ckpt	log

Topdown Heatmap + Hrformer on Coco¶

HRFormer (NIPS'2021)

@article{yuan2021hrformer,
  title={{HRFormer}: High-Resolution Vision Transformer for Dense Prediction},
  author={Yuan, Yuhui and Fu, Rao and Huang, Lang and Lin, Weihong and Zhang, Chao and Chen, Xilin and Wang, Jingdong},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  year={2021}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrformer_small	256x192	0.738	0.904	0.812	0.793	0.941	ckpt	log
pose_hrformer_small	384x288	0.757	0.905	0.824	0.807	0.941	ckpt	log
pose_hrformer_base	256x192	0.754	0.906	0.827	0.807	0.943	ckpt	log
pose_hrformer_base	384x288	0.774	0.909	0.842	0.823	0.945	ckpt	log

Topdown Heatmap + Resnext on Coco¶

ResNext (CVPR'2017)

@inproceedings{xie2017aggregated,
  author    = {Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  title     = {Aggregated residual transformations for deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1492--1500},
  year      = {2017}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_resnext_50	256x192	0.715	0.897	0.791	0.771	0.935	ckpt	log
pose_resnext_50	384x288	0.724	0.899	0.794	0.777	0.936	ckpt	log
pose_resnext_101	256x192	0.726	0.900	0.801	0.781	0.939	ckpt	log
pose_resnext_101	384x288	0.744	0.903	0.815	0.794	0.939	ckpt	log
pose_resnext_152	256x192	0.730	0.903	0.808	0.785	0.940	ckpt	log
pose_resnext_152	384x288	0.742	0.904	0.810	0.794	0.940	ckpt	log

Topdown Heatmap + PVT on Coco¶

PVT (ICCV'2021)

@inproceedings{wang2021pyramid,
  author    = {Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  title     = {Pyramid vision transformer: A versatile backbone for dense prediction without convolutions},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {568--578},
  year      = {2021}
}

PVTV2 (CVMJ'2022)

@article{wang2022pvt,
  title={{PVT} v2: Improved baselines with Pyramid Vision Transformer},
  author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  journal={Computational Visual Media},
  volume={8},
  number={3},
  pages={415--424},
  year={2022},
  publisher={Springer}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_pvt-s	256x192	0.714	0.896	0.794	0.773	0.936	ckpt	log
pose_pvtv2-b2	256x192	0.737	0.905	0.812	0.791	0.942	ckpt	log

Topdown Heatmap + Scnet on Coco¶

@inproceedings{liu2020improving,
  author    = {Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  title     = {Improving Convolutional Networks with Self-Calibrated Convolutions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {10096--10105},
  year      = {2020}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_scnet_50	256x192	0.728	0.899	0.807	0.784	0.938	ckpt	log
pose_scnet_50	384x288	0.751	0.906	0.818	0.802	0.942	ckpt	log
pose_scnet_101	256x192	0.733	0.902	0.811	0.789	0.940	ckpt	log
pose_scnet_101	384x288	0.752	0.906	0.823	0.804	0.943	ckpt	log

Topdown Heatmap + Swin on Coco¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}

Swin (ICCV'2021)

@inproceedings{liu2021swin,
  author    = {Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
  title     = {Swin transformer: Hierarchical vision transformer using shifted windows},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {10012--10022},
  year      = {2021}
}

FPN (CVPR'2017)

@inproceedings{lin2017feature,
  author    = {Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
  title     = {Feature pyramid networks for object detection},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2117--2125},
  year      = {2017}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_swin_t	256x192	0.724	0.901	0.806	0.782	0.940	ckpt	log
pose_swin_b	256x192	0.737	0.904	0.820	0.794	0.942	ckpt	log
pose_swin_b	384x288	0.759	0.910	0.832	0.811	0.946	ckpt	log
pose_swin_l	256x192	0.743	0.906	0.821	0.798	0.943	ckpt	log
pose_swin_l	384x288	0.763	0.912	0.830	0.814	0.949	ckpt	log

Integral Regression + Resnet + Debias on Coco¶

Debias IPR (ICCV'2021)

@inproceedings{gu2021removing,
  author    = {Gu, Kerui and Yang, Linlin and Yao, Angela},
  title     = {Removing the Bias of Integral Pose Regression},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {11067--11076},
  year      = {2021}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
debias-ipr_resnet_50	256x256	0.675	0.872	0.740	0.765	0.928	ckpt	log

Integral Regression + Resnet + DSNT on Coco¶

DSNT (2018)

@article{nibali2018numerical,
  title={Numerical Coordinate Regression with Convolutional Neural Networks},
  author={Nibali, Aiden and He, Zhen and Morgan, Stuart and Prendergast, Luke},
  journal={arXiv preprint arXiv:1801.07372},
  year={2018},
  eprint={1801.07372},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ipr_resnet_50_dsnt	256x256	0.674	0.870	0.744	0.764	0.928	ckpt	log

Integral Regression + Resnet + Ipr on Coco¶

IPR (ECCV'2018)

@inproceedings{sun2018integral,
  author    = {Sun, Xiao and Xiao, Bin and Wei, Fangyin and Liang, Shuang and Wei, Yichen},
  title     = {Integral human pose regression},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {529--545},
  year      = {2018}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
ipr_resnet_50	256x256	0.633	0.860	0.703	0.730	0.919	ckpt	log

Dekr + Hrnet on Coco¶

DEKR (CVPR'2021)

@inproceedings{geng2021bottom,
  author    = {Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
  title     = {Bottom-up human pose estimation via disentangled keypoint regression},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {14676--14686},
  year      = {2021}
}

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 without multi-scale test

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
HRNet-w32	512x512	0.686	0.868	0.750	0.735	0.898	ckpt	log
HRNet-w48	640x640	0.714	0.883	0.777	0.762	0.915	ckpt	log

Cid + Hrnet on Coco¶

CID (CVPR'2022)

@InProceedings{Wang_2022_CVPR,
    author    = {Wang, Dongkai and Zhang, Shiliang},
    title     = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = jun,
    year      = {2022},
    pages     = {11060--11068}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 without multi-scale test

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
CID	512x512	0.704	0.894	0.775	0.753	0.928	ckpt	log
CID	512x512	0.715	0.900	0.782	0.765	0.935	ckpt	log

Rtmpose + Rtmpose on Coco¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International},
  eprint = {2303.07399},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV}
}

@misc{lyu2022rtmdet,
      title={{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
      author={Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

Associative Embedding (NIPS'2017)

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t	256x192	0.682	0.883	0.759	0.736	0.920	ckpt	log
rtmpose-s	256x192	0.716	0.892	0.789	0.768	0.929	ckpt	log
rtmpose-m	256x192	0.746	0.899	0.817	0.795	0.935	ckpt	log
rtmpose-l	256x192	0.758	0.906	0.826	0.806	0.942	ckpt	log
rtmpose-t-aic-coco	256x192	0.685	0.880	0.761	0.738	0.918	ckpt	log
rtmpose-s-aic-coco	256x192	0.722	0.892	0.794	0.772	0.929	ckpt	log
rtmpose-m-aic-coco	256x192	0.758	0.903	0.826	0.806	0.940	ckpt	log
rtmpose-l-aic-coco	256x192	0.765	0.906	0.835	0.813	0.942	ckpt	log
rtmpose-m-aic-coco	384x288	0.770	0.908	0.833	0.816	0.943	ckpt	log
rtmpose-l-aic-coco	384x288	0.773	0.907	0.835	0.819	0.942	ckpt	log

Associative Embedding + Hrnet on Coco¶

@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in neural information processing systems},
  pages     = {2277--2287},
  year      = {2017}
}

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 without multi-scale test

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
HRNet-w32	512x512	0.656	0.864	0.719	0.711	0.893	ckpt	log

Rtmo + Rtmo on Body7¶

@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  year          = {2023},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@article{wu2017ai,
  title={{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017},
  eprint={1711.06475},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

@article{li2018crowdpose,
  title={{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018},
  eprint={1812.00324},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}

@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {Jhuang, Hueihan and Gall, Juergen and Zuffi, Silvia and Schmid, Cordelia and Black, Michael J.},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = dec,
  pages = {3192--3199},
  year = {2013}
}

@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

@inproceedings{andriluka2018posetrack,
  title={{PoseTrack}: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}

@inproceedings{li2020pastanet,
  title={{PaStaNet}: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2020}
}

Results on COCO val2017

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log	onnx
RTMO-t	640x640	0.574	0.803	0.613	0.611	0.836	ckpt	log	onnx
RTMO-s	640x640	0.686	0.879	0.744	0.723	0.908	ckpt	log	onnx
RTMO-m	640x640	0.726	0.899	0.790	0.763	0.926	ckpt	log	onnx
RTMO-l	640x640	0.748	0.911	0.813	0.786	0.939	ckpt	log	onnx

Topdown Heatmap + Hrnet on Humanart¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

@inproceedings{ju2023humanart,
    title={{Human-Art}: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
    author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    year={2023}
}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32-coco	256x192	0.252	0.397	0.255	0.321	0.485	ckpt	log
pose_hrnet_w32-humanart-coco	256x192	0.399	0.545	0.420	0.466	0.613	ckpt	log
pose_hrnet_w48-coco	256x192	0.271	0.413	0.277	0.339	0.499	ckpt	log
pose_hrnet_w48-humanart-coco	256x192	0.417	0.553	0.442	0.481	0.617	ckpt	log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32-coco	256x192	0.533	0.771	0.562	0.574	0.792	ckpt	log
pose_hrnet_w32-humanart-coco	256x192	0.754	0.906	0.812	0.783	0.916	ckpt	log
pose_hrnet_w48-coco	256x192	0.557	0.782	0.593	0.595	0.804	ckpt	log
pose_hrnet_w48-humanart-coco	256x192	0.769	0.906	0.825	0.796	0.919	ckpt	log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
pose_hrnet_w32-coco	256x192	0.749	0.906	0.821	0.804	0.945	ckpt	log
pose_hrnet_w32-humanart-coco	256x192	0.741	0.902	0.814	0.795	0.941	ckpt	log
pose_hrnet_w48-coco	256x192	0.756	0.908	0.826	0.809	0.945	ckpt	log
pose_hrnet_w48-humanart-coco	256x192	0.751	0.905	0.822	0.805	0.943	ckpt	log

Rtmpose + Rtmpose on Humanart¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International},
  eprint = {2303.07399},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV}
}

@misc{lyu2022rtmdet,
      title={{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
      author={Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

@inproceedings{ju2023humanart,
    title={{Human-Art}: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
    author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    year={2023}
}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-coco	256x192	0.161	0.283	0.154	0.221	0.373	ckpt	log
rtmpose-t-humanart-coco	256x192	0.249	0.395	0.256	0.323	0.485	ckpt	log
rtmpose-s-coco	256x192	0.199	0.328	0.198	0.261	0.418	ckpt	log
rtmpose-s-humanart-coco	256x192	0.311	0.462	0.323	0.381	0.540	ckpt	log
rtmpose-m-coco	256x192	0.239	0.372	0.243	0.302	0.455	ckpt	log
rtmpose-m-humanart-coco	256x192	0.355	0.503	0.377	0.417	0.568	ckpt	log
rtmpose-l-coco	256x192	0.260	0.393	0.267	0.323	0.472	ckpt	log
rtmpose-l-humanart-coco	256x192	0.378	0.521	0.399	0.442	0.584	ckpt	log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-coco	256x192	0.444	0.725	0.453	0.488	0.750	ckpt	log
rtmpose-t-humanart-coco	256x192	0.655	0.872	0.720	0.693	0.890	ckpt	log
rtmpose-s-coco	256x192	0.480	0.739	0.498	0.521	0.763	ckpt	log
rtmpose-s-humanart-coco	256x192	0.698	0.893	0.768	0.732	0.903	ckpt	log
rtmpose-m-coco	256x192	0.532	0.765	0.563	0.571	0.789	ckpt	log
rtmpose-m-humanart-coco	256x192	0.728	0.895	0.791	0.759	0.906	ckpt	log
rtmpose-l-coco	256x192	0.564	0.789	0.602	0.599	0.808	ckpt	log
rtmpose-l-humanart-coco	256x192	0.753	0.905	0.812	0.783	0.915	ckpt	log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-coco	256x192	0.682	0.883	0.759	0.736	0.920	ckpt	log
rtmpose-t-humanart-coco	256x192	0.665	0.875	0.739	0.721	0.916	ckpt	log
rtmpose-s-coco	256x192	0.716	0.892	0.789	0.768	0.929	ckpt	log
rtmpose-s-humanart-coco	256x192	0.706	0.888	0.780	0.759	0.928	ckpt	log
rtmpose-m-coco	256x192	0.746	0.899	0.817	0.795	0.935	ckpt	log
rtmpose-m-humanart-coco	256x192	0.725	0.892	0.795	0.775	0.929	ckpt	log
rtmpose-l-coco	256x192	0.758	0.906	0.826	0.806	0.942	ckpt	log
rtmpose-l-humanart-coco	256x192	0.748	0.901	0.816	0.796	0.938	ckpt	log

Results on COCO val2017 with ground-truth bounding box

Arch	Input Size	AP	AP⁵⁰	AP⁷⁵	AR	AR⁵⁰	ckpt	log
rtmpose-t-humanart-coco	256x192	0.679	0.895	0.755	0.710	0.907	ckpt	log
rtmpose-s-humanart-coco	256x192	0.725	0.916	0.798	0.753	0.925	ckpt	log
rtmpose-m-humanart-coco	256x192	0.744	0.916	0.818	0.770	0.930	ckpt	log
rtmpose-l-humanart-coco	256x192	0.770	0.927	0.840	0.794	0.939	ckpt	log

Rtmpose + Rtmpose + Body8-Coco on Body8¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International},
  eprint = {2303.07399},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV}
}

@misc{lyu2022rtmdet,
      title={{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
      author={Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset.
* denotes model trained on 7 public datasets:
- AI Challenger
- MS COCO
- CrowdPose
- MPII
- sub-JHMDB
- Halpe
- PoseTrack18
Body8 denotes the 7 datasets listed above plus the OCHuman dataset, and is used for evaluation.

Config	Input Size	AP^(COCO)	PCK@0.1^(Body8)	AUC^(Body8)	EPE^(Body8)	Params(M)	FLOPS(G)	Download
RTMPose-t*	256x192	65.9	91.44	63.18	19.45	3.34	0.36	Model
RTMPose-s*	256x192	69.7	92.45	65.15	17.85	5.47	0.68	Model
RTMPose-m*	256x192	74.9	94.25	68.59	15.12	13.59	1.93	Model
RTMPose-l*	256x192	76.7	95.08	70.14	13.79	27.66	4.16	Model
RTMPose-m*	384x288	76.6	94.64	70.38	13.98	13.72	4.33	Model
RTMPose-l*	384x288	78.3	95.36	71.58	13.08	27.79	9.35	Model

Rtmpose + Rtmpose on Face6¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International},
  eprint = {2303.07399},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV}
}

@misc{lyu2022rtmdet,
      title={{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
      author={Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset.
Face6 and * denote model trained on 6 public datasets:
- COCO-Wholebody-Face
- WFLW
- 300W
- COFW
- Halpe
- LaPa

Config	Input Size	NME^(LaPa)	FLOPS^(G)	Download
RTMPose-t*	256x256	1.67	0.652	Model
RTMPose-s*	256x256	1.59	1.119	Model
RTMPose-m*	256x256	1.44	2.852	Model

Rtmpose + Rtmpose on Hand5¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  title = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International},
  eprint = {2303.07399},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV}
}

@misc{lyu2022rtmdet,
      title={{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
      author={Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{lin2014microsoft,
  title={Microsoft {COCO}: Common Objects in Context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Hand5 and * denote model trained on 5 public datasets:

Config	Input Size	PCK@0.2^{(COCO-Wholebody-Hand)}	PCK@0.2^(Hand5)	AUC^(Hand5)	EPE^(Hand5)	FLOPS(G)	Download
RTMPose-m*^{(alpha version)}	256x256	81.5	96.4	83.9	5.06	2.581	ckpt

MPII (CVPR’2014)¶

Topdown Regression + Resnet + Rle on Mpii¶

@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human Pose Estimation via Deep Neural Networks},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {1653--1660},
  year      = {2014}
}

RLE (ICCV'2021)

@inproceedings{li2021human,
  author    = {Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  title     = {Human Pose Regression with Residual Log-Likelihood Estimation},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision},
  pages     = {11025--11034},
  year      = {2021}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
deeppose_resnet_50_rle	256x256	0.861	0.277	ckpt	log

Topdown Regression + Resnet on Mpii¶

@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human Pose Estimation via Deep Neural Networks},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {1653--1660},
  year      = {2014}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
deeppose_resnet_50	256x256	0.826	0.180	ckpt	log
deeppose_resnet_101	256x256	0.841	0.200	ckpt	log
deeppose_resnet_152	256x256	0.850	0.208	ckpt	log

Topdown Heatmap + Litehrnet on Mpii¶

LiteHRNet (CVPR'2021)

@inproceedings{Yulitehrnet21,
  author    = {Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  title     = {{Lite-HRNet}: A Lightweight High-Resolution Network},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2021}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
LiteHRNet-18	256x256	0.859	0.260	ckpt	log
LiteHRNet-30	256x256	0.869	0.271	ckpt	log

Topdown Heatmap + Shufflenetv2 on Mpii¶

ShufflenetV2 (ECCV'2018)

@inproceedings{ma2018shufflenet,
  author    = {Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
  title     = {{ShuffleNet} {V2}: Practical Guidelines for Efficient {CNN} Architecture Design},
  booktitle = {Proceedings of the European Conference on Computer Vision ({ECCV})},
  pages     = {116--131},
  year      = {2018}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_shufflenetv2	256x256	0.828	0.205	ckpt	log

Topdown Heatmap + Hrnet + Dark on Mpii¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep High-Resolution Representation Learning for Human Pose Estimation},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019}
}

@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-Aware Coordinate Representation for Human Pose Estimation},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition},
  pages     = {7093--7102},
  year      = {2020}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_hrnet_w32_dark	256x256	0.904	0.354	ckpt	log
pose_hrnet_w48_dark	256x256	0.905	0.360	ckpt	log

Topdown Heatmap + Shufflenetv1 on Mpii¶

ShufflenetV1 (CVPR'2018)

@inproceedings{zhang2018shufflenet,
  author    = {Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
  title     = {{ShuffleNet}: An Extremely Efficient Convolutional Neural Network for Mobile Devices},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {6848--6856},
  year      = {2018}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_shufflenetv1	256x256	0.824	0.195	ckpt	log

Topdown Heatmap + Seresnet on Mpii¶

SEResNet (CVPR'2018)

@inproceedings{hu2018squeeze,
  author    = {Hu, Jie and Shen, Li and Sun, Gang},
  title     = {Squeeze-and-Excitation Networks},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {7132--7141},
  year      = {2018}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_seresnet_50	256x256	0.884	0.292	ckpt	log
pose_seresnet_101	256x256	0.884	0.295	ckpt	log
pose_seresnet_152*	256x256	0.884	0.287	ckpt	log

Note that * means without imagenet pre-training.

Topdown Heatmap + Cspnext + Udp on Mpii¶

@misc{lyu2022rtmdet,
  author        = {Lyu, Chengqi and Zhang, Wenwei and Huang, Haian and Zhou, Yue and Wang, Yudong and Liu, Yanyi and Zhang, Shilong and Chen, Kai},
  title         = {{RTMDet}: An Empirical Study of Designing Real-Time Object Detectors},
  year          = {2022},
  eprint        = {2212.07784},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2020},
  month     = jun
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_cspnext_m_udp	256x256	0.902	0.303	ckpt	log

Topdown Heatmap + Hrnet on Mpii¶

@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep High-Resolution Representation Learning for Human Pose Estimation},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_hrnet_w32	256x256	0.900	0.334	ckpt	log
pose_hrnet_w48	256x256	0.901	0.337	ckpt	log

Topdown Heatmap + Hourglass on Mpii¶

@inproceedings{newell2016stacked,
  author    = {Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  title     = {Stacked Hourglass Networks for Human Pose Estimation},
  booktitle = {European Conference on Computer Vision},
  pages     = {483--499},
  year      = {2016},
  publisher = {Springer}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_hourglass_52	256x256	0.889	0.317	ckpt	log
pose_hourglass_52	384x384	0.894	0.367	ckpt	log

Topdown Heatmap + Resnext on Mpii¶

ResNext (CVPR'2017)

@inproceedings{xie2017aggregated,
  author    = {Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  title     = {Aggregated Residual Transformations for Deep Neural Networks},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {1492--1500},
  year      = {2017}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_resnext_152	256x256	0.887	0.294	ckpt	log

Topdown Heatmap + CPM on Mpii¶

CPM (CVPR'2016)

@inproceedings{wei2016convolutional,
  author    = {Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  title     = {Convolutional Pose Machines},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {4724--4732},
  year      = {2016}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
cpm	368x368	0.876	0.285	ckpt	log

Topdown Heatmap + Resnet on Mpii¶

@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision ({ECCV})},
  pages     = {466--481},
  year      = {2018}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_resnet_50	256x256	0.882	0.286	ckpt	log
pose_resnet_101	256x256	0.888	0.290	ckpt	log
pose_resnet_152	256x256	0.889	0.303	ckpt	log

Topdown Heatmap + Mobilenetv2 on Mpii¶

@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted Residuals and Linear Bottlenecks},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {4510--4520},
  year      = {2018}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_mobilenetv2	256x256	0.854	0.234	ckpt	log

Topdown Heatmap + Scnet on Mpii¶

@inproceedings{liu2020improving,
  author    = {Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  title     = {Improving Convolutional Networks with Self-Calibrated Convolutions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2020},
  pages     = {10096--10105}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_scnet_50	256x256	0.888	0.290	ckpt	log
pose_scnet_101	256x256	0.887	0.293	ckpt	log

Topdown Heatmap + Resnetv1d on Mpii¶

ResNetV1D (CVPR'2019)

@inproceedings{he2019bag,
  author    = {He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
  title     = {Bag of Tricks for Image Classification with Convolutional Neural Networks},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {558--567},
  year      = {2019}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean	Mean@0.1	ckpt	log
pose_resnetv1d_50	256x256	0.881	0.290	ckpt	log
pose_resnetv1d_101	256x256	0.883	0.295	ckpt	log
pose_resnetv1d_152	256x256	0.888	0.300	ckpt	log

Rtmpose + Rtmpose on Mpii¶

@misc{https://doi.org/10.48550/arxiv.2303.07399,
  author        = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  title         = {{RTMPose}: Real-Time Multi-Person Pose Estimation based on {MMPose}},
  publisher     = {arXiv},
  year          = {2023},
  doi           = {10.48550/ARXIV.2303.07399},
  url           = {https://arxiv.org/abs/2303.07399},
  eprint        = {2303.07399},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
  copyright     = {Creative Commons Attribution 4.0 International}
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

Results on MPII val set

Arch	Input Size	Mean / w. flip	Mean@0.1	ckpt	log
rtmpose-m	256x256	0.907	0.348	ckpt	log

Rtmo + Rtmo on Body7¶

@misc{lu2023rtmo,
  author        = {Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
  title         = {{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
  eprint        = {2312.07526},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  year          = {2023}
}

@article{wu2017ai,
  author  = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title   = {{AI} {Challenger}: A Large-Scale Dataset for Going Deeper in Image Understanding},
  journal = {arXiv preprint arXiv:1711.06475},
  year    = {2017}
}

@inproceedings{lin2014microsoft,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title     = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle = {European Conference on Computer Vision},
  pages     = {740--755},
  year      = {2014},
  publisher = {Springer}
}

@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

@inproceedings{Jhuang:ICCV:2013,
  author    = {Jhuang, Hueihan and Gall, Juergen and Zuffi, Silvia and Schmid, Cordelia and Black, Michael J.},
  title     = {Towards Understanding Action Recognition},
  booktitle = {International Conference on Computer Vision ({ICCV})},
  pages     = {3192--3199},
  year      = {2013},
  month     = dec
}

@inproceedings{andriluka14cvpr,
  author    = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title     = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2014},
  month     = jun
}

@inproceedings{andriluka2018posetrack,
  author    = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  title     = {{PoseTrack}: A Benchmark for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {5167--5176},
  year      = {2018}
}