Shortcuts

Datasets




300WLP (IEEE’2017)


Topdown Heatmap + Hrnetv2 on 300wlp

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
300WLP (IEEE'2017)
@article{zhu2017face,
  title={Face alignment in full pose range: A 3d total solution},
  author={Zhu, Xiangyu and Liu, Xiaoming and Lei, Zhen and Li, Stan Z},
  journal={IEEE transactions on pattern analysis and machine intelligence},
  year={2017},
  publisher={IEEE}
}

Results on 300W-LP dataset

The model is trained on 300W-LP train.

Arch Input Size NMEfull NMEtest ckpt log
pose_hrnetv2_w18 256x256 0.0413 0.04125 ckpt log



CrowdPose (CVPR’2019)


Rtmo + Rtmo on Body7

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}
Halpe (CVPR'2020)
@inproceedings{li2020pastanet,
  title={PaStaNet: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={CVPR},
  year={2020}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log onnx
RTMO-t 640x640 0.574 0.803 0.613 0.611 0.836 ckpt log onnx
RTMO-s 640x640 0.686 0.879 0.744 0.723 0.908 ckpt log onnx
RTMO-m 640x640 0.726 0.899 0.790 0.763 0.926 ckpt log onnx
RTMO-l 640x640 0.748 0.911 0.813 0.786 0.939 ckpt log onnx

Rtmo + Rtmo on Crowdpose

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
RTMO-s 640x640 0.673 0.882 0.729 0.737 0.682 0.591 ckpt log
RTMO-m 640x640 0.711 0.897 0.771 0.774 0.719 0.634 ckpt log
RTMO-l 640x640 0.732 0.907 0.793 0.792 0.741 0.653 ckpt log
RTMO-l* 640x640 0.838 0.947 0.893 0.888 0.847 0.772 ckpt log

* indicates the model is trained using a combined dataset composed of AI Challenger, COCO, CrowdPose, Halpe, MPII, PoseTrack18 and sub-JHMDB.


Dekr + Hrnet on Crowdpose

DEKR (CVPR'2021)
@inproceedings{geng2021bottom,
  title={Bottom-up human pose estimation via disentangled keypoint regression},
  author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={14676--14686},
  year={2021}
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}

Results on CrowdPose test without multi-scale test

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
HRNet-w32 512x512 0.663 0.857 0.714 0.740 0.671 0.576 ckpt log
HRNet-w48 640x640 0.679 0.869 0.731 0.753 0.688 0.593 ckpt log

Topdown Heatmap + Cspnext + Udp on Crowdpose

RTMDet (ArXiv 2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
pose_cspnext_m 256x192 0.662 0.821 0.723 0.759 0.675 0.539 ckpt log

Topdown Heatmap + Hrnet on Crowdpose

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
pose_hrnet_w32 256x192 0.675 0.825 0.729 0.770 0.687 0.553 ckpt log

Topdown Heatmap + Resnet on Crowdpose

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
pose_resnet_50 256x192 0.637 0.808 0.692 0.738 0.650 0.506 ckpt log
pose_resnet_101 256x192 0.647 0.810 0.703 0.745 0.658 0.521 ckpt log
pose_resnet_101 320x256 0.661 0.821 0.714 0.759 0.672 0.534 ckpt log
pose_resnet_152 256x192 0.656 0.818 0.712 0.754 0.666 0.533 ckpt log

Rtmpose + Rtmpose on Crowdpose

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
rtmpose-m 256x192 0.706 0.841 0.765 0.799 0.719 0.582 ckpt log



AI Challenger (ArXiv’2017)


Rtmo + Rtmo on Body7

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}
Halpe (CVPR'2020)
@inproceedings{li2020pastanet,
  title={PaStaNet: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={CVPR},
  year={2020}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log onnx
RTMO-t 640x640 0.574 0.803 0.613 0.611 0.836 ckpt log onnx
RTMO-s 640x640 0.686 0.879 0.744 0.723 0.908 ckpt log onnx
RTMO-m 640x640 0.726 0.899 0.790 0.763 0.926 ckpt log onnx
RTMO-l 640x640 0.748 0.911 0.813 0.786 0.939 ckpt log onnx

Topdown Heatmap + Hrnet + Aic on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}

MMPose supports training model with combined datasets. coco-aic-merge and coco-aic-combine are two examples.

  • coco-aic-merge leverages AIC data with partial keypoints as auxiliary data to train a COCO model

  • coco-aic-combine constructs a combined dataset whose keypoints are the union of COCO and AIC keypoints to train a model that predicts keypoints of both datasets.

Evaluation results on COCO val2017 of models trained with solely COCO dataset and combined dataset as shown below. These models are evaluated with detector having human AP of 56.4 on COCO val2017 dataset.

Train Set Arch Input Size AP AP50 AP75 AR AR50 ckpt log
coco pose_hrnet_w32 256x192 0.749 0.906 0.821 0.804 0.945 ckpt log
coco-aic-merge pose_hrnet_w32 256x192 0.756 0.907 0.828 0.809 0.944 ckpt log
coco-aic-combine pose_hrnet_w32 256x192 0.755 0.904 0.825 0.807 0.942 ckpt log

Topdown Heatmap + Hrnet on Aic

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}

Results on AIC val set with ground-truth bounding boxes

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32 256x192 0.323 0.761 0.218 0.366 0.789 ckpt log

Topdown Heatmap + Resnet on Aic

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}

Results on AIC val set with ground-truth bounding boxes

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_101 256x192 0.294 0.736 0.172 0.337 0.762 ckpt log



InterHand2.6M (ECCV’2020)


Internet + Internet on Interhand3d

InterNet (ECCV'2020)
@InProceedings{Moon_2020_ECCV_InterHand2.6M,
author = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu},
title = {InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image},
booktitle = {European Conference on Computer Vision (ECCV)},
year = {2020}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
InterHand2.6M (ECCV'2020)
@InProceedings{Moon_2020_ECCV_InterHand2.6M,
author = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu},
title = {InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image},
booktitle = {European Conference on Computer Vision (ECCV)},
year = {2020}
}

Results on InterHand2.6M val & test set

Train Set Set Arch Input Size MPJPE-single MPJPE-interacting MPJPE-all MRRPE APh ckpt log
All test(H+M) InterNet_resnet_50 256x256 9.69 13.72 11.86 29.27 0.99 ckpt log
All val(M) InterNet_resnet_50 256x256 11.30 15.57 13.36 32.15 0.98 ckpt log
All test(H+M) InterNet_resnet_50* 256x256 9.47 13.40 11.59 29.28 0.99 ckpt log
All val(M) InterNet_resnet_50* 256x256 11.22 15.23 13.16 31.73 0.98 ckpt log

Models with * are trained in MMPose 0.x. The checkpoints and logs are only for validation.




Human-Art (CVPR’2023)


Topdown Heatmap + Hrnet on Humanart

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
Human-Art (CVPR'2023)
@inproceedings{ju2023humanart,
    title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
    author={Ju, Xuan and Zeng, Ailing and Jianan, Wang and Qiang, Xu and Lei, Zhang},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR),
    year={2023}}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32-coco 256x192 0.252 0.397 0.255 0.321 0.485 ckpt log
pose_hrnet_w32-humanart-coco 256x192 0.399 0.545 0.420 0.466 0.613 ckpt log
pose_hrnet_w48-coco 256x192 0.271 0.413 0.277 0.339 0.499 ckpt log
pose_hrnet_w48-humanart-coco 256x192 0.417 0.553 0.442 0.481 0.617 ckpt log

Results on Human-Art validation dataset with ground-truth bounding-box

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32-coco 256x192 0.533 0.771 0.562 0.574 0.792 ckpt log
pose_hrnet_w32-humanart-coco 256x192 0.754 0.906 0.812 0.783 0.916 ckpt log
pose_hrnet_w48-coco 256x192 0.557 0.782 0.593 0.595 0.804 ckpt log
pose_hrnet_w48-humanart-coco 256x192 0.769 0.906 0.825 0.796 0.919 ckpt log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32-coco 256x192 0.749 0.906 0.821 0.804 0.945 ckpt log
pose_hrnet_w32-humanart-coco 256x192 0.741 0.902 0.814 0.795 0.941 ckpt log
pose_hrnet_w48-coco 256x192 0.756 0.908 0.826 0.809 0.945 ckpt log
pose_hrnet_w48-humanart-coco 256x192 0.751 0.905 0.822 0.805 0.943 ckpt log

Topdown Heatmap + Vitpose on Humanart

To utilize ViTPose, you’ll need to have MMPreTrain. To install the required version, run the following command:

mim install 'mmpretrain>=1.0.0'
ViTPose (NeurIPS'2022)
@inproceedings{
  xu2022vitpose,
  title={Vi{TP}ose: Simple Vision Transformer Baselines for Human Pose Estimation},
  author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao},
  booktitle={Advances in Neural Information Processing Systems},
  year={2022},
}
COCO-WholeBody (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}
Human-Art (CVPR'2023)
@inproceedings{ju2023humanart,
    title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
    author={Ju, Xuan and Zeng, Ailing and Jianan, Wang and Qiang, Xu and Lei, Zhang},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR),
    year={2023}}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
ViTPose-S-coco 256x192 0.228 0.371 0.229 0.298 0.467 ckpt log
ViTPose-S-humanart-coco 256x192 0.381 0.532 0.405 0.448 0.602 ckpt log
ViTPose-B-coco 256x192 0.270 0.423 0.272 0.340 0.510 ckpt log
ViTPose-B-humanart-coco 256x192 0.410 0.549 0.434 0.475 0.615 ckpt log
ViTPose-L-coco 256x192 0.342 0.498 0.357 0.413 0.577 ckpt log
ViTPose-L-humanart-coco 256x192 0.459 0.592 0.487 0.525 0.656 ckpt log
ViTPose-H-coco 256x192 0.377 0.541 0.391 0.447 0.615 ckpt log
ViTPose-H-humanart-coco 256x192 0.468 0.594 0.498 0.534 0.655 ckpt log

Results on Human-Art validation dataset with ground-truth bounding-box

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
ViTPose-S-coco 256x192 0.507 0.758 0.531 0.551 0.780 ckpt log
ViTPose-S-humanart-coco 256x192 0.738 0.905 0.802 0.768 0.911 ckpt log
ViTPose-B-coco 256x192 0.555 0.782 0.590 0.599 0.809 ckpt log
ViTPose-B-humanart-coco 256x192 0.759 0.905 0.823 0.790 0.917 ckpt log
ViTPose-L-coco 256x192 0.637 0.838 0.689 0.677 0.859 ckpt log
ViTPose-L-humanart-coco 256x192 0.789 0.916 0.845 0.819 0.929 ckpt log
ViTPose-H-coco 256x192 0.665 0.860 0.715 0.701 0.871 ckpt log
ViTPose-H-humanart-coco 256x192 0.800 0.926 0.855 0.828 0.933 ckpt log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
ViTPose-S-coco 256x192 0.739 0.903 0.816 0.792 0.942 ckpt log
ViTPose-S-humanart-coco 256x192 0.737 0.902 0.811 0.792 0.942 ckpt log
ViTPose-B-coco 256x192 0.757 0.905 0.829 0.810 0.946 ckpt log
ViTPose-B-humanart-coco 256x192 0.758 0.906 0.829 0.812 0.946 ckpt log
ViTPose-L-coco 256x192 0.782 0.914 0.850 0.834 0.952 ckpt log
ViTPose-L-humanart-coco 256x192 0.782 0.914 0.849 0.835 0.953 ckpt log
ViTPose-H-coco 256x192 0.788 0.917 0.855 0.839 0.954 ckpt log
ViTPose-H-humanart-coco 256x192 0.788 0.914 0.853 0.841 0.956 ckpt log

Rtmpose + Rtmpose on Humanart

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
Human-Art (CVPR'2023)
@inproceedings{ju2023humanart,
    title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
    author={Ju, Xuan and Zeng, Ailing and Jianan, Wang and Qiang, Xu and Lei, Zhang},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR),
    year={2023}}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-coco 256x192 0.161 0.283 0.154 0.221 0.373 ckpt log
rtmpose-t-humanart-coco 256x192 0.249 0.395 0.256 0.323 0.485 ckpt log
rtmpose-s-coco 256x192 0.199 0.328 0.198 0.261 0.418 ckpt log
rtmpose-s-humanart-coco 256x192 0.311 0.462 0.323 0.381 0.540 ckpt log
rtmpose-m-coco 256x192 0.239 0.372 0.243 0.302 0.455 ckpt log
rtmpose-m-humanart-coco 256x192 0.355 0.503 0.377 0.417 0.568 ckpt log
rtmpose-l-coco 256x192 0.260 0.393 0.267 0.323 0.472 ckpt log
rtmpose-l-humanart-coco 256x192 0.378 0.521 0.399 0.442 0.584 ckpt log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-coco 256x192 0.444 0.725 0.453 0.488 0.750 ckpt log
rtmpose-t-humanart-coco 256x192 0.655 0.872 0.720 0.693 0.890 ckpt log
rtmpose-s-coco 256x192 0.480 0.739 0.498 0.521 0.763 ckpt log
rtmpose-s-humanart-coco 256x192 0.698 0.893 0.768 0.732 0.903 ckpt log
rtmpose-m-coco 256x192 0.532 0.765 0.563 0.571 0.789 ckpt log
rtmpose-m-humanart-coco 256x192 0.728 0.895 0.791 0.759 0.906 ckpt log
rtmpose-l-coco 256x192 0.564 0.789 0.602 0.599 0.808 ckpt log
rtmpose-l-humanart-coco 256x192 0.753 0.905 0.812 0.783 0.915 ckpt log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-coco 256x192 0.682 0.883 0.759 0.736 0.920 ckpt log
rtmpose-t-humanart-coco 256x192 0.665 0.875 0.739 0.721 0.916 ckpt log
rtmpose-s-coco 256x192 0.716 0.892 0.789 0.768 0.929 ckpt log
rtmpose-s-humanart-coco 256x192 0.706 0.888 0.780 0.759 0.928 ckpt log
rtmpose-m-coco 256x192 0.746 0.899 0.817 0.795 0.935 ckpt log
rtmpose-m-humanart-coco 256x192 0.725 0.892 0.795 0.775 0.929 ckpt log
rtmpose-l-coco 256x192 0.758 0.906 0.826 0.806 0.942 ckpt log
rtmpose-l-humanart-coco 256x192 0.748 0.901 0.816 0.796 0.938 ckpt log

Results on COCO val2017 with ground-truth bounding box

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-humanart-coco 256x192 0.679 0.895 0.755 0.710 0.907 ckpt log
rtmpose-s-humanart-coco 256x192 0.725 0.916 0.798 0.753 0.925 ckpt log
rtmpose-m-humanart-coco 256x192 0.744 0.916 0.818 0.770 0.930 ckpt log
rtmpose-l-humanart-coco 256x192 0.770 0.927 0.840 0.794 0.939 ckpt log



COFW (ICCV’2013)


Topdown Heatmap + Hrnetv2 on Cofw

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
COFW (ICCV'2013)
@inproceedings{burgos2013robust,
  title={Robust face landmark estimation under occlusion},
  author={Burgos-Artizzu, Xavier P and Perona, Pietro and Doll{\'a}r, Piotr},
  booktitle={Proceedings of the IEEE international conference on computer vision},
  pages={1513--1520},
  year={2013}
}

Results on COFW dataset

The model is trained on COFW train.

Arch Input Size NME ckpt log
pose_hrnetv2_w18 256x256 3.48 ckpt log



MPII (CVPR’2014)


Rtmo + Rtmo on Body7

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}
Halpe (CVPR'2020)
@inproceedings{li2020pastanet,
  title={PaStaNet: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={CVPR},
  year={2020}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log onnx
RTMO-t 640x640 0.574 0.803 0.613 0.611 0.836 ckpt log onnx
RTMO-s 640x640 0.686 0.879 0.744 0.723 0.908 ckpt log onnx
RTMO-m 640x640 0.726 0.899 0.790 0.763 0.926 ckpt log onnx
RTMO-l 640x640 0.748 0.911 0.813 0.786 0.939 ckpt log onnx

Topdown Regression + Resnet + Rle on Mpii

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
RLE (ICCV'2021)
@inproceedings{li2021human,
  title={Human pose regression with residual log-likelihood estimation},
  author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={11025--11034},
  year={2021}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
deeppose_resnet_50_rle 256x256 0.861 0.277 ckpt log

Topdown Regression + Resnet on Mpii

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
deeppose_resnet_50 256x256 0.826 0.180 ckpt log
deeppose_resnet_101 256x256 0.841 0.200 ckpt log
deeppose_resnet_152 256x256 0.850 0.208 ckpt log

Topdown Heatmap + Resnext on Mpii

ResNext (CVPR'2017)
@inproceedings{xie2017aggregated,
  title={Aggregated residual transformations for deep neural networks},
  author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1492--1500},
  year={2017}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_resnext_152 256x256 0.887 0.294 ckpt log

Topdown Heatmap + Hrnet + Dark on Mpii

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_hrnet_w32_dark 256x256 0.904 0.354 ckpt log
pose_hrnet_w48_dark 256x256 0.905 0.360 ckpt log

Topdown Heatmap + Resnetv1d on Mpii

ResNetV1D (CVPR'2019)
@inproceedings{he2019bag,
  title={Bag of tricks for image classification with convolutional neural networks},
  author={He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={558--567},
  year={2019}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_resnetv1d_50 256x256 0.881 0.290 ckpt log
pose_resnetv1d_101 256x256 0.883 0.295 ckpt log
pose_resnetv1d_152 256x256 0.888 0.300 ckpt log

Topdown Heatmap + Shufflenetv2 on Mpii

ShufflenetV2 (ECCV'2018)
@inproceedings{ma2018shufflenet,
  title={Shufflenet v2: Practical guidelines for efficient cnn architecture design},
  author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={116--131},
  year={2018}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_shufflenetv2 256x256 0.828 0.205 ckpt log

Topdown Heatmap + Scnet on Mpii

SCNet (CVPR'2020)
@inproceedings{liu2020improving,
  title={Improving Convolutional Networks with Self-Calibrated Convolutions},
  author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10096--10105},
  year={2020}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_scnet_50 256x256 0.888 0.290 ckpt log
pose_scnet_101 256x256 0.887 0.293 ckpt log

Topdown Heatmap + Hrnet on Mpii

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_hrnet_w32 256x256 0.900 0.334 ckpt log
pose_hrnet_w48 256x256 0.901 0.337 ckpt log

Topdown Heatmap + Mobilenetv2 on Mpii

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_mobilenetv2 256x256 0.854 0.234 ckpt log

Topdown Heatmap + Shufflenetv1 on Mpii

ShufflenetV1 (CVPR'2018)
@inproceedings{zhang2018shufflenet,
  title={Shufflenet: An extremely efficient convolutional neural network for mobile devices},
  author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={6848--6856},
  year={2018}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_shufflenetv1 256x256 0.824 0.195 ckpt log

Topdown Heatmap + CPM on Mpii

CPM (CVPR'2016)
@inproceedings{wei2016convolutional,
  title={Convolutional pose machines},
  author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
  pages={4724--4732},
  year={2016}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
cpm 368x368 0.876 0.285 ckpt log

Topdown Heatmap + Resnet on Mpii

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_resnet_50 256x256 0.882 0.286 ckpt log
pose_resnet_101 256x256 0.888 0.290 ckpt log
pose_resnet_152 256x256 0.889 0.303 ckpt log

Topdown Heatmap + Cspnext + Udp on Mpii

RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_hrnet_w32 256x256 0.902 0.303 ckpt log

Topdown Heatmap + Seresnet on Mpii

SEResNet (CVPR'2018)
@inproceedings{hu2018squeeze,
  title={Squeeze-and-excitation networks},
  author={Hu, Jie and Shen, Li and Sun, Gang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={7132--7141},
  year={2018}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_seresnet_50 256x256 0.884 0.292 ckpt log
pose_seresnet_101 256x256 0.884 0.295 ckpt log
pose_seresnet_152* 256x256 0.884 0.287 ckpt log

Note that * means without imagenet pre-training.


Topdown Heatmap + Litehrnet on Mpii

LiteHRNet (CVPR'2021)
@inproceedings{Yulitehrnet21,
  title={Lite-HRNet: A Lightweight High-Resolution Network},
  author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  booktitle={CVPR},
  year={2021}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
LiteHRNet-18 256x256 0.859 0.260 ckpt log
LiteHRNet-30 256x256 0.869 0.271 ckpt log

Topdown Heatmap + Hourglass on Mpii

Hourglass (ECCV'2016)
@inproceedings{newell2016stacked,
  title={Stacked hourglass networks for human pose estimation},
  author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  booktitle={European conference on computer vision},
  pages={483--499},
  year={2016},
  organization={Springer}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_hourglass_52 256x256 0.889 0.317 ckpt log
pose_hourglass_52 384x384 0.894 0.367 ckpt log

Rtmpose + Rtmpose on Mpii

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}

Results on MPII val set

Arch Input Size Mean / w. flip Mean@0.1 ckpt log
rtmpose-m 256x256 0.907 0.348 ckpt log



Halpe (CVPR’2020)


Rtmo + Rtmo on Body7

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}
Halpe (CVPR'2020)
@inproceedings{li2020pastanet,
  title={PaStaNet: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={CVPR},
  year={2020}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log onnx
RTMO-t 640x640 0.574 0.803 0.613 0.611 0.836 ckpt log onnx
RTMO-s 640x640 0.686 0.879 0.744 0.723 0.908 ckpt log onnx
RTMO-m 640x640 0.726 0.899 0.790 0.763 0.926 ckpt log onnx
RTMO-l 640x640 0.748 0.911 0.813 0.786 0.939 ckpt log onnx



COCO-WholeBody-Face (ECCV’2020)


Topdown Heatmap + Hrnetv2 + Dark + Coco + Wholebody + Face on Coco_wholebody_face

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
COCO-WholeBody-Face (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Face val set

Arch Input Size NME ckpt log
pose_hrnetv2_w18_dark 256x256 0.0513 ckpt log

Topdown Heatmap + Mobilenetv2 + Coco + Wholebody + Face on Coco_wholebody_face

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
COCO-WholeBody-Face (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Face val set

Arch Input Size NME ckpt log
pose_mobilenetv2 256x256 0.0611 ckpt log

Topdown Heatmap + Hourglass + Coco + Wholebody + Face on Coco_wholebody_face

Hourglass (ECCV'2016)
@inproceedings{newell2016stacked,
  title={Stacked hourglass networks for human pose estimation},
  author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  booktitle={European conference on computer vision},
  pages={483--499},
  year={2016},
  organization={Springer}
}
COCO-WholeBody-Face (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Face val set

Arch Input Size NME ckpt log
pose_hourglass_52 256x256 0.0587 ckpt log

Topdown Heatmap + Resnet + Coco + Wholebody + Face on Coco_wholebody_face

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO-WholeBody-Face (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Face val set

Arch Input Size NME ckpt log
pose_res50 256x256 0.0582 ckpt log

Topdown Heatmap + Hrnetv2 + Coco + Wholebody + Face on Coco_wholebody_face

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
COCO-WholeBody-Face (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Face val set

Arch Input Size NME ckpt log
pose_hrnetv2_w18 256x256 0.0569 ckpt log

Topdown Heatmap + Scnet + Coco + Wholebody + Face on Coco_wholebody_face

SCNet (CVPR'2020)
@inproceedings{liu2020improving,
  title={Improving Convolutional Networks with Self-Calibrated Convolutions},
  author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10096--10105},
  year={2020}
}
COCO-WholeBody-Face (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Face val set

Arch Input Size NME ckpt log
pose_scnet_50 256x256 0.0567 ckpt log

Rtmpose + Rtmpose + Coco + Wholebody + Face on Coco_wholebody_face

RTMDet (ArXiv 2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO-WholeBody-Face (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Face val set

Arch Input Size NME ckpt log
pose_rtmpose_m 256x256 0.0466 ckpt log



DeepFashion (CVPR’2016)


Topdown Heatmap + Resnet on Deepfashion

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
DeepFashion (CVPR'2016)
@inproceedings{liuLQWTcvpr16DeepFashion,
 author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou},
 title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations},
 booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
 month = {June},
 year = {2016}
}
DeepFashion (ECCV'2016)
@inproceedings{liuYLWTeccv16FashionLandmark,
 author = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou},
 title = {Fashion Landmark Detection in the Wild},
 booktitle = {European Conference on Computer Vision (ECCV)},
 month = {October},
 year = {2016}
 }

Results on DeepFashion val set

Set Arch Input Size PCK@0.2 AUC EPE ckpt log
upper pose_resnet_50 256x192 95.4 57.8 16.8 ckpt log
lower pose_resnet_50 256x192 96.5 74.4 10.5 ckpt log
full pose_resnet_50 256x192 97.7 66.4 12.7 ckpt log

Note: Due to the time constraints, we have only trained resnet50 models. We warmly welcome any contributions if you can successfully reproduce the results from the paper!


Topdown Heatmap + Hrnet on Deepfashion

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
DeepFashion (CVPR'2016)
@inproceedings{liuLQWTcvpr16DeepFashion,
 author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou},
 title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations},
 booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
 month = {June},
 year = {2016}
}
DeepFashion (ECCV'2016)
@inproceedings{liuYLWTeccv16FashionLandmark,
 author = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou},
 title = {Fashion Landmark Detection in the Wild},
 booktitle = {European Conference on Computer Vision (ECCV)},
 month = {October},
 year = {2016}
 }

Results on DeepFashion val set

Set Arch Input Size PCK@0.2 AUC EPE ckpt log
upper pose_hrnet_w48_udp 256x192 96.1 60.9 15.1 ckpt log
lower pose_hrnet_w48_udp 256x192 97.8 76.1 8.9 ckpt log
full pose_hrnet_w48_udp 256x192 98.3 67.3 11.7 ckpt log

Note: Due to the time constraints, we have only trained resnet50 models. We warmly welcome any contributions if you can successfully reproduce the results from the paper!




PoseTrack18 (CVPR’2018)


Rtmo + Rtmo on Body7

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}
Halpe (CVPR'2020)
@inproceedings{li2020pastanet,
  title={PaStaNet: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={CVPR},
  year={2020}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log onnx
RTMO-t 640x640 0.574 0.803 0.613 0.611 0.836 ckpt log onnx
RTMO-s 640x640 0.686 0.879 0.744 0.723 0.908 ckpt log onnx
RTMO-m 640x640 0.726 0.899 0.790 0.763 0.926 ckpt log onnx
RTMO-l 640x640 0.748 0.911 0.813 0.786 0.939 ckpt log onnx

Topdown Heatmap + Resnet on Posetrack18

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}

Results on PoseTrack2018 val with ground-truth bounding boxes

Arch Input Size Head Shou Elb Wri Hip Knee Ankl Total ckpt log
pose_resnet_50 256x192 86.5 87.7 82.5 75.8 80.1 78.8 74.2 81.2 ckpt log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.


Topdown Heatmap + Hrnet on Posetrack18

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}

Results on PoseTrack2018 val with ground-truth bounding boxes

Arch Input Size Head Shou Elb Wri Hip Knee Ankl Total ckpt log
pose_hrnet_w32 256x192 86.2 89.0 84.5 79.2 82.3 82.5 78.7 83.4 ckpt log
pose_hrnet_w32 384x288 87.1 89.0 85.1 80.2 80.6 82.8 79.6 83.7 ckpt log
pose_hrnet_w48 256x192 88.3 90.2 86.0 81.0 80.7 83.3 80.6 84.6 ckpt log
pose_hrnet_w48 384x288 87.8 90.0 86.2 81.3 81.0 83.4 80.9 84.6 ckpt log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.

Results on PoseTrack2018 val with MMDetection pre-trained Cascade R-CNN (X-101-64x4d-FPN) human detector

Arch Input Size Head Shou Elb Wri Hip Knee Ankl Total ckpt log
pose_hrnet_w32 256x192 78.0 82.9 79.5 73.8 76.9 76.6 70.2 76.9 ckpt log
pose_hrnet_w32 384x288 79.9 83.6 80.4 74.5 74.8 76.1 70.5 77.3 ckpt log
pose_hrnet_w48 256x192 80.1 83.4 80.6 74.8 74.3 76.8 70.5 77.4 ckpt log
pose_hrnet_w48 384x288 80.2 83.8 80.9 75.2 74.7 76.7 71.7 77.8 ckpt log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.




JHMDB (ICCV’2013)


Rtmo + Rtmo on Body7

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}
Halpe (CVPR'2020)
@inproceedings{li2020pastanet,
  title={PaStaNet: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={CVPR},
  year={2020}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log onnx
RTMO-t 640x640 0.574 0.803 0.613 0.611 0.836 ckpt log onnx
RTMO-s 640x640 0.686 0.879 0.744 0.723 0.908 ckpt log onnx
RTMO-m 640x640 0.726 0.899 0.790 0.763 0.926 ckpt log onnx
RTMO-l 640x640 0.748 0.911 0.813 0.786 0.939 ckpt log onnx

Topdown Heatmap + Resnet on JHMDB

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}

Results on Sub-JHMDB dataset

The models are pre-trained on MPII dataset only. NO test-time augmentation (multi-scale /rotation testing) is used.

  • Normalized by Person Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 pose_resnet_50 256x256 99.1 98.0 93.8 91.3 99.4 96.5 92.8 96.1 ckpt log
Sub2 pose_resnet_50 256x256 99.3 97.1 90.6 87.0 98.9 96.3 94.1 95.0 ckpt log
Sub3 pose_resnet_50 256x256 99.0 97.9 94.0 91.6 99.7 98.0 94.7 96.7 ckpt log
Average pose_resnet_50 256x256 99.2 97.7 92.8 90.0 99.3 96.9 93.9 96.0 - -
Sub1 pose_resnet_50 (2 Deconv.) 256x256 99.1 98.5 94.6 92.0 99.4 94.6 92.5 96.1 ckpt log
Sub2 pose_resnet_50 (2 Deconv.) 256x256 99.3 97.8 91.0 87.0 99.1 96.5 93.8 95.2 ckpt log
Sub3 pose_resnet_50 (2 Deconv.) 256x256 98.8 98.4 94.3 92.1 99.8 97.5 93.8 96.7 ckpt log
Average pose_resnet_50 (2 Deconv.) 256x256 99.1 98.2 93.3 90.4 99.4 96.2 93.4 96.0 - -
  • Normalized by Torso Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 pose_resnet_50 256x256 93.3 83.2 74.4 72.7 85.0 81.2 78.9 81.9 ckpt log
Sub2 pose_resnet_50 256x256 94.1 74.9 64.5 62.5 77.9 71.9 78.6 75.5 ckpt log
Sub3 pose_resnet_50 256x256 97.0 82.2 74.9 70.7 84.7 83.7 84.2 82.9 ckpt log
Average pose_resnet_50 256x256 94.8 80.1 71.3 68.6 82.5 78.9 80.6 80.1 - -
Sub1 pose_resnet_50 (2 Deconv.) 256x256 92.4 80.6 73.2 70.5 82.3 75.4 75.0 79.2 ckpt log
Sub2 pose_resnet_50 (2 Deconv.) 256x256 93.4 73.6 63.8 60.5 75.1 68.4 75.5 73.7 ckpt log
Sub3 pose_resnet_50 (2 Deconv.) 256x256 96.1 81.2 72.6 67.9 83.6 80.9 81.5 81.2 ckpt log
Average pose_resnet_50 (2 Deconv.) 256x256 94.0 78.5 69.9 66.3 80.3 74.9 77.3 78.0 - -

Topdown Heatmap + CPM on JHMDB

CPM (CVPR'2016)
@inproceedings{wei2016convolutional,
  title={Convolutional pose machines},
  author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
  pages={4724--4732},
  year={2016}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}

Results on Sub-JHMDB dataset

The models are pre-trained on MPII dataset only. NO test-time augmentation (multi-scale /rotation testing) is used.

  • Normalized by Person Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 cpm 368x368 96.1 91.9 81.0 78.9 96.6 90.8 87.3 89.5 ckpt log
Sub2 cpm 368x368 98.1 93.6 77.1 70.9 94.0 89.1 84.7 87.4 ckpt log
Sub3 cpm 368x368 97.9 94.9 87.3 84.0 98.6 94.4 86.2 92.4 ckpt log
Average cpm 368x368 97.4 93.5 81.5 77.9 96.4 91.4 86.1 89.8 - -
  • Normalized by Torso Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 cpm 368x368 89.0 63.0 54.0 54.9 68.2 63.1 61.2 66.0 ckpt log
Sub2 cpm 368x368 90.3 57.9 46.8 44.3 60.8 58.2 62.4 61.1 ckpt log
Sub3 cpm 368x368 91.0 72.6 59.9 54.0 73.2 68.5 65.8 70.3 ckpt log
Average cpm 368x368 90.1 64.5 53.6 51.1 67.4 63.3 63.1 65.7 - -



WFLW (CVPR’2018)


Topdown Regression + Resnet on WFLW

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
WFLW (CVPR'2018)
@inproceedings{wu2018look,
  title={Look at boundary: A boundary-aware face alignment algorithm},
  author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2129--2138},
  year={2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Model Input Size NME ckpt log
ResNet-50 256x256 4.88 ckpt log

Topdown Regression + Resnet + Softwingloss on WFLW

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
SoftWingloss (TIP'2021)
@article{lin2021structure,
  title={Structure-Coherent Deep Feature Learning for Robust Face Alignment},
  author={Lin, Chunze and Zhu, Beier and Wang, Quan and Liao, Renjie and Qian, Chen and Lu, Jiwen and Zhou, Jie},
  journal={IEEE Transactions on Image Processing},
  year={2021},
  publisher={IEEE}
}
WFLW (CVPR'2018)
@inproceedings{wu2018look,
  title={Look at boundary: A boundary-aware face alignment algorithm},
  author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2129--2138},
  year={2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Model Input Size NME ckpt log
ResNet-50+SoftWingLoss 256x256 4.44 ckpt log

Topdown Regression + Resnet + Wingloss on WFLW

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
Wingloss (CVPR'2018)
@inproceedings{feng2018wing,
  title={Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural Networks},
  author={Feng, Zhen-Hua and Kittler, Josef and Awais, Muhammad and Huber, Patrik and Wu, Xiao-Jun},
  booktitle={Computer Vision and Pattern Recognition (CVPR), 2018 IEEE Conference on},
  year={2018},
  pages ={2235-2245},
  organization={IEEE}
}
WFLW (CVPR'2018)
@inproceedings{wu2018look,
  title={Look at boundary: A boundary-aware face alignment algorithm},
  author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2129--2138},
  year={2018}
}

Results on WFLW dataset

The model is trained on WFLW train set.

Model Input Size NME ckpt log
ResNet-50+WingLoss 256x256 4.67 ckpt log

Topdown Heatmap + Hrnetv2 on WFLW

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
WFLW (CVPR'2018)
@inproceedings{wu2018look,
  title={Look at boundary: A boundary-aware face alignment algorithm},
  author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2129--2138},
  year={2018}
}

Results on WFLW dataset

The model is trained on WFLW train.

Arch Input Size NMEtest NMEpose NMEillumination NMEocclusion NMEblur NMEmakeup NMEexpression ckpt log
pose_hrnetv2_w18 256x256 4.06 6.97 3.99 4.83 4.58 3.94 4.33 ckpt log

Topdown Heatmap + Hrnetv2 + Dark on WFLW

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
WFLW (CVPR'2018)
@inproceedings{wu2018look,
  title={Look at boundary: A boundary-aware face alignment algorithm},
  author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2129--2138},
  year={2018}
}

Results on WFLW dataset

The model is trained on WFLW train.

Arch Input Size NMEtest NMEpose NMEillumination NMEocclusion NMEblur NMEmakeup NMEexpression ckpt log
pose_hrnetv2_w18_dark 256x256 3.98 6.98 3.96 4.78 4.56 3.89 4.29 ckpt log

Topdown Heatmap + Hrnetv2 + Awing on WFLW

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
AdaptiveWingloss (ICCV'2019)
@inproceedings{wang2019adaptive,
  title={Adaptive wing loss for robust face alignment via heatmap regression},
  author={Wang, Xinyao and Bo, Liefeng and Fuxin, Li},
  booktitle={Proceedings of the IEEE/CVF international conference on computer vision},
  pages={6971--6981},
  year={2019}
}
WFLW (CVPR'2018)
@inproceedings{wu2018look,
  title={Look at boundary: A boundary-aware face alignment algorithm},
  author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2129--2138},
  year={2018}
}

Results on WFLW dataset

The model is trained on WFLW train.

Arch Input Size NMEtest NMEpose NMEillumination NMEocclusion NMEblur NMEmakeup NMEexpression ckpt log
pose_hrnetv2_w18_awing 256x256 4.02 6.94 3.97 4.78 4.59 3.87 4.28 ckpt log

Rtmpose + Rtmpose on WFLW

RTMDet (ArXiv 2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
WFLW (CVPR'2018)
@inproceedings{wu2018look,
  title={Look at boundary: A boundary-aware face alignment algorithm},
  author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2129--2138},
  year={2018}
}

Results on WFLW dataset

The model is trained on WFLW train.

Arch Input Size NME ckpt log
pose_rtmpose_m 256x256 4.01 ckpt log



Animal-Pose (ICCV’2019)


Topdown Heatmap + Resnet on Animalpose

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
Animal-Pose (ICCV'2019)
@InProceedings{Cao_2019_ICCV,
    author = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing},
    title = {Cross-Domain Adaptation for Animal Pose Estimation},
    booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
    month = {October},
    year = {2019}
}

Results on AnimalPose validation set (1117 instances)

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50 256x256 0.691 0.947 0.770 0.736 0.955 ckpt log
pose_resnet_101 256x256 0.696 0.948 0.774 0.736 0.951 ckpt log
pose_resnet_152 256x256 0.704 0.938 0.786 0.748 0.946 ckpt log

Topdown Heatmap + Hrnet on Animalpose

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
Animal-Pose (ICCV'2019)
@InProceedings{Cao_2019_ICCV,
    author = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing},
    title = {Cross-Domain Adaptation for Animal Pose Estimation},
    booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
    month = {October},
    year = {2019}
}

Results on AnimalPose validation set (1117 instances)

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32 256x256 0.740 0.959 0.833 0.780 0.965 ckpt log
pose_hrnet_w48 256x256 0.738 0.958 0.831 0.778 0.962 ckpt log



FreiHand (ICCV’2019)


Topdown Heatmap + Resnet on Freihand2d

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
FreiHand (ICCV'2019)
@inproceedings{zimmermann2019freihand,
  title={Freihand: A dataset for markerless capture of hand pose and shape from single rgb images},
  author={Zimmermann, Christian and Ceylan, Duygu and Yang, Jimei and Russell, Bryan and Argus, Max and Brox, Thomas},
  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
  pages={813--822},
  year={2019}
}

Results on FreiHand val & test set

Set Arch Input Size PCK@0.2 AUC EPE ckpt log
test pose_resnet_50 224x224 0.999 0.868 3.27 ckpt log



OneHand10K (TCSVT’2019)


Topdown Regression + Resnet on Onehand10k

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
OneHand10K (TCSVT'2019)
@article{wang2018mask,
  title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
  author={Wang, Yangang and Peng, Cong and Liu, Yebin},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  volume={29},
  number={11},
  pages={3258--3268},
  year={2018},
  publisher={IEEE}
}

Results on OneHand10K val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
deeppose_resnet_50 256x256 0.990 0.485 34.21 ckpt log

Topdown Heatmap + Hrnetv2 + Udp on Onehand10k

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
OneHand10K (TCSVT'2019)
@article{wang2018mask,
  title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
  author={Wang, Yangang and Peng, Cong and Liu, Yebin},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  volume={29},
  number={11},
  pages={3258--3268},
  year={2018},
  publisher={IEEE}
}

Results on OneHand10K val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hrnetv2_w18_udp 256x256 0.990 0.571 23.88 ckpt log

Topdown Heatmap + Mobilenetv2 on Onehand10k

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
OneHand10K (TCSVT'2019)
@article{wang2018mask,
  title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
  author={Wang, Yangang and Peng, Cong and Liu, Yebin},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  volume={29},
  number={11},
  pages={3258--3268},
  year={2018},
  publisher={IEEE}
}

Results on OneHand10K val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_mobilenet_v2 256x256 0.986 0.537 28.56 ckpt log

Topdown Heatmap + Hrnetv2 on Onehand10k

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
OneHand10K (TCSVT'2019)
@article{wang2018mask,
  title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
  author={Wang, Yangang and Peng, Cong and Liu, Yebin},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  volume={29},
  number={11},
  pages={3258--3268},
  year={2018},
  publisher={IEEE}
}

Results on OneHand10K val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hrnetv2_w18 256x256 0.990 0.567 24.26 ckpt log

Topdown Heatmap + Resnet on Onehand10k

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
OneHand10K (TCSVT'2019)
@article{wang2018mask,
  title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
  author={Wang, Yangang and Peng, Cong and Liu, Yebin},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  volume={29},
  number={11},
  pages={3258--3268},
  year={2018},
  publisher={IEEE}
}

Results on OneHand10K val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_resnet_50 256x256 0.989 0.555 25.16 ckpt log

Topdown Heatmap + Hrnetv2 + Dark on Onehand10k

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
OneHand10K (TCSVT'2019)
@article{wang2018mask,
  title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
  author={Wang, Yangang and Peng, Cong and Liu, Yebin},
  journal={IEEE Transactions on Circuits and Systems for Video Technology},
  volume={29},
  number={11},
  pages={3258--3268},
  year={2018},
  publisher={IEEE}
}

Results on OneHand10K val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hrnetv2_w18_dark 256x256 0.990 0.572 23.96 ckpt log



UBody (CVPR’2023)


Topdown Heatmap + Hrnet + Ubody-Coco-Wholebody on Ubody2d

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
UBody (CVPR'2023)
@article{lin2023one,
  title={One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer},
  author={Lin, Jing and Zeng, Ailing and Wang, Haoqian and Zhang, Lei and Li, Yu},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2023},
}

Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size Body AP Body AR Foot AP Foot AR Face AP Face AR Hand AP Hand AR Whole AP Whole AR ckpt log
pose_hrnet_w32 256x192 0.685 0.759 0.564 0.675 0.625 0.705 0.516 0.609 0.549 0.646 ckpt log



COCO-WholeBody-Hand (ECCV’2020)


Topdown Heatmap + Resnet + Coco + Wholebody + Hand on Coco_wholebody_hand

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_resnet_50 256x256 0.800 0.833 4.64 ckpt log

Topdown Heatmap + Scnet + Coco + Wholebody + Hand on Coco_wholebody_hand

SCNet (CVPR'2020)
@inproceedings{liu2020improving,
  title={Improving Convolutional Networks with Self-Calibrated Convolutions},
  author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10096--10105},
  year={2020}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_scnet_50 256x256 0.803 0.834 4.55 ckpt log

Topdown Heatmap + Mobilenetv2 + Coco + Wholebody + Hand on Coco_wholebody_hand

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_mobilenetv2 256x256 0.795 0.829 4.77 ckpt log

Topdown Heatmap + Hrnetv2 + Coco + Wholebody + Hand on Coco_wholebody_hand

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hrnetv2_w18 256x256 0.813 0.840 4.39 ckpt log

Topdown Heatmap + Hrnetv2 + Dark + Coco + Wholebody + Hand on Coco_wholebody_hand

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hrnetv2_w18_dark 256x256 0.814 0.840 4.37 ckpt log

Topdown Heatmap + Hourglass + Coco + Wholebody + Hand on Coco_wholebody_hand

Hourglass (ECCV'2016)
@inproceedings{newell2016stacked,
  title={Stacked hourglass networks for human pose estimation},
  author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  booktitle={European conference on computer vision},
  pages={483--499},
  year={2016},
  organization={Springer}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hourglass_52 256x256 0.804 0.835 4.54 ckpt log

Topdown Heatmap + Litehrnet + Coco + Wholebody + Hand on Coco_wholebody_hand

LiteHRNet (CVPR'2021)
@inproceedings{Yulitehrnet21,
  title={Lite-HRNet: A Lightweight High-Resolution Network},
  author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  booktitle={CVPR},
  year={2021}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
LiteHRNet-18 256x256 0.795 0.830 4.77 ckpt log

Rtmpose + Rtmpose + Coco + Wholebody + Hand on Coco_wholebody_hand

RTMDet (ArXiv 2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO-WholeBody-Hand (ECCV'2020)
@inproceedings{jin2020whole,
  title={Whole-Body Human Pose Estimation in the Wild},
  author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  year={2020}
}

Results on COCO-WholeBody-Hand val set

Arch Input Size PCK@0.2 AUC EPE ckpt log
rtmpose_m 256x256 0.815 0.837 4.51 ckpt log



COCO (ECCV’2014)


Rtmo + Rtmo on Body7

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  journal={arXiv preprint arXiv:1812.00324},
  year={2018}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conf. on Computer Vision (ICCV)},
  month = Dec,
  pages = {3192-3199},
  year = {2013}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = {June}
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={Posetrack: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}
Halpe (CVPR'2020)
@inproceedings{li2020pastanet,
  title={PaStaNet: Toward Human Activity Knowledge Engine},
  author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
  booktitle={CVPR},
  year={2020}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log onnx
RTMO-t 640x640 0.574 0.803 0.613 0.611 0.836 ckpt log onnx
RTMO-s 640x640 0.686 0.879 0.744 0.723 0.908 ckpt log onnx
RTMO-m 640x640 0.726 0.899 0.790 0.763 0.926 ckpt log onnx
RTMO-l 640x640 0.748 0.911 0.813 0.786 0.939 ckpt log onnx

Rtmo + Rtmo on Coco

RTMO
@misc{lu2023rtmo,
      title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
      author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
      year={2023},
      eprint={2312.07526},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
RTMO-s 640x640 0.677 0.878 0.737 0.715 0.908 ckpt log
RTMO-m 640x640 0.709 0.890 0.778 0.747 0.920 ckpt log
RTMO-l 640x640 0.724 0.899 0.788 0.762 0.927 ckpt log

Integral Regression + Resnet + Debias on Coco

Debias IPR (ICCV'2021)
@inproceedings{gu2021removing,
    title={Removing the Bias of Integral Pose Regression},
    author={Gu, Kerui and Yang, Linlin and Yao, Angela},
    booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
    pages={11067--11076},
    year={2021}
  }
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
debias-ipr_resnet_50 256x256 0.675 0.872 0.740 0.765 0.928 ckpt log

Integral Regression + Resnet + DSNT on Coco

DSNT (2018)
@article{nibali2018numerical,
  title={Numerical Coordinate Regression with Convolutional Neural Networks},
  author={Nibali, Aiden and He, Zhen and Morgan, Stuart and Prendergast, Luke},
  journal={arXiv preprint arXiv:1801.07372},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
ipr_resnet_50_dsnt 256x256 0.674 0.870 0.744 0.764 0.928 ckpt log

Integral Regression + Resnet + Ipr on Coco

IPR (ECCV'2018)
@inproceedings{sun2018integral,
  title={Integral human pose regression},
  author={Sun, Xiao and Xiao, Bin and Wei, Fangyin and Liang, Shuang and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={529--545},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
ipr_resnet_50 256x256 0.633 0.860 0.703 0.730 0.919 ckpt log

Edpose + Edpose on Coco

ED-Pose (ICLR'2023)
@inproceedings{
yang2023explicit,
title={Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation},
author={Jie Yang and Ailing Zeng and Shilong Liu and Feng Li and Ruimao Zhang and Lei Zhang},
booktitle={International Conference on Learning Representations},
year={2023},
url={https://openreview.net/forum?id=s4WVupnJjmX}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017.

Arch BackBone AP AP50 AP75 AR AR50 ckpt log
edpose_res50_coco ResNet-50 0.716 0.897 0.783 0.793 0.943 ckpt log

The checkpoint is converted from the official repo. The training of EDPose is not supported yet. It will be supported in the future updates.

The above config follows Pure Python style. Please install mmengine>=0.8.2 to use this config.


Topdown Regression + Mobilenetv2 + Rle on Coco

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
RLE (ICCV'2021)
@inproceedings{li2021human,
  title={Human pose regression with residual log-likelihood estimation},
  author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={11025--11034},
  year={2021}
}
MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
deeppose_mobilenetv2_rle_pretrained 256x192 0.593 0.836 0.660 0.644 0.877 ckpt log

Topdown Regression + Resnet on Coco

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
deeppose_resnet_50 256x192 0.541 0.824 0.601 0.649 0.893 ckpt log
deeppose_resnet_101 256x192 0.562 0.831 0.629 0.670 0.900 ckpt log
deeppose_resnet_152 256x192 0.584 0.842 0.659 0.688 0.907 ckpt log

Topdown Regression + Resnet + Rle on Coco

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
RLE (ICCV'2021)
@inproceedings{li2021human,
  title={Human pose regression with residual log-likelihood estimation},
  author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={11025--11034},
  year={2021}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
deeppose_resnet_50_rle 256x192 0.706 0.888 0.776 0.753 0.924 ckpt log
deeppose_resnet_50_rle_pretrained 256x192 0.719 0.891 0.788 0.764 0.925 ckpt log
deeppose_resnet_101_rle 256x192 0.722 0.894 0.794 0.768 0.930 ckpt log
deeppose_resnet_152_rle 256x192 0.731 0.897 0.805 0.777 0.933 ckpt log
deeppose_resnet_152_rle 384x288 0.749 0.901 0.815 0.793 0.935 ckpt log

Dekr + Hrnet on Coco

DEKR (CVPR'2021)
@inproceedings{geng2021bottom,
  title={Bottom-up human pose estimation via disentangled keypoint regression},
  author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={14676--14686},
  year={2021}
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.686 0.868 0.750 0.735 0.898 ckpt log
HRNet-w48 640x640 0.714 0.883 0.777 0.762 0.915 ckpt log

Yoloxpose + Yoloxpose on Coco

YOLO-Pose (CVPRW'2022)
@inproceedings{maji2022yolo,
  title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss},
  author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={2637--2646},
  year={2022}
}
YOLOX
@article{ge2021yolox,
  title={Yolox: Exceeding yolo series in 2021},
  author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
  journal={arXiv preprint arXiv:2107.08430},
  year={2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
yoloxpose_tiny 416x416 0.526 0.793 0.556 0.571 0.833 ckpt log
yoloxpose_s 640x640 0.641 0.872 0.702 0.682 0.902 ckpt log
yoloxpose_m 640x640 0.695 0.899 0.766 0.733 0.926 ckpt log
yoloxpose_l 640x640 0.712 0.901 0.782 0.749 0.926 ckpt log

Simcc + Vipnas on Coco

SimCC (ECCV'2022)
@misc{https://doi.org/10.48550/arxiv.2107.03332,
  title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation},
  author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
  year={2021}
}
ViPNAS (CVPR'2021)
@article{xu2021vipnas,
  title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search},
  author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  year={2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
simcc_S-ViPNAS-MobileNetV3 256x192 0.695 0.883 0.772 0.755 0.927 ckpt log

Simcc + Mobilenetv2 on Coco

SimCC (ECCV'2022)
@misc{https://doi.org/10.48550/arxiv.2107.03332,
  title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation},
  author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
  year={2021}
}
MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
simcc_mobilenetv2_wo_deconv 256x192 0.620 0.855 0.697 0.678 0.902 ckpt log

Simcc + Resnet on Coco

SimCC (ECCV'2022)
@misc{https://doi.org/10.48550/arxiv.2107.03332,
  title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation},
  author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
  year={2021}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
simcc_resnet_50 256x192 0.721 0.897 0.798 0.781 0.937 ckpt log
simcc_resnet_50 384x288 0.735 0.899 0.800 0.790 0.939 ckpt log

Topdown Heatmap + Hrnet + Aic on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  title={Ai challenger: A large-scale dataset for going deeper in image understanding},
  author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  journal={arXiv preprint arXiv:1711.06475},
  year={2017}
}

MMPose supports training model with combined datasets. coco-aic-merge and coco-aic-combine are two examples.

  • coco-aic-merge leverages AIC data with partial keypoints as auxiliary data to train a COCO model

  • coco-aic-combine constructs a combined dataset whose keypoints are the union of COCO and AIC keypoints to train a model that predicts keypoints of both datasets.

Evaluation results on COCO val2017 of models trained with solely COCO dataset and combined dataset as shown below. These models are evaluated with detector having human AP of 56.4 on COCO val2017 dataset.

Train Set Arch Input Size AP AP50 AP75 AR AR50 ckpt log
coco pose_hrnet_w32 256x192 0.749 0.906 0.821 0.804 0.945 ckpt log
coco-aic-merge pose_hrnet_w32 256x192 0.756 0.907 0.828 0.809 0.944 ckpt log
coco-aic-combine pose_hrnet_w32 256x192 0.755 0.904 0.825 0.807 0.942 ckpt log

Topdown Heatmap + Hourglass on Coco

Hourglass (ECCV'2016)
@inproceedings{newell2016stacked,
  title={Stacked hourglass networks for human pose estimation},
  author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  booktitle={European conference on computer vision},
  pages={483--499},
  year={2016},
  organization={Springer}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hourglass_52 256x256 0.726 0.896 0.799 0.780 0.934 ckpt log
pose_hourglass_52 384x384 0.746 0.900 0.812 0.797 0.939 ckpt log

Topdown Heatmap + Hrformer on Coco

HRFormer (NIPS'2021)
@article{yuan2021hrformer,
  title={HRFormer: High-Resolution Vision Transformer for Dense Predict},
  author={Yuan, Yuhui and Fu, Rao and Huang, Lang and Lin, Weihong and Zhang, Chao and Chen, Xilin and Wang, Jingdong},
  journal={Advances in Neural Information Processing Systems},
  volume={34},
  year={2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrformer_small 256x192 0.738 0.904 0.812 0.793 0.941 ckpt log
pose_hrformer_small 384x288 0.757 0.905 0.824 0.807 0.941 ckpt log
pose_hrformer_base 256x192 0.754 0.906 0.827 0.807 0.943 ckpt log
pose_hrformer_base 384x288 0.774 0.909 0.842 0.823 0.945 ckpt log

Topdown Heatmap + Hrnet on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32 256x192 0.749 0.906 0.821 0.804 0.945 ckpt log
pose_hrnet_w32 384x288 0.761 0.908 0.826 0.811 0.944 ckpt log
pose_hrnet_w48 256x192 0.756 0.908 0.826 0.809 0.945 ckpt log
pose_hrnet_w48 384x288 0.767 0.911 0.832 0.817 0.947 ckpt log

Topdown Heatmap + Vipnas on Coco

ViPNAS (CVPR'2021)
@article{xu2021vipnas,
  title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search},
  author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  year={2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
S-ViPNAS-MobileNetV3 256x192 0.700 0.887 0.783 0.758 0.929 ckpt log
S-ViPNAS-Res50 256x192 0.711 0.894 0.787 0.769 0.934 ckpt log

Topdown Heatmap + Resnext on Coco

ResNext (CVPR'2017)
@inproceedings{xie2017aggregated,
  title={Aggregated residual transformations for deep neural networks},
  author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1492--1500},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnext_50 256x192 0.715 0.897 0.791 0.771 0.935 ckpt log
pose_resnext_50 384x288 0.724 0.899 0.794 0.777 0.936 ckpt log
pose_resnext_101 256x192 0.726 0.900 0.801 0.781 0.939 ckpt log
pose_resnext_101 384x288 0.744 0.903 0.815 0.794 0.939 ckpt log
pose_resnext_152 256x192 0.730 0.903 0.808 0.785 0.940 ckpt log
pose_resnext_152 384x288 0.742 0.904 0.810 0.794 0.940 ckpt log

Topdown Heatmap + CPM on Coco

CPM (CVPR'2016)
@inproceedings{wei2016convolutional,
  title={Convolutional pose machines},
  author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
  pages={4724--4732},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
cpm 256x192 0.627 0.862 0.709 0.689 0.906 ckpt log
cpm 384x288 0.652 0.865 0.730 0.710 0.907 ckpt log

Topdown Heatmap + Shufflenetv2 on Coco

ShufflenetV2 (ECCV'2018)
@inproceedings{ma2018shufflenet,
  title={Shufflenet v2: Practical guidelines for efficient cnn architecture design},
  author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={116--131},
  year={2018}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_shufflenetv2 256x192 0.602 0.857 0.672 0.668 0.902 ckpt log
pose_shufflenetv2 384x288 0.638 0.866 0.707 0.699 0.910 ckpt log

Topdown Heatmap + RSN on Coco

RSN (ECCV'2020)
@misc{cai2020learning,
    title={Learning Delicate Local Representations for Multi-Person Pose Estimation},
    author={Yuanhao Cai and Zhicheng Wang and Zhengxiong Luo and Binyi Yin and Angang Du and Haoqian Wang and Xinyu Zhou and Erjin Zhou and Xiangyu Zhang and Jian Sun},
    year={2020},
    eprint={2003.04030},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rsn_18 256x192 0.704 0.887 0.781 0.773 0.927 ckpt log
rsn_50 256x192 0.724 0.894 0.799 0.790 0.935 ckpt log
2xrsn_50 256x192 0.748 0.900 0.821 0.810 0.939 ckpt log
3xrsn_50 256x192 0.750 0.900 0.824 0.814 0.941 ckpt log

Topdown Heatmap + Hrnet + Dark on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32_dark 256x192 0.757 0.907 0.825 0.807 0.943 ckpt log
pose_hrnet_w32_dark 384x288 0.766 0.907 0.829 0.815 0.943 ckpt log
pose_hrnet_w48_dark 256x192 0.764 0.907 0.831 0.814 0.942 ckpt log
pose_hrnet_w48_dark 384x288 0.772 0.911 0.833 0.821 0.948 ckpt log

Topdown Heatmap + Resnet + Fp16 on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
FP16 (ArXiv'2017)
@article{micikevicius2017mixed,
  title={Mixed precision training},
  author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
  journal={arXiv preprint arXiv:1710.03740},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50_fp16 256x192 0.716 0.898 0.798 0.772 0.937 ckpt log

Topdown Heatmap + Resnetv1d on Coco

ResNetV1D (CVPR'2019)
@inproceedings{he2019bag,
  title={Bag of tricks for image classification with convolutional neural networks},
  author={He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={558--567},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnetv1d_50 256x192 0.722 0.897 0.796 0.777 0.936 ckpt log
pose_resnetv1d_50 384x288 0.730 0.899 0.800 0.782 0.935 ckpt log
pose_resnetv1d_101 256x192 0.732 0.901 0.808 0.785 0.940 ckpt log
pose_resnetv1d_101 384x288 0.748 0.906 0.817 0.798 0.941 ckpt log
pose_resnetv1d_152 256x192 0.737 0.904 0.814 0.790 0.940 ckpt log
pose_resnetv1d_152 384x288 0.751 0.907 0.821 0.801 0.942 ckpt log

Topdown Heatmap + Alexnet on Coco

AlexNet (NeurIPS'2012)
@inproceedings{krizhevsky2012imagenet,
  title={Imagenet classification with deep convolutional neural networks},
  author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  booktitle={Advances in neural information processing systems},
  pages={1097--1105},
  year={2012}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_alexnet 256x192 0.448 0.767 0.461 0.521 0.829 ckpt log

Topdown Heatmap + Seresnet on Coco

SEResNet (CVPR'2018)
@inproceedings{hu2018squeeze,
  title={Squeeze-and-excitation networks},
  author={Hu, Jie and Shen, Li and Sun, Gang},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={7132--7141},
  year={2018}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_seresnet_50 256x192 0.729 0.903 0.807 0.784 0.941 ckpt log
pose_seresnet_50 384x288 0.748 0.904 0.819 0.799 0.941 ckpt log
pose_seresnet_101 256x192 0.734 0.905 0.814 0.790 0.941 ckpt log
pose_seresnet_101 384x288 0.754 0.907 0.823 0.805 0.943 ckpt log
pose_seresnet_152* 256x192 0.730 0.899 0.810 0.787 0.939 ckpt log
pose_seresnet_152* 384x288 0.753 0.906 0.824 0.806 0.945 ckpt log

Note that * means without imagenet pre-training.


Topdown Heatmap + Hrnet + Fp16 on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
FP16 (ArXiv'2017)
@article{micikevicius2017mixed,
  title={Mixed precision training},
  author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
  journal={arXiv preprint arXiv:1710.03740},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32_fp16 256x192 0.749 0.907 0.822 0.802 0.946 ckpt log

Topdown Heatmap + Shufflenetv1 on Coco

ShufflenetV1 (CVPR'2018)
@inproceedings{zhang2018shufflenet,
  title={Shufflenet: An extremely efficient convolutional neural network for mobile devices},
  author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={6848--6856},
  year={2018}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_shufflenetv1 256x192 0.587 0.849 0.654 0.654 0.896 ckpt log
pose_shufflenetv1 384x288 0.626 0.862 0.696 0.687 0.903 ckpt log

Topdown Heatmap + Swin on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
Swin (ICCV'2021)
@inproceedings{liu2021swin,
  title={Swin transformer: Hierarchical vision transformer using shifted windows},
  author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={10012--10022},
  year={2021}
}
FPN (CVPR'2017)
@inproceedings{lin2017feature,
  title={Feature pyramid networks for object detection},
  author={Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={2117--2125},
  year={2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_swin_t 256x192 0.724 0.901 0.806 0.782 0.940 ckpt log
pose_swin_b 256x192 0.737 0.904 0.820 0.794 0.942 ckpt log
pose_swin_b 384x288 0.759 0.910 0.832 0.811 0.946 ckpt log
pose_swin_l 256x192 0.743 0.906 0.821 0.798 0.943 ckpt log
pose_swin_l 384x288 0.763 0.912 0.830 0.814 0.949 ckpt log

Topdown Heatmap + Cspnext + Udp on Coco

RTMDet (ArXiv 2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_cspnext_t_udp 256x192 0.665 0.874 0.723 0.723 0.917 ckpt log
pose_cspnext_s_udp 256x192 0.697 0.886 0.776 0.753 0.929 ckpt log
pose_cspnext_m_udp 256x192 0.732 0.896 0.806 0.785 0.937 ckpt log
pose_cspnext_l_udp 256x192 0.750 0.904 0.822 0.800 0.941 ckpt log
pose_cspnext_t_udp_aic_coco 256x192 0.655 0.884 0.731 0.689 0.890 ckpt log
pose_cspnext_s_udp_aic_coco 256x192 0.700 0.905 0.783 0.733 0.918 ckpt log
pose_cspnext_m_udp_aic_coco 256x192 0.748 0.925 0.818 0.777 0.933 ckpt log
pose_cspnext_l_udp_aic_coco 256x192 0.772 0.936 0.839 0.799 0.943 ckpt log

Note that, UDP also adopts the unbiased encoding/decoding algorithm of DARK.

Flip test and detector is not used in the result of aic-coco training.


Topdown Heatmap + MSPN on Coco

MSPN (ArXiv'2019)
@article{li2019rethinking,
  title={Rethinking on Multi-Stage Networks for Human Pose Estimation},
  author={Li, Wenbo and Wang, Zhicheng and Yin, Binyi and Peng, Qixiang and Du, Yuming and Xiao, Tianzi and Yu, Gang and Lu, Hongtao and Wei, Yichen and Sun, Jian},
  journal={arXiv preprint arXiv:1901.00148},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
mspn_50 256x192 0.723 0.895 0.794 0.788 0.934 ckpt log
2xmspn_50 256x192 0.754 0.903 0.826 0.816 0.942 ckpt log
3xmspn_50 256x192 0.758 0.904 0.830 0.821 0.943 ckpt log
4xmspn_50 256x192 0.765 0.906 0.835 0.826 0.943 ckpt log

Topdown Heatmap + Resnest on Coco

ResNeSt (ArXiv'2020)
@article{zhang2020resnest,
  title={ResNeSt: Split-Attention Networks},
  author={Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander},
  journal={arXiv preprint arXiv:2004.08955},
  year={2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnest_50 256x192 0.720 0.899 0.800 0.775 0.939 ckpt log
pose_resnest_50 384x288 0.737 0.900 0.811 0.789 0.937 ckpt log
pose_resnest_101 256x192 0.725 0.900 0.807 0.781 0.939 ckpt log
pose_resnest_101 384x288 0.745 0.905 0.818 0.798 0.942 ckpt log
pose_resnest_200 256x192 0.731 0.905 0.812 0.787 0.943 ckpt log
pose_resnest_200 384x288 0.753 0.907 0.827 0.805 0.943 ckpt log
pose_resnest_269 256x192 0.737 0.907 0.819 0.792 0.943 ckpt log
pose_resnest_269 384x288 0.754 0.908 0.828 0.805 0.943 ckpt log

Topdown Heatmap + Hrnet + Augmentation on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
Albumentations (Information'2020)
@article{buslaev2020albumentations,
  title={Albumentations: fast and flexible image augmentations},
  author={Buslaev, Alexander and Iglovikov, Vladimir I and Khvedchenya, Eugene and Parinov, Alex and Druzhinin, Mikhail and Kalinin, Alexandr A},
  journal={Information},
  volume={11},
  number={2},
  pages={125},
  year={2020},
  publisher={Multidisciplinary Digital Publishing Institute}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
coarsedropout 256x192 0.753 0.908 0.822 0.805 0.944 ckpt log
gridmask 256x192 0.752 0.906 0.825 0.804 0.943 ckpt log
photometric 256x192 0.754 0.908 0.825 0.805 0.943 ckpt log

Topdown Heatmap + Scnet on Coco

SCNet (CVPR'2020)
@inproceedings{liu2020improving,
  title={Improving Convolutional Networks with Self-Calibrated Convolutions},
  author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={10096--10105},
  year={2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_scnet_50 256x192 0.728 0.899 0.807 0.784 0.938 ckpt log
pose_scnet_50 384x288 0.751 0.906 0.818 0.802 0.942 ckpt log
pose_scnet_101 256x192 0.733 0.902 0.811 0.789 0.940 ckpt log
pose_scnet_101 384x288 0.752 0.906 0.823 0.804 0.943 ckpt log

Topdown Heatmap + Mobilenetv2 on Coco

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_mobilenetv2 256x192 0.648 0.874 0.725 0.709 0.918 ckpt log
pose_mobilenetv2 384x288 0.677 0.882 0.746 0.734 0.920 ckpt log

Topdown Heatmap + VGG on Coco

VGG (ICLR'2015)
@article{simonyan2014very,
  title={Very deep convolutional networks for large-scale image recognition},
  author={Simonyan, Karen and Zisserman, Andrew},
  journal={arXiv preprint arXiv:1409.1556},
  year={2014}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
vgg 256x192 0.699 0.890 0.769 0.754 0.927 ckpt log

Topdown Heatmap + Resnet + Dark on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50_dark 256x192 0.724 0.897 0.797 0.777 0.934 ckpt log
pose_resnet_50_dark 384x288 0.735 0.902 0.801 0.786 0.938 ckpt log
pose_resnet_101_dark 256x192 0.733 0.900 0.810 0.786 0.938 ckpt log
pose_resnet_101_dark 384x288 0.749 0.905 0.818 0.799 0.940 ckpt log
pose_resnet_152_dark 256x192 0.743 0.906 0.819 0.796 0.943 ckpt log
pose_resnet_152_dark 384x288 0.755 0.907 0.825 0.805 0.943 ckpt log

Topdown Heatmap + Litehrnet on Coco

LiteHRNet (CVPR'2021)
@inproceedings{Yulitehrnet21,
  title={Lite-HRNet: A Lightweight High-Resolution Network},
  author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  booktitle={CVPR},
  year={2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
LiteHRNet-18 256x192 0.642 0.867 0.719 0.705 0.911 ckpt log
LiteHRNet-18 384x288 0.676 0.876 0.746 0.735 0.919 ckpt log
LiteHRNet-30 256x192 0.676 0.880 0.756 0.736 0.922 ckpt log
LiteHRNet-30 384x288 0.700 0.883 0.776 0.758 0.926 ckpt log

Topdown Heatmap + Resnet on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50 256x192 0.718 0.898 0.796 0.774 0.934 ckpt log
pose_resnet_50 384x288 0.731 0.900 0.799 0.782 0.937 ckpt log
pose_resnet_101 256x192 0.728 0.904 0.809 0.783 0.942 ckpt log
pose_resnet_101 384x288 0.749 0.906 0.817 0.799 0.941 ckpt log
pose_resnet_152 256x192 0.736 0.904 0.818 0.791 0.942 ckpt log
pose_resnet_152 384x288 0.750 0.908 0.821 0.800 0.942 ckpt log

The following model is equipped with a visibility prediction head and has been trained using COCO and AIC datasets.

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50 256x192 0.729 0.900 0.807 0.783 0.938 ckpt log

Topdown Heatmap + Hrnet + Udp on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32_udp 256x192 0.762 0.907 0.829 0.810 0.942 ckpt log
pose_hrnet_w32_udp 384x288 0.768 0.909 0.832 0.815 0.945 ckpt log
pose_hrnet_w48_udp 256x192 0.768 0.908 0.833 0.817 0.945 ckpt log
pose_hrnet_w48_udp 384x288 0.773 0.911 0.836 0.821 0.946 ckpt log
pose_hrnet_w32_udp_regress 256x192 0.759 0.907 0.827 0.813 0.943 ckpt log

Note that, UDP also adopts the unbiased encoding/decoding algorithm of DARK.


Topdown Heatmap + PVT on Coco

PVT (ICCV'2021)
@inproceedings{wang2021pyramid,
  title={Pyramid vision transformer: A versatile backbone for dense prediction without convolutions},
  author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={568--578},
  year={2021}
}
PVTV2 (CVMJ'2022)
@article{wang2022pvt,
  title={PVT v2: Improved baselines with Pyramid Vision Transformer},
  author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  journal={Computational Visual Media},
  pages={1--10},
  year={2022},
  publisher={Springer}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_pvt-s 256x192 0.714 0.896 0.794 0.773 0.936 ckpt log
pose_pvtv2-b2 256x192 0.737 0.905 0.812 0.791 0.942 ckpt log

Cid + Hrnet on Coco

CID (CVPR'2022)
@InProceedings{Wang_2022_CVPR,
    author    = {Wang, Dongkai and Zhang, Shiliang},
    title     = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {11060-11068}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
CID 512x512 0.704 0.894 0.775 0.753 0.928 ckpt log
CID 512x512 0.715 0.900 0.782 0.765 0.935 ckpt log

Rtmpose + Rtmpose on Coco

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t 256x192 0.682 0.883 0.759 0.736 0.920 ckpt log
rtmpose-s 256x192 0.716 0.892 0.789 0.768 0.929 ckpt log
rtmpose-m 256x192 0.746 0.899 0.817 0.795 0.935 ckpt log
rtmpose-l 256x192 0.758 0.906 0.826 0.806 0.942 ckpt log
rtmpose-t-aic-coco 256x192 0.685 0.880 0.761 0.738 0.918 ckpt log
rtmpose-s-aic-coco 256x192 0.722 0.892 0.794 0.772 0.929 ckpt log
rtmpose-m-aic-coco 256x192 0.758 0.903 0.826 0.806 0.940 ckpt log
rtmpose-l-aic-coco 256x192 0.765 0.906 0.835 0.813 0.942 ckpt log
rtmpose-m-aic-coco 384x288 0.770 0.908 0.833 0.816 0.943 ckpt log
rtmpose-l-aic-coco 384x288 0.773 0.907 0.835 0.819 0.942 ckpt log

Associative Embedding + Hrnet on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  title={Associative embedding: End-to-end learning for joint detection and grouping},
  author={Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  booktitle={Advances in neural information processing systems},
  pages={2277--2287},
  year={2017}
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.656 0.864 0.719 0.711 0.893 ckpt log

Topdown Heatmap + Hrnet on Humanart

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
Human-Art (CVPR'2023)
@inproceedings{ju2023humanart,
    title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
    author={Ju, Xuan and Zeng, Ailing and Jianan, Wang and Qiang, Xu and Lei, Zhang},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR),
    year={2023}}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32-coco 256x192 0.252 0.397 0.255 0.321 0.485 ckpt log
pose_hrnet_w32-humanart-coco 256x192 0.399 0.545 0.420 0.466 0.613 ckpt log
pose_hrnet_w48-coco 256x192 0.271 0.413 0.277 0.339 0.499 ckpt log
pose_hrnet_w48-humanart-coco 256x192 0.417 0.553 0.442 0.481 0.617 ckpt log

Results on Human-Art validation dataset with ground-truth bounding-box

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32-coco 256x192 0.533 0.771 0.562 0.574 0.792 ckpt log
pose_hrnet_w32-humanart-coco 256x192 0.754 0.906 0.812 0.783 0.916 ckpt log
pose_hrnet_w48-coco 256x192 0.557 0.782 0.593 0.595 0.804 ckpt log
pose_hrnet_w48-humanart-coco 256x192 0.769 0.906 0.825 0.796 0.919 ckpt log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

With classic decoder

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32-coco 256x192 0.749 0.906 0.821 0.804 0.945 ckpt log
pose_hrnet_w32-humanart-coco 256x192 0.741 0.902 0.814 0.795 0.941 ckpt log
pose_hrnet_w48-coco 256x192 0.756 0.908 0.826 0.809 0.945 ckpt log
pose_hrnet_w48-humanart-coco 256x192 0.751 0.905 0.822 0.805 0.943 ckpt log

Rtmpose + Rtmpose on Humanart

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
Human-Art (CVPR'2023)
@inproceedings{ju2023humanart,
    title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
    author={Ju, Xuan and Zeng, Ailing and Jianan, Wang and Qiang, Xu and Lei, Zhang},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR),
    year={2023}}

Results on Human-Art validation dataset with detector having human AP of 56.2 on Human-Art validation dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-coco 256x192 0.161 0.283 0.154 0.221 0.373 ckpt log
rtmpose-t-humanart-coco 256x192 0.249 0.395 0.256 0.323 0.485 ckpt log
rtmpose-s-coco 256x192 0.199 0.328 0.198 0.261 0.418 ckpt log
rtmpose-s-humanart-coco 256x192 0.311 0.462 0.323 0.381 0.540 ckpt log
rtmpose-m-coco 256x192 0.239 0.372 0.243 0.302 0.455 ckpt log
rtmpose-m-humanart-coco 256x192 0.355 0.503 0.377 0.417 0.568 ckpt log
rtmpose-l-coco 256x192 0.260 0.393 0.267 0.323 0.472 ckpt log
rtmpose-l-humanart-coco 256x192 0.378 0.521 0.399 0.442 0.584 ckpt log

Results on Human-Art validation dataset with ground-truth bounding-box

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-coco 256x192 0.444 0.725 0.453 0.488 0.750 ckpt log
rtmpose-t-humanart-coco 256x192 0.655 0.872 0.720 0.693 0.890 ckpt log
rtmpose-s-coco 256x192 0.480 0.739 0.498 0.521 0.763 ckpt log
rtmpose-s-humanart-coco 256x192 0.698 0.893 0.768 0.732 0.903 ckpt log
rtmpose-m-coco 256x192 0.532 0.765 0.563 0.571 0.789 ckpt log
rtmpose-m-humanart-coco 256x192 0.728 0.895 0.791 0.759 0.906 ckpt log
rtmpose-l-coco 256x192 0.564 0.789 0.602 0.599 0.808 ckpt log
rtmpose-l-humanart-coco 256x192 0.753 0.905 0.812 0.783 0.915 ckpt log

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-coco 256x192 0.682 0.883 0.759 0.736 0.920 ckpt log
rtmpose-t-humanart-coco 256x192 0.665 0.875 0.739 0.721 0.916 ckpt log
rtmpose-s-coco 256x192 0.716 0.892 0.789 0.768 0.929 ckpt log
rtmpose-s-humanart-coco 256x192 0.706 0.888 0.780 0.759 0.928 ckpt log
rtmpose-m-coco 256x192 0.746 0.899 0.817 0.795 0.935 ckpt log
rtmpose-m-humanart-coco 256x192 0.725 0.892 0.795 0.775 0.929 ckpt log
rtmpose-l-coco 256x192 0.758 0.906 0.826 0.806 0.942 ckpt log
rtmpose-l-humanart-coco 256x192 0.748 0.901 0.816 0.796 0.938 ckpt log

Results on COCO val2017 with ground-truth bounding box

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rtmpose-t-humanart-coco 256x192 0.679 0.895 0.755 0.710 0.907 ckpt log
rtmpose-s-humanart-coco 256x192 0.725 0.916 0.798 0.753 0.925 ckpt log
rtmpose-m-humanart-coco 256x192 0.744 0.916 0.818 0.770 0.930 ckpt log
rtmpose-l-humanart-coco 256x192 0.770 0.927 0.840 0.794 0.939 ckpt log

Rtmpose + Rtmpose + Body8-Coco on Body8

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
Config Input Size AP
(COCO)
PCK@0.1
(Body8)
AUC
(Body8)
EPE
(Body8)
Params(M) FLOPS(G) Download
RTMPose-t* 256x192 65.9 91.44 63.18 19.45 3.34 0.36 Model
RTMPose-s* 256x192 69.7 92.45 65.15 17.85 5.47 0.68 Model
RTMPose-m* 256x192 74.9 94.25 68.59 15.12 13.59 1.93 Model
RTMPose-l* 256x192 76.7 95.08 70.14 13.79 27.66 4.16 Model
RTMPose-m* 384x288 76.6 94.64 70.38 13.98 13.72 4.33 Model
RTMPose-l* 384x288 78.3 95.36 71.58 13.08 27.79 9.35 Model

Rtmpose + Rtmpose on Hand5

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
Config Input Size PCK@0.2
(COCO-Wholebody-Hand)
PCK@0.2
(Hand5)
AUC
(Hand5)
EPE
(Hand5)
FLOPS(G) Download
RTMPose-m*
(alpha version)
256x256 81.5 96.4 83.9 5.06 2.581 ckpt

Rtmpose + Rtmpose on Face6

RTMPose (arXiv'2023)
@misc{https://doi.org/10.48550/arxiv.2303.07399,
  doi = {10.48550/ARXIV.2303.07399},
  url = {https://arxiv.org/abs/2303.07399},
  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
  publisher = {arXiv},
  year = {2023},
  copyright = {Creative Commons Attribution 4.0 International}
}
RTMDet (arXiv'2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={European conference on computer vision},
  pages={740--755},
  year={2014},
  organization={Springer}
}
Config Input Size NME
(LaPa)
FLOPS
(G)
Download
RTMPose-t* 256x256 1.67 0.652 Model
RTMPose-s* 256x256 1.59 1.119 Model
RTMPose-m* 256x256 1.44 2.852 Model



RHD (ICCV’2017)


Topdown Regression + Resnet on Rhd2d

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={Deeppose: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
RHD (ICCV'2017)
@TechReport{zb2017hand,
  author={Christian Zimmermann and Thomas Brox},
  title={Learning to Estimate 3D Hand Pose from Single RGB Images},
  institution={arXiv:1705.01389},
  year={2017},
  note="https://arxiv.org/abs/1705.01389",
  url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
}

Results on RHD test set

Arch Input Size PCK@0.2 AUC EPE ckpt log
deeppose_resnet_50 256x256 0.988 0.865 3.32 ckpt log

Topdown Heatmap + Resnet on Rhd2d

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={770--778},
  year={2016}
}
RHD (ICCV'2017)
@TechReport{zb2017hand,
  author={Christian Zimmermann and Thomas Brox},
  title={Learning to Estimate 3D Hand Pose from Single RGB Images},
  institution={arXiv:1705.01389},
  year={2017},
  note="https://arxiv.org/abs/1705.01389",
  url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
}

Results on RHD test set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_resnet50 256x256 0.991 0.898 2.32 ckpt log

Topdown Heatmap + Hrnetv2 on Rhd2d

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
RHD (ICCV'2017)
@TechReport{zb2017hand,
  author={Christian Zimmermann and Thomas Brox},
  title={Learning to Estimate 3D Hand Pose from Single RGB Images},
  institution={arXiv:1705.01389},
  year={2017},
  note="https://arxiv.org/abs/1705.01389",
  url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
}

Results on RHD test set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hrnetv2_w18 256x256 0.992 0.902 2.21 ckpt log

Topdown Heatmap + Hrnetv2 + Udp on Rhd2d

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
RHD (ICCV'2017)
@TechReport{zb2017hand,
  author={Christian Zimmermann and Thomas Brox},
  title={Learning to Estimate 3D Hand Pose from Single RGB Images},
  institution={arXiv:1705.01389},
  year={2017},
  note="https://arxiv.org/abs/1705.01389",
  url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
}

Results on RHD test set

Arch Input Size PCKh@0.7 AUC EPE ckpt log
pose_hrnetv2_w18_udp 256x256 0.992 0.902 2.19 ckpt log

Topdown Heatmap + Hrnetv2 + Dark on Rhd2d

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
RHD (ICCV'2017)
@TechReport{zb2017hand,
  author={Christian Zimmermann and Thomas Brox},
  title={Learning to Estimate 3D Hand Pose from Single RGB Images},
  institution={arXiv:1705.01389},
  year={2017},
  note="https://arxiv.org/abs/1705.01389",
  url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
}

Results on RHD test set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_hrnetv2_w18_dark 256x256 0.992 0.903 2.18 ckpt log

Topdown Heatmap + Mobilenetv2 on Rhd2d

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
RHD (ICCV'2017)
@TechReport{zb2017hand,
  author={Christian Zimmermann and Thomas Brox},
  title={Learning to Estimate 3D Hand Pose from Single RGB Images},
  institution={arXiv:1705.01389},
  year={2017},
  note="https://arxiv.org/abs/1705.01389",
  url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
}

Results on RHD test set

Arch Input Size PCK@0.2 AUC EPE ckpt log
pose_mobilenet_v2 256x256 0.985 0.883 2.79 ckpt log



AP-10K (NeurIPS’2021)


Topdown Heatmap + Resnet on Ap10k

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  title={Simple baselines for human pose estimation and tracking},
  author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={466--481},
  year={2018}
}
AP-10K (NeurIPS'2021)
@misc{yu2021ap10k,
      title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
      author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
      year={2021},
      eprint={2108.12617},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

Results on AP-10K validation set

Arch Input Size AP AP50 AP75 APM APL ckpt log
pose_resnet_50 256x256 0.680 0.926 0.738 0.552 0.687 ckpt log
pose_resnet_101 256x256 0.681 0.921 0.751 0.545 0.690 ckpt log

Topdown Heatmap + Cspnext + Udp on Ap10k

RTMDet (ArXiv 2022)
@misc{lyu2022rtmdet,
      title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
      author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
      year={2022},
      eprint={2212.07784},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
UDP (CVPR'2020)
@InProceedings{Huang_2020_CVPR,
  author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = {June},
  year = {2020}
}
AP-10K (NeurIPS'2021)
@misc{yu2021ap10k,
      title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
      author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
      year={2021},
      eprint={2108.12617},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

Results on AP-10K validation set

Arch Input Size AP AP50 AP75 APM APL ckpt log
pose_cspnext_m 256x256 0.703 0.944 0.776 0.513 0.710 ckpt log

Topdown Heatmap + Hrnet on Ap10k

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  title={Deep high-resolution representation learning for human pose estimation},
  author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5693--5703},
  year={2019}
}
AP-10K (NeurIPS'2021)
@misc{yu2021ap10k,
      title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
      author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
      year={2021},
      eprint={2108.12617},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

Results on AP-10K validation set

Arch Input Size AP AP50 AP75 APM APL ckpt log
pose_hrnet_w32 256x256 0.722 0.935 0.789 0.557 0.729 ckpt log
pose_hrnet_w48 256x256 0.728 0.936 0.802 0.577 0.735 ckpt log

Rtmpose + Rtmpose on Ap10k

AP-10K (NeurIPS'2021)
@misc{yu2021ap10k,
      title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
      author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
      year={2021},
      eprint={2108.12617},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

Results on AP-10K validation set

Arch Input Size AP AP50 AP75 APM APL ckpt log
rtmpose-m 256x256 0.722 0.939 0.788 0.569 0.728 ckpt log



300W (IMAVIS’2016)


Topdown Heatmap + Hrnetv2 on 300w

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
300W (IMAVIS'2016)
@article{sagonas2016300,
  title={300 faces in-the-wild challenge: Database and results},
  author={Sagonas, Christos and Antonakos, Epameinondas and Tzimiropoulos, Georgios and Zafeiriou, Stefanos and Pantic, Maja},
  journal={Image and vision computing},
  volume={47},
  pages={3--18},
  year={2016},
  publisher={Elsevier}
}

Results on 300W dataset

The model is trained on 300W train.

Arch Input Size NMEcommon NMEchallenge NMEfull NMEtest ckpt log
pose_hrnetv2_w18 256x256 2.92 5.64 3.45 4.10 ckpt log



AFLW (ICCVW’2011)


Topdown Heatmap + Hrnetv2 + Dark on Aflw

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  title={Distribution-aware coordinate representation for human pose estimation},
  author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={7093--7102},
  year={2020}
}
AFLW (ICCVW'2011)
@inproceedings{koestinger2011annotated,
  title={Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization},
  author={Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst},
  booktitle={2011 IEEE international conference on computer vision workshops (ICCV workshops)},
  pages={2144--2151},
  year={2011},
  organization={IEEE}
}

Results on AFLW dataset

The model is trained on AFLW train and evaluated on AFLW full and frontal.

Arch Input Size NMEfull NMEfrontal ckpt log
pose_hrnetv2_w18_dark 256x256 1.35 1.19 ckpt log

Topdown Heatmap + Hrnetv2 on Aflw

HRNetv2 (TPAMI'2019)
@article{WangSCJDZLMTWLX19,
  title={Deep High-Resolution Representation Learning for Visual Recognition},
  author={Jingdong Wang and Ke Sun and Tianheng Cheng and
          Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
          Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
  journal={TPAMI},
  year={2019}
}
AFLW (ICCVW'2011)
@inproceedings{koestinger2011annotated,
  title={Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization},
  author={Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst},
  booktitle={2011 IEEE international conference on computer vision workshops (ICCV workshops)},
  pages={2144--2151},
  year={2011},
  organization={IEEE}
}

Results on AFLW dataset

The model is trained on AFLW train and evaluated on AFLW full and frontal.

Arch Input Size NMEfull NMEfronta