Shortcuts

Body(2D,Kpt,Sview,Img)




Aic Dataset


Associative Embedding + Hrnet on Aic

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019},
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  author        = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title         = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal       = {arXiv preprint arXiv:1711.06475},
  year          = {2017},
  eprint        = {1711.06475},
  archivePrefix = {arXiv},
}

Results on AIC validation set without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.303 0.697 0.225 0.373 0.755 ckpt log

Results on AIC validation set with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.318 0.717 0.246 0.379 0.764 ckpt log

Associative Embedding + Higherhrnet on Aic

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
HigherHRNet (CVPR'2020)
@inproceedings{cheng2020higherhrnet,
  author    = {Cheng, Bowen and Xiao, Bin and Wang, Jingdong and Shi, Honghui and Huang, Thomas S. and Zhang, Lei},
  title     = {{HigherHRNet}: Scale-Aware Representation Learning for Bottom-Up Human Pose Estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {5386--5395},
  year      = {2020},
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  author        = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title         = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal       = {arXiv preprint arXiv:1711.06475},
  year          = {2017},
  eprint        = {1711.06475},
  archivePrefix = {arXiv},
}

Results on AIC validation set without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HigherHRNet-w32 512x512 0.315 0.710 0.243 0.379 0.757 ckpt log

Results on AIC validation set with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HigherHRNet-w32 512x512 0.323 0.718 0.254 0.379 0.758 ckpt log

Topdown Heatmap + Hrnet on Aic

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019},
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  author        = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title         = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal       = {arXiv preprint arXiv:1711.06475},
  year          = {2017},
  eprint        = {1711.06475},
  archivePrefix = {arXiv},
}

Results on AIC val set with ground-truth bounding boxes

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32 256x192 0.323 0.762 0.219 0.366 0.789 ckpt log

Topdown Heatmap + Resnet on Aic

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}
AI Challenger (ArXiv'2017)
@article{wu2017ai,
  author        = {Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
  title         = {{AI} {Challenger}: A large-scale dataset for going deeper in image understanding},
  journal       = {arXiv preprint arXiv:1711.06475},
  year          = {2017},
  eprint        = {1711.06475},
  archivePrefix = {arXiv},
}

Results on AIC val set with ground-truth bounding boxes

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_101 256x192 0.294 0.736 0.174 0.337 0.763 ckpt log



Coco Dataset


Associative Embedding + Higherhrnet + Udp on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
HigherHRNet (CVPR'2020)
@inproceedings{cheng2020higherhrnet,
  author    = {Cheng, Bowen and Xiao, Bin and Wang, Jingdong and Shi, Honghui and Huang, Thomas S. and Zhang, Lei},
  title     = {{HigherHRNet}: Scale-Aware Representation Learning for Bottom-Up Human Pose Estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {5386--5395},
  year      = {2020},
}
UDP (CVPR'2020)
@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HigherHRNet-w32_udp 512x512 0.678 0.862 0.736 0.724 0.890 ckpt log
HigherHRNet-w48_udp 512x512 0.690 0.872 0.750 0.734 0.891 ckpt log

Associative Embedding + Mobilenetv2 on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted residuals and linear bottlenecks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {4510--4520},
  year      = {2018},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_mobilenetv2 512x512 0.380 0.671 0.368 0.473 0.741 ckpt log

Results on COCO val2017 with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_mobilenetv2 512x512 0.442 0.696 0.422 0.517 0.766 ckpt log

Associative Embedding + Hrnet on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.654 0.863 0.720 0.710 0.892 ckpt log
HRNet-w48 512x512 0.665 0.860 0.727 0.716 0.889 ckpt log

Results on COCO val2017 with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.698 0.877 0.760 0.748 0.907 ckpt log
HRNet-w48 512x512 0.712 0.880 0.771 0.757 0.909 ckpt log

Associative Embedding + Higherhrnet on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
HigherHRNet (CVPR'2020)
@inproceedings{cheng2020higherhrnet,
  author    = {Cheng, Bowen and Xiao, Bin and Wang, Jingdong and Shi, Honghui and Huang, Thomas S. and Zhang, Lei},
  title     = {{HigherHRNet}: Scale-Aware Representation Learning for Bottom-Up Human Pose Estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {5386--5395},
  year      = {2020},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HigherHRNet-w32 512x512 0.677 0.870 0.738 0.723 0.890 ckpt log
HigherHRNet-w32 640x640 0.686 0.871 0.747 0.733 0.898 ckpt log
HigherHRNet-w48 512x512 0.686 0.873 0.741 0.731 0.892 ckpt log

Results on COCO val2017 with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HigherHRNet-w32 512x512 0.706 0.881 0.771 0.747 0.901 ckpt log
HigherHRNet-w32 640x640 0.706 0.880 0.770 0.749 0.902 ckpt log
HigherHRNet-w48 512x512 0.716 0.884 0.775 0.755 0.901 ckpt log

Associative Embedding + Resnet on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50 512x512 0.466 0.742 0.479 0.552 0.797 ckpt log
pose_resnet_50 640x640 0.479 0.757 0.487 0.566 0.810 ckpt log
pose_resnet_101 512x512 0.554 0.807 0.599 0.622 0.841 ckpt log
pose_resnet_152 512x512 0.595 0.829 0.648 0.651 0.856 ckpt log

Results on COCO val2017 with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50 512x512 0.503 0.765 0.521 0.591 0.821 ckpt log
pose_resnet_50 640x640 0.525 0.784 0.542 0.610 0.832 ckpt log
pose_resnet_101 512x512 0.603 0.831 0.641 0.668 0.870 ckpt log
pose_resnet_152 512x512 0.660 0.860 0.713 0.709 0.889 ckpt log

Associative Embedding + Hrnet + Udp on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019},
}
UDP (CVPR'2020)
@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32_udp 512x512 0.671 0.863 0.729 0.717 0.889 ckpt log
HRNet-w48_udp 512x512 0.681 0.872 0.741 0.725 0.892 ckpt log

Associative Embedding + Hourglass + Ae on Coco

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
HourglassAENet (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2277--2287},
  year      = {2017},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hourglass_ae 512x512 0.613 0.833 0.667 0.659 0.850 ckpt log

Results on COCO val2017 with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hourglass_ae 512x512 0.667 0.855 0.723 0.707 0.877 ckpt log

Deeppose + Resnet on Coco

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {1653--1660},
  year      = {2014},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
deeppose_resnet_50 256x192 0.526 0.816 0.586 0.638 0.887 ckpt log
deeppose_resnet_101 256x192 0.560 0.832 0.628 0.668 0.900 ckpt log
deeppose_resnet_152 256x192 0.583 0.843 0.659 0.686 0.907 ckpt log

Deeppose + Resnet + Rle on Coco

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  author    = {Toshev, Alexander and Szegedy, Christian},
  title     = {{DeepPose}: Human pose estimation via deep neural networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {1653--1660},
  year      = {2014},
}
RLE (ICCV'2021)
@inproceedings{li2021human,
  author    = {Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  title     = {Human pose regression with residual log-likelihood estimation},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {11025--11034},
  year      = {2021},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
deeppose_resnet_50_rle 256x192 0.704 0.883 0.777 0.751 0.920 ckpt log
deeppose_resnet_101_rle 256x192 0.722 0.894 0.794 0.768 0.930 ckpt log
deeppose_resnet_152_rle 256x192 0.731 0.897 0.805 0.777 0.933 ckpt log
deeppose_resnet_152_rle 384x288 0.749 0.901 0.815 0.793 0.935 ckpt log

Dekr + Hrnet on Coco

DEKR (CVPR'2021)
@inproceedings{geng2021bottom,
  author    = {Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
  title     = {Bottom-up human pose estimation via disentangled keypoint regression},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {14676--14686},
  year      = {2021},
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.680 0.868 0.745 0.728 0.897 ckpt log
HRNet-w48 640x640 0.709 0.876 0.773 0.758 0.909 ckpt log

Results on COCO val2017 with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt
HRNet-w32* 512x512 0.705 0.878 0.767 0.759 0.921 ckpt
HRNet-w48* 640x640 0.722 0.882 0.785 0.778 0.928 ckpt

* These configs are generally used for evaluation. The training settings are identical to their single-scale counterparts.

The results of models provided by the authors on COCO val2017 using the same evaluation protocol

Arch Input Size Setting AP AP50 AP75 AR AR50 ckpt
HRNet-w32 512x512 single-scale 0.678 0.868 0.744 0.728 0.897 see official implementation
HRNet-w48 640x640 single-scale 0.707 0.876 0.773 0.757 0.909 see official implementation
HRNet-w32 512x512 multi-scale 0.708 0.880 0.773 0.763 0.921 see official implementation
HRNet-w48 640x640 multi-scale 0.721 0.881 0.786 0.779 0.927 see official implementation

The discrepancy between these results and those reported in the paper is attributed to differences in the implementation details of the evaluation process.


Topdown Heatmap + Seresnet on Coco

SEResNet (CVPR'2018)
@inproceedings{hu2018squeeze,
  author    = {Hu, Jie and Shen, Li and Sun, Gang},
  title     = {Squeeze-and-excitation networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {7132--7141},
  year      = {2018},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_seresnet_50 256x192 0.728 0.900 0.809 0.784 0.940 ckpt log
pose_seresnet_50 384x288 0.748 0.905 0.819 0.799 0.941 ckpt log
pose_seresnet_101 256x192 0.734 0.904 0.815 0.790 0.942 ckpt log
pose_seresnet_101 384x288 0.753 0.907 0.823 0.805 0.943 ckpt log
pose_seresnet_152* 256x192 0.730 0.899 0.810 0.786 0.940 ckpt log
pose_seresnet_152* 384x288 0.753 0.906 0.823 0.806 0.945 ckpt log

Note that * means the model was trained without ImageNet pre-training.


Topdown Heatmap + Resnetv1d on Coco

ResNetV1D (CVPR'2019)
@inproceedings{he2019bag,
  author    = {He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
  title     = {Bag of tricks for image classification with convolutional neural networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {558--567},
  year      = {2019},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnetv1d_50 256x192 0.722 0.897 0.799 0.777 0.933 ckpt log
pose_resnetv1d_50 384x288 0.730 0.900 0.799 0.780 0.934 ckpt log
pose_resnetv1d_101 256x192 0.731 0.899 0.809 0.786 0.938 ckpt log
pose_resnetv1d_101 384x288 0.748 0.902 0.816 0.799 0.939 ckpt log
pose_resnetv1d_152 256x192 0.737 0.902 0.812 0.791 0.940 ckpt log
pose_resnetv1d_152 384x288 0.752 0.909 0.821 0.802 0.944 ckpt log

Topdown Heatmap + Hourglass on Coco

Hourglass (ECCV'2016)
@inproceedings{newell2016stacked,
  author       = {Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  title        = {Stacked hourglass networks for human pose estimation},
  booktitle    = {European Conference on Computer Vision},
  pages        = {483--499},
  year         = {2016},
  organization = {Springer},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hourglass_52 256x256 0.726 0.896 0.799 0.780 0.934 ckpt log
pose_hourglass_52 384x384 0.746 0.900 0.813 0.797 0.939 ckpt log

Topdown Heatmap + RSN on Coco

RSN (ECCV'2020)
@misc{cai2020learning,
  author        = {Cai, Yuanhao and Wang, Zhicheng and Luo, Zhengxiong and Yin, Binyi and Du, Angang and Wang, Haoqian and Zhou, Xinyu and Zhou, Erjin and Zhang, Xiangyu and Sun, Jian},
  title         = {Learning Delicate Local Representations for Multi-Person Pose Estimation},
  year          = {2020},
  eprint        = {2003.04030},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
rsn_18 256x192 0.704 0.887 0.779 0.771 0.926 ckpt log
rsn_50 256x192 0.723 0.896 0.800 0.788 0.934 ckpt log
2xrsn_50 256x192 0.745 0.899 0.818 0.809 0.939 ckpt log
3xrsn_50 256x192 0.750 0.900 0.823 0.813 0.940 ckpt log

Topdown Heatmap + Resnet + Fp16 on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}
FP16 (ArXiv'2017)
@article{micikevicius2017mixed,
  author        = {Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
  title         = {Mixed precision training},
  journal       = {arXiv preprint arXiv:1710.03740},
  year          = {2017},
  eprint        = {1710.03740},
  archivePrefix = {arXiv},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50_fp16 256x192 0.717 0.898 0.793 0.772 0.936 ckpt log

Topdown Heatmap + Mobilenetv2 on Coco

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  author    = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  title     = {{MobileNetV2}: Inverted residuals and linear bottlenecks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {4510--4520},
  year      = {2018},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_mobilenetv2 256x192 0.646 0.874 0.723 0.707 0.917 ckpt log
pose_mobilenetv2 384x288 0.673 0.879 0.743 0.729 0.916 ckpt log

Topdown Heatmap + Shufflenetv1 on Coco

ShufflenetV1 (CVPR'2018)
@inproceedings{zhang2018shufflenet,
  author    = {Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
  title     = {{ShuffleNet}: An extremely efficient convolutional neural network for mobile devices},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {6848--6856},
  year      = {2018},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_shufflenetv1 256x192 0.585 0.845 0.650 0.651 0.894 ckpt log
pose_shufflenetv1 384x288 0.622 0.859 0.685 0.684 0.901 ckpt log

Topdown Heatmap + MSPN on Coco

MSPN (ArXiv'2019)
@article{li2019rethinking,
  author        = {Li, Wenbo and Wang, Zhicheng and Yin, Binyi and Peng, Qixiang and Du, Yuming and Xiao, Tianzi and Yu, Gang and Lu, Hongtao and Wei, Yichen and Sun, Jian},
  title         = {Rethinking on Multi-Stage Networks for Human Pose Estimation},
  journal       = {arXiv preprint arXiv:1901.00148},
  year          = {2019},
  eprint        = {1901.00148},
  archivePrefix = {arXiv},
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title        = {{Microsoft} {COCO}: Common objects in context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer},
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
mspn_50 256x192 0.723 0.895 0.794 0.788 0.933 ckpt log
2xmspn_50 256x192 0.754 0.903 0.825 0.815 0.941 ckpt log
3xmspn_50 256x192 0.758 0.904 0.830 0.821 0.943 ckpt log
4xmspn_50 256x192 0.764 0.906 0.835 0.826 0.944 ckpt log

Topdown Heatmap + Hrnet + Fp16 on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {5693--5703},
  year      = {2019},
}
FP16 (ArXiv'2017)
@article{micikevicius2017mixed,
  author  = {Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
  title   = {Mixed precision training},
  journal = {arXiv preprint arXiv:1710.03740},
  year    = {2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32_fp16 256x192 0.746 0.905 0.88 0.800 0.943 ckpt log

Topdown Heatmap + Hrnet on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32 256x192 0.746 0.904 0.819 0.799 0.942 ckpt log
pose_hrnet_w32 384x288 0.760 0.906 0.829 0.810 0.943 ckpt log
pose_hrnet_w48 256x192 0.756 0.907 0.825 0.806 0.942 ckpt log
pose_hrnet_w48 384x288 0.767 0.910 0.831 0.816 0.946 ckpt log

Topdown Heatmap + Resnext on Coco

ResNext (CVPR'2017)
@inproceedings{xie2017aggregated,
  author    = {Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  title     = {Aggregated residual transformations for deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1492--1500},
  year      = {2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnext_50 256x192 0.714 0.898 0.789 0.771 0.937 ckpt log
pose_resnext_50 384x288 0.724 0.899 0.794 0.777 0.935 ckpt log
pose_resnext_101 256x192 0.726 0.900 0.801 0.782 0.940 ckpt log
pose_resnext_101 384x288 0.743 0.903 0.815 0.795 0.939 ckpt log
pose_resnext_152 256x192 0.730 0.904 0.808 0.786 0.940 ckpt log
pose_resnext_152 384x288 0.742 0.902 0.810 0.794 0.939 ckpt log

Topdown Heatmap + Resnet + Dark on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {7093--7102},
  year      = {2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50_dark 256x192 0.724 0.898 0.800 0.777 0.936 ckpt log
pose_resnet_50_dark 384x288 0.735 0.900 0.801 0.785 0.937 ckpt log
pose_resnet_101_dark 256x192 0.732 0.899 0.808 0.786 0.938 ckpt log
pose_resnet_101_dark 384x288 0.749 0.902 0.816 0.799 0.939 ckpt log
pose_resnet_152_dark 256x192 0.745 0.905 0.821 0.797 0.942 ckpt log
pose_resnet_152_dark 384x288 0.757 0.909 0.826 0.806 0.943 ckpt log

Topdown Heatmap + VGG on Coco

VGG (ICLR'2015)
@article{simonyan2014very,
  author  = {Simonyan, Karen and Zisserman, Andrew},
  title   = {Very deep convolutional networks for large-scale image recognition},
  journal = {arXiv preprint arXiv:1409.1556},
  year    = {2014}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
vgg 256x192 0.698 0.890 0.768 0.754 0.929 ckpt log

Topdown Heatmap + Shufflenetv2 on Coco

ShufflenetV2 (ECCV'2018)
@inproceedings{ma2018shufflenet,
  author    = {Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
  title     = {{ShuffleNet} {V2}: Practical Guidelines for Efficient {CNN} Architecture Design},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  pages     = {116--131},
  year      = {2018}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_shufflenetv2 256x192 0.599 0.854 0.663 0.664 0.899 ckpt log
pose_shufflenetv2 384x288 0.636 0.865 0.705 0.697 0.909 ckpt log

Topdown Heatmap + Hrnet + Augmentation on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}
Albumentations (Information'2020)
@article{buslaev2020albumentations,
  author    = {Buslaev, Alexander and Iglovikov, Vladimir I and Khvedchenya, Eugene and Parinov, Alex and Druzhinin, Mikhail and Kalinin, Alexandr A},
  title     = {Albumentations: fast and flexible image augmentations},
  journal   = {Information},
  volume    = {11},
  number    = {2},
  pages     = {125},
  year      = {2020},
  publisher = {Multidisciplinary Digital Publishing Institute}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
coarsedropout 256x192 0.753 0.908 0.822 0.806 0.946 ckpt log
gridmask 256x192 0.752 0.906 0.825 0.804 0.943 ckpt log
photometric 256x192 0.753 0.909 0.825 0.805 0.943 ckpt log

Topdown Heatmap + Swin on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}
Swin (ICCV'2021)
@inproceedings{liu2021swin,
  author    = {Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
  title     = {Swin transformer: Hierarchical vision transformer using shifted windows},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {10012--10022},
  year      = {2021}
}
FPN (CVPR'2017)
@inproceedings{lin2017feature,
  author    = {Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
  title     = {Feature pyramid networks for object detection},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2117--2125},
  year      = {2017}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_swin_t 256x192 0.724 0.901 0.806 0.782 0.940 ckpt log
pose_swin_b 256x192 0.737 0.904 0.820 0.798 0.946 ckpt log
pose_swin_b 384x288 0.759 0.910 0.832 0.811 0.946 ckpt log
pose_swin_l 256x192 0.743 0.906 0.821 0.798 0.943 ckpt log
pose_swin_l 384x288 0.763 0.912 0.830 0.814 0.949 ckpt log
pose_swin_b_fpn 256x192 0.741 0.907 0.821 0.798 0.946 ckpt log

Topdown Heatmap + Litehrnet on Coco

LiteHRNet (CVPR'2021)
@inproceedings{Yulitehrnet21,
  author    = {Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  title     = {{Lite-HRNet}: A Lightweight High-Resolution Network},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
LiteHRNet-18 256x192 0.643 0.868 0.720 0.706 0.912 ckpt log
LiteHRNet-18 384x288 0.677 0.878 0.746 0.735 0.920 ckpt log
LiteHRNet-30 256x192 0.675 0.881 0.754 0.736 0.924 ckpt log
LiteHRNet-30 384x288 0.700 0.884 0.776 0.758 0.928 ckpt log

Topdown Heatmap + Scnet on Coco

SCNet (CVPR'2020)
@inproceedings{liu2020improving,
  author    = {Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  title     = {Improving Convolutional Networks with Self-Calibrated Convolutions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {10096--10105},
  year      = {2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_scnet_50 256x192 0.728 0.899 0.807 0.784 0.938 ckpt log
pose_scnet_50 384x288 0.751 0.906 0.818 0.802 0.943 ckpt log
pose_scnet_101 256x192 0.733 0.903 0.813 0.790 0.941 ckpt log
pose_scnet_101 384x288 0.752 0.906 0.823 0.804 0.943 ckpt log

Topdown Heatmap + Hrformer on Coco

HRFormer (NIPS'2021)
@article{yuan2021hrformer,
  author  = {Yuan, Yuhui and Fu, Rao and Huang, Lang and Lin, Weihong and Zhang, Chao and Chen, Xilin and Wang, Jingdong},
  title   = {{HRFormer}: High-Resolution Vision Transformer for Dense Predict},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  year    = {2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrformer_small 256x192 0.738 0.904 0.811 0.792 0.941 ckpt log
pose_hrformer_small 384x288 0.757 0.905 0.824 0.807 0.941 ckpt log
pose_hrformer_base 256x192 0.753 0.907 0.826 0.807 0.943 ckpt log
pose_hrformer_base 384x288 0.774 0.909 0.842 0.823 0.945 ckpt log

Topdown Heatmap + Resnest on Coco

ResNeSt (ArXiv'2020)
@article{zhang2020resnest,
  author  = {Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander},
  title   = {{ResNeSt}: Split-Attention Networks},
  journal = {arXiv preprint arXiv:2004.08955},
  year    = {2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnest_50 256x192 0.721 0.899 0.802 0.776 0.938 ckpt log
pose_resnest_50 384x288 0.737 0.900 0.811 0.789 0.938 ckpt log
pose_resnest_101 256x192 0.725 0.899 0.807 0.781 0.939 ckpt log
pose_resnest_101 384x288 0.746 0.906 0.820 0.798 0.943 ckpt log
pose_resnest_200 256x192 0.732 0.905 0.812 0.787 0.942 ckpt log
pose_resnest_200 384x288 0.754 0.908 0.827 0.807 0.945 ckpt log
pose_resnest_269 256x192 0.738 0.907 0.819 0.793 0.945 ckpt log
pose_resnest_269 384x288 0.755 0.908 0.828 0.806 0.943 ckpt log

Topdown Heatmap + PVT on Coco

PVT (ICCV'2021)
@inproceedings{wang2021pyramid,
  author    = {Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  title     = {Pyramid vision transformer: A versatile backbone for dense prediction without convolutions},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {568--578},
  year      = {2021}
}
PVTV2 (CVMJ'2022)
@article{wang2022pvt,
  author    = {Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  title     = {{PVT} v2: Improved Baselines with {Pyramid Vision Transformer}},
  journal   = {Computational Visual Media},
  volume    = {8},
  number    = {3},
  pages     = {415--424},
  year      = {2022},
  publisher = {Springer}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_pvt-s 256x192 0.714 0.896 0.794 0.773 0.936 ckpt log
pose_pvtv2-b2 256x192 0.737 0.905 0.812 0.791 0.942 ckpt log

Topdown Heatmap + Resnet on Coco

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50 256x192 0.718 0.898 0.795 0.773 0.937 ckpt log
pose_resnet_50 384x288 0.731 0.900 0.799 0.783 0.931 ckpt log
pose_resnet_101 256x192 0.726 0.899 0.806 0.781 0.939 ckpt log
pose_resnet_101 384x288 0.748 0.905 0.817 0.798 0.940 ckpt log
pose_resnet_152 256x192 0.735 0.905 0.812 0.790 0.943 ckpt log
pose_resnet_152 384x288 0.750 0.908 0.821 0.800 0.942 ckpt log

Topdown Heatmap + Alexnet on Coco

AlexNet (NeurIPS'2012)
@inproceedings{krizhevsky2012imagenet,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} Classification with Deep Convolutional Neural Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1097--1105},
  year      = {2012}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_alexnet 256x192 0.397 0.758 0.381 0.478 0.822 ckpt log

Topdown Heatmap + Hrnet + Udp on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}
UDP (CVPR'2020)
@inproceedings{Huang_2020_CVPR,
  author    = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
  title     = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
  booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32_udp 256x192 0.760 0.907 0.827 0.811 0.945 ckpt log
pose_hrnet_w32_udp 384x288 0.769 0.908 0.833 0.817 0.944 ckpt log
pose_hrnet_w48_udp 256x192 0.767 0.906 0.834 0.817 0.945 ckpt log
pose_hrnet_w48_udp 384x288 0.772 0.910 0.835 0.820 0.945 ckpt log
pose_hrnet_w32_udp_regress 256x192 0.758 0.908 0.823 0.812 0.943 ckpt log

Note that UDP also adopts the unbiased encoding/decoding algorithm of DARK.


Topdown Heatmap + Hrnet + Dark on Coco

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {7093--7102},
  year      = {2020}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32_dark 256x192 0.757 0.907 0.823 0.808 0.943 ckpt log
pose_hrnet_w32_dark 384x288 0.766 0.907 0.831 0.815 0.943 ckpt log
pose_hrnet_w48_dark 256x192 0.764 0.907 0.830 0.814 0.943 ckpt log
pose_hrnet_w48_dark 384x288 0.772 0.910 0.836 0.820 0.946 ckpt log

Topdown Heatmap + Vipnas on Coco

ViPNAS (CVPR'2021)
@inproceedings{xu2021vipnas,
  author    = {Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
  title     = {{ViPNAS}: Efficient Video Pose Estimation via Neural Architecture Search},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2021}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
S-ViPNAS-MobileNetV3 256x192 0.700 0.887 0.778 0.757 0.929 ckpt log
S-ViPNAS-Res50 256x192 0.711 0.893 0.789 0.769 0.934 ckpt log

Topdown Heatmap + CPM on Coco

CPM (CVPR'2016)
@inproceedings{wei2016convolutional,
  author    = {Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  title     = {Convolutional pose machines},
  booktitle = {Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
  pages     = {4724--4732},
  year      = {2016}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
cpm 256x192 0.623 0.859 0.704 0.686 0.903 ckpt log
cpm 384x288 0.650 0.864 0.725 0.708 0.905 ckpt log

Cid + Hrnet on Coco

CID (CVPR'2022)
@inproceedings{Wang_2022_CVPR,
  author    = {Wang, Dongkai and Zhang, Shiliang},
  title     = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2022},
  pages     = {11060--11068}
}
COCO (ECCV'2014)
@inproceedings{lin2014microsoft,
  author       = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  title        = {Microsoft {COCO}: Common Objects in Context},
  booktitle    = {European Conference on Computer Vision},
  pages        = {740--755},
  year         = {2014},
  organization = {Springer}
}

Results on COCO val2017 without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
CID 512x512 0.702 0.887 0.768 0.755 0.926 ckpt log
CID 512x512 0.715 0.895 0.780 0.768 0.932 ckpt log



Crowdpose Dataset


Associative Embedding + Higherhrnet on Crowdpose

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in neural information processing systems},
  pages     = {2277--2287},
  year      = {2017}
}
HigherHRNet (CVPR'2020)
@inproceedings{cheng2020higherhrnet,
  author    = {Cheng, Bowen and Xiao, Bin and Wang, Jingdong and Shi, Honghui and Huang, Thomas S and Zhang, Lei},
  title     = {{HigherHRNet}: Scale-Aware Representation Learning for Bottom-Up Human Pose Estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {5386--5395},
  year      = {2020}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test without multi-scale test

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
HigherHRNet-w32 512x512 0.655 0.859 0.705 0.728 0.660 0.577 ckpt log

Results on CrowdPose test with multi-scale test. 2 scales ([2, 1]) are used

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
HigherHRNet-w32 512x512 0.661 0.864 0.710 0.742 0.670 0.566 ckpt log

Dekr + Hrnet on Crowdpose

DEKR (CVPR'2021)
@inproceedings{geng2021bottom,
  author    = {Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
  title     = {Bottom-up human pose estimation via disentangled keypoint regression},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {14676--14686},
  year      = {2021}
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w32 512x512 0.663 0.857 0.715 0.719 0.893 ckpt log
HRNet-w48 640x640 0.682 0.869 0.736 0.742 0.911 ckpt log

Results on CrowdPose test with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt
HRNet-w32* 512x512 0.692 0.874 0.748 0.755 0.926 ckpt
HRNet-w48* 640x640 0.696 0.869 0.749 0.769 0.933 ckpt

* These configs are generally used for evaluation. The training settings are identical to their single-scale counterparts.


Topdown Heatmap + Hrnet on Crowdpose

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
pose_hrnet_w32 256x192 0.675 0.825 0.729 0.770 0.687 0.553 ckpt log

Topdown Heatmap + Resnet on Crowdpose

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016}
}
CrowdPose (CVPR'2019)
@article{li2018crowdpose,
  author  = {Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
  title   = {{CrowdPose}: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
  journal = {arXiv preprint arXiv:1812.00324},
  year    = {2018}
}

Results on CrowdPose test with YOLOv3 human detector

Arch Input Size AP AP50 AP75 AP (E) AP (M) AP (H) ckpt log
pose_resnet_50 256x192 0.637 0.808 0.692 0.739 0.650 0.506 ckpt log
pose_resnet_101 256x192 0.647 0.810 0.703 0.744 0.658 0.522 ckpt log
pose_resnet_101 320x256 0.661 0.821 0.714 0.759 0.671 0.536 ckpt log
pose_resnet_152 256x192 0.656 0.818 0.712 0.754 0.666 0.532 ckpt log



H36m Dataset


Topdown Heatmap + Hrnet on H36m

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019}
}
Human3.6M (TPAMI'2014)
@article{h36m_pami,
  author    = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
  title     = {{Human3.6M}: Large Scale Datasets and Predictive Methods for {3D} Human Sensing in Natural Environments},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  publisher = {IEEE Computer Society},
  volume    = {36},
  number    = {7},
  pages     = {1325--1339},
  month     = jul,
  year      = {2014}
}

Results on Human3.6M test set with ground truth 2D detections

Arch Input Size EPE PCK ckpt log
pose_hrnet_w32 256x256 9.43 0.911 ckpt log
pose_hrnet_w48 256x256 7.36 0.932 ckpt log



JHMDB Dataset


Topdown Heatmap + CPM on JHMDB

CPM (CVPR'2016)
@inproceedings{wei2016convolutional,
  author    = {Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  title     = {Convolutional pose machines},
  booktitle = {Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
  pages     = {4724--4732},
  year      = {2016}
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conference on Computer Vision (ICCV)},
  month = dec,
  pages = {3192--3199},
  year = {2013}
}

Results on Sub-JHMDB dataset

The models are pre-trained on the MPII dataset only. No test-time augmentation (multi-scale/rotation testing) is used.

  • Normalized by Person Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 cpm 368x368 96.1 91.9 81.0 78.9 96.6 90.8 87.3 89.5 ckpt log
Sub2 cpm 368x368 98.1 93.6 77.1 70.9 94.0 89.1 84.7 87.4 ckpt log
Sub3 cpm 368x368 97.9 94.9 87.3 84.0 98.6 94.4 86.2 92.4 ckpt log
Average cpm 368x368 97.4 93.5 81.5 77.9 96.4 91.4 86.1 89.8 - -
  • Normalized by Torso Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 cpm 368x368 89.0 63.0 54.0 54.9 68.2 63.1 61.2 66.0 ckpt log
Sub2 cpm 368x368 90.3 57.9 46.8 44.3 60.8 58.2 62.4 61.1 ckpt log
Sub3 cpm 368x368 91.0 72.6 59.9 54.0 73.2 68.5 65.8 70.3 ckpt log
Average cpm 368x368 90.1 64.5 53.6 51.1 67.4 63.3 63.1 65.7 - -

Topdown Heatmap + Resnet on JHMDB

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
JHMDB (ICCV'2013)
@inproceedings{Jhuang:ICCV:2013,
  title = {Towards understanding action recognition},
  author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
  booktitle = {International Conference on Computer Vision (ICCV)},
  month = dec,
  pages = {3192--3199},
  year = {2013}
}

Results on Sub-JHMDB dataset

The models are pre-trained on the MPII dataset only. No test-time augmentation (multi-scale/rotation testing) is used.

  • Normalized by Person Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 pose_resnet_50 256x256 99.1 98.0 93.8 91.3 99.4 96.5 92.8 96.1 ckpt log
Sub2 pose_resnet_50 256x256 99.3 97.1 90.6 87.0 98.9 96.3 94.1 95.0 ckpt log
Sub3 pose_resnet_50 256x256 99.0 97.9 94.0 91.6 99.7 98.0 94.7 96.7 ckpt log
Average pose_resnet_50 256x256 99.2 97.7 92.8 90.0 99.3 96.9 93.9 96.0 - -
Sub1 pose_resnet_50 (2 Deconv.) 256x256 99.1 98.5 94.6 92.0 99.4 94.6 92.5 96.1 ckpt log
Sub2 pose_resnet_50 (2 Deconv.) 256x256 99.3 97.8 91.0 87.0 99.1 96.5 93.8 95.2 ckpt log
Sub3 pose_resnet_50 (2 Deconv.) 256x256 98.8 98.4 94.3 92.1 99.8 97.5 93.8 96.7 ckpt log
Average pose_resnet_50 (2 Deconv.) 256x256 99.1 98.2 93.3 90.4 99.4 96.2 93.4 96.0 - -
  • Normalized by Torso Size

Split Arch Input Size Head Sho Elb Wri Hip Knee Ank Mean ckpt log
Sub1 pose_resnet_50 256x256 93.3 83.2 74.4 72.7 85.0 81.2 78.9 81.9 ckpt log
Sub2 pose_resnet_50 256x256 94.1 74.9 64.5 62.5 77.9 71.9 78.6 75.5 ckpt log
Sub3 pose_resnet_50 256x256 97.0 82.2 74.9 70.7 84.7 83.7 84.2 82.9 ckpt log
Average pose_resnet_50 256x256 94.8 80.1 71.3 68.6 82.5 78.9 80.6 80.1 - -
Sub1 pose_resnet_50 (2 Deconv.) 256x256 92.4 80.6 73.2 70.5 82.3 75.4 75.0 79.2 ckpt log
Sub2 pose_resnet_50 (2 Deconv.) 256x256 93.4 73.6 63.8 60.5 75.1 68.4 75.5 73.7 ckpt log
Sub3 pose_resnet_50 (2 Deconv.) 256x256 96.1 81.2 72.6 67.9 83.6 80.9 81.5 81.2 ckpt log
Average pose_resnet_50 (2 Deconv.) 256x256 94.0 78.5 69.9 66.3 80.3 74.9 77.3 78.0 - -



MHP Dataset


Associative Embedding + Hrnet on MHP

Associative Embedding (NIPS'2017)
@inproceedings{newell2017associative,
  author    = {Newell, Alejandro and Huang, Zhiao and Deng, Jia},
  title     = {Associative embedding: End-to-end learning for joint detection and grouping},
  booktitle = {Advances in neural information processing systems},
  pages     = {2277--2287},
  year      = {2017},
}
HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019},
}
MHP (ACM MM'2018)
@inproceedings{zhao2018understanding,
  author    = {Zhao, Jian and Li, Jianshu and Cheng, Yu and Sim, Terence and Yan, Shuicheng and Feng, Jiashi},
  title     = {Understanding humans in crowded scenes: Deep nested adversarial learning and a new benchmark for multi-human parsing},
  booktitle = {Proceedings of the 26th ACM international conference on Multimedia},
  pages     = {792--800},
  year      = {2018},
}

Results on MHP v2.0 validation set without multi-scale test

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w48 512x512 0.583 0.895 0.666 0.656 0.931 ckpt log

Results on MHP v2.0 validation set with multi-scale test. 3 default scales ([2, 1, 0.5]) are used

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
HRNet-w48 512x512 0.592 0.898 0.673 0.664 0.932 ckpt log

Topdown Heatmap + Resnet on MHP

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
MHP (ACM MM'2018)
@inproceedings{zhao2018understanding,
  author    = {Zhao, Jian and Li, Jianshu and Cheng, Yu and Sim, Terence and Yan, Shuicheng and Feng, Jiashi},
  title     = {Understanding humans in crowded scenes: Deep nested adversarial learning and a new benchmark for multi-human parsing},
  booktitle = {Proceedings of the 26th ACM international conference on Multimedia},
  pages     = {792--800},
  year      = {2018},
}

Results on MHP v2.0 val set

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_101 256x192 0.583 0.897 0.669 0.636 0.918 ckpt log

Note that the evaluation metric used here is mAP (adapted from COCO), which may differ from the official evaluation code. Please be cautious if you use these results in papers.




Mpii Dataset


Deeppose + Resnet on Mpii

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={{DeepPose}: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
deeppose_resnet_50 256x256 0.825 0.174 ckpt log
deeppose_resnet_101 256x256 0.841 0.193 ckpt log
deeppose_resnet_152 256x256 0.850 0.198 ckpt log

Deeppose + Resnet + Rle on Mpii

DeepPose (CVPR'2014)
@inproceedings{toshev2014deeppose,
  title={{DeepPose}: Human pose estimation via deep neural networks},
  author={Toshev, Alexander and Szegedy, Christian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={1653--1660},
  year={2014}
}
RLE (ICCV'2021)
@inproceedings{li2021human,
  author    = {Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
  title     = {Human pose regression with residual log-likelihood estimation},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {11025--11034},
  year      = {2021},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
deeppose_resnet_50_rle 256x256 0.860 0.263 ckpt log

Topdown Heatmap + Hrnet on Mpii

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_hrnet_w32 256x256 0.900 0.334 ckpt log
pose_hrnet_w48 256x256 0.901 0.337 ckpt log

Topdown Heatmap + Mobilenetv2 on Mpii

MobilenetV2 (CVPR'2018)
@inproceedings{sandler2018mobilenetv2,
  title={{MobileNetV2}: Inverted residuals and linear bottlenecks},
  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4510--4520},
  year={2018}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_mobilenetv2 256x256 0.854 0.235 ckpt log

Topdown Heatmap + Resnet on Mpii

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_resnet_50 256x256 0.882 0.286 ckpt log
pose_resnet_101 256x256 0.888 0.290 ckpt log
pose_resnet_152 256x256 0.889 0.303 ckpt log

Topdown Heatmap + CPM on Mpii

CPM (CVPR'2016)
@inproceedings{wei2016convolutional,
  author    = {Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
  title     = {Convolutional pose machines},
  booktitle = {Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
  pages     = {4724--4732},
  year      = {2016},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
cpm 368x368 0.876 0.285 ckpt log

Topdown Heatmap + Shufflenetv2 on Mpii

ShufflenetV2 (ECCV'2018)
@inproceedings{ma2018shufflenet,
  title={{ShuffleNet} {V2}: Practical guidelines for efficient {CNN} architecture design},
  author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
  booktitle={Proceedings of the European conference on computer vision (ECCV)},
  pages={116--131},
  year={2018}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_shufflenetv2 256x256 0.828 0.205 ckpt log

Topdown Heatmap + Litehrnet on Mpii

LiteHRNet (CVPR'2021)
@inproceedings{Yulitehrnet21,
  title={{Lite-HRNet}: A Lightweight High-Resolution Network},
  author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2021}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
LiteHRNet-18 256x256 0.859 0.260 ckpt log
LiteHRNet-30 256x256 0.869 0.271 ckpt log

Topdown Heatmap + Resnext on Mpii

ResNext (CVPR'2017)
@inproceedings{xie2017aggregated,
  author    = {Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  title     = {Aggregated residual transformations for deep neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {1492--1500},
  year      = {2017},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_resnext_152 256x256 0.887 0.294 ckpt log

Topdown Heatmap + Shufflenetv1 on Mpii

ShufflenetV1 (CVPR'2018)
@inproceedings{zhang2018shufflenet,
  title={{ShuffleNet}: An extremely efficient convolutional neural network for mobile devices},
  author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={6848--6856},
  year={2018}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_shufflenetv1 256x256 0.823 0.195 ckpt log

Topdown Heatmap + Hrnet + Dark on Mpii

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019},
}
DarkPose (CVPR'2020)
@inproceedings{zhang2020distribution,
  author    = {Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
  title     = {Distribution-aware coordinate representation for human pose estimation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {7093--7102},
  year      = {2020},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_hrnet_w32_dark 256x256 0.904 0.354 ckpt log
pose_hrnet_w48_dark 256x256 0.905 0.360 ckpt log

Topdown Heatmap + Scnet on Mpii

SCNet (CVPR'2020)
@inproceedings{liu2020improving,
  author    = {Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
  title     = {Improving Convolutional Networks with Self-Calibrated Convolutions},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {10096--10105},
  year      = {2020},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_scnet_50 256x256 0.888 0.290 ckpt log
pose_scnet_101 256x256 0.886 0.293 ckpt log

Topdown Heatmap + Seresnet on Mpii

SEResNet (CVPR'2018)
@inproceedings{hu2018squeeze,
  author    = {Hu, Jie and Shen, Li and Sun, Gang},
  title     = {Squeeze-and-excitation networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {7132--7141},
  year      = {2018},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_seresnet_50 256x256 0.884 0.292 ckpt log
pose_seresnet_101 256x256 0.884 0.295 ckpt log
pose_seresnet_152* 256x256 0.884 0.287 ckpt log

Note that * means without ImageNet pre-training.


Topdown Heatmap + Resnetv1d on Mpii

ResNetV1D (CVPR'2019)
@inproceedings{he2019bag,
  author    = {He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
  title     = {Bag of tricks for image classification with convolutional neural networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {558--567},
  year      = {2019},
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_resnetv1d_50 256x256 0.881 0.290 ckpt log
pose_resnetv1d_101 256x256 0.883 0.295 ckpt log
pose_resnetv1d_152 256x256 0.888 0.300 ckpt log

Topdown Heatmap + Hourglass on Mpii

Hourglass (ECCV'2016)
@inproceedings{newell2016stacked,
  title={Stacked hourglass networks for human pose estimation},
  author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
  booktitle={European conference on computer vision},
  pages={483--499},
  year={2016},
  publisher={Springer}
}
MPII (CVPR'2014)
@inproceedings{andriluka14cvpr,
  author = {Andriluka, Mykhaylo and Pishchulin, Leonid and Gehler, Peter and Schiele, Bernt},
  title = {{2D} Human Pose Estimation: New Benchmark and State of the Art Analysis},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2014},
  month = jun
}

Results on MPII val set

Arch Input Size Mean Mean@0.1 ckpt log
pose_hourglass_52 256x256 0.889 0.317 ckpt log
pose_hourglass_52 384x384 0.894 0.366 ckpt log



Mpii_trb Dataset


Topdown Heatmap + Resnet + Mpii on Mpii_trb

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
MPII-TRB (ICCV'2019)
@inproceedings{duan2019trb,
  title={{TRB}: A Novel Triplet Representation for Understanding {2D} Human Body},
  author={Duan, Haodong and Lin, Kwan-Yee and Jin, Sheng and Liu, Wentao and Qian, Chen and Ouyang, Wanli},
  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
  pages={9479--9488},
  year={2019}
}

Results on MPII-TRB val set

Arch Input Size Skeleton Acc Contour Acc Mean Acc ckpt log
pose_resnet_50 256x256 0.887 0.858 0.868 ckpt log
pose_resnet_101 256x256 0.890 0.863 0.873 ckpt log
pose_resnet_152 256x256 0.897 0.868 0.879 ckpt log



Ochuman Dataset


Topdown Heatmap + Resnet on Ochuman

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
OCHuman (CVPR'2019)
@inproceedings{zhang2019pose2seg,
  title={{Pose2Seg}: Detection free human instance segmentation},
  author={Zhang, Song-Hai and Li, Ruilong and Dong, Xin and Rosin, Paul and Cai, Zixi and Han, Xi and Yang, Dingcheng and Huang, Haozhi and Hu, Shi-Min},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={889--898},
  year={2019}
}

Results on OCHuman test dataset with ground-truth bounding boxes

Following the common setting, the models are trained on the COCO train dataset and evaluated on the OCHuman dataset.

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_resnet_50 256x192 0.546 0.726 0.593 0.592 0.755 ckpt log
pose_resnet_50 384x288 0.539 0.723 0.574 0.588 0.756 ckpt log
pose_resnet_101 256x192 0.559 0.724 0.606 0.605 0.751 ckpt log
pose_resnet_101 384x288 0.571 0.715 0.615 0.615 0.748 ckpt log
pose_resnet_152 256x192 0.570 0.725 0.617 0.616 0.754 ckpt log
pose_resnet_152 384x288 0.582 0.723 0.627 0.627 0.752 ckpt log

Topdown Heatmap + Hrnet on Ochuman

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019},
}
OCHuman (CVPR'2019)
@inproceedings{zhang2019pose2seg,
  title={{Pose2Seg}: Detection free human instance segmentation},
  author={Zhang, Song-Hai and Li, Ruilong and Dong, Xin and Rosin, Paul and Cai, Zixi and Han, Xi and Yang, Dingcheng and Huang, Haozhi and Hu, Shi-Min},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={889--898},
  year={2019}
}

Results on OCHuman test dataset with ground-truth bounding boxes

Following the common setting, the models are trained on the COCO train dataset and evaluated on the OCHuman dataset.

Arch Input Size AP AP50 AP75 AR AR50 ckpt log
pose_hrnet_w32 256x192 0.591 0.748 0.641 0.631 0.775 ckpt log
pose_hrnet_w32 384x288 0.606 0.748 0.650 0.647 0.776 ckpt log
pose_hrnet_w48 256x192 0.611 0.752 0.663 0.648 0.778 ckpt log
pose_hrnet_w48 384x288 0.616 0.749 0.663 0.653 0.773 ckpt log



Posetrack18 Dataset


Topdown Heatmap + Hrnet on Posetrack18

HRNet (CVPR'2019)
@inproceedings{sun2019deep,
  author    = {Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
  title     = {Deep high-resolution representation learning for human pose estimation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {5693--5703},
  year      = {2019},
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={{PoseTrack}: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}

Results on PoseTrack2018 val with ground-truth bounding boxes

Arch Input Size Head Shou Elb Wri Hip Knee Ankl Total ckpt log
pose_hrnet_w32 256x192 87.4 88.6 84.3 78.5 79.7 81.8 78.8 83.0 ckpt log
pose_hrnet_w32 384x288 87.0 88.8 85.0 80.1 80.5 82.6 79.4 83.6 ckpt log
pose_hrnet_w48 256x192 88.2 90.1 85.8 80.8 80.7 83.3 80.3 84.4 ckpt log
pose_hrnet_w48 384x288 87.8 90.0 85.9 81.3 81.1 83.3 80.9 84.5 ckpt log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.

Results on PoseTrack2018 val with MMDetection pre-trained Cascade R-CNN (X-101-64x4d-FPN) human detector

Arch Input Size Head Shou Elb Wri Hip Knee Ankl Total ckpt log
pose_hrnet_w32 256x192 78.0 82.9 79.5 73.8 76.9 76.6 70.2 76.9 ckpt log
pose_hrnet_w32 384x288 79.9 83.6 80.4 74.5 74.8 76.1 70.5 77.3 ckpt log
pose_hrnet_w48 256x192 80.1 83.4 80.6 74.8 74.3 76.8 70.4 77.4 ckpt log
pose_hrnet_w48 384x288 80.2 83.8 80.9 75.2 74.7 76.7 71.7 77.8 ckpt log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.


Topdown Heatmap + Resnet on Posetrack18

SimpleBaseline2D (ECCV'2018)
@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple baselines for human pose estimation and tracking},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {466--481},
  year      = {2018},
}
ResNet (CVPR'2016)
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}
PoseTrack18 (CVPR'2018)
@inproceedings{andriluka2018posetrack,
  title={{PoseTrack}: A benchmark for human pose estimation and tracking},
  author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5167--5176},
  year={2018}
}

Results on PoseTrack2018 val with ground-truth bounding boxes

Arch Input Size Head Shou Elb Wri Hip Knee Ankl Total ckpt log
pose_resnet_50 256x192 86.5 87.5 82.3 75.6 79.9 78.6 74.0 81.0 ckpt log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.

Results on PoseTrack2018 val with MMDetection pre-trained Cascade R-CNN (X-101-64x4d-FPN) human detector

Arch Input Size Head Shou Elb Wri Hip Knee Ankl Total ckpt log
pose_resnet_50 256x192 78.9 81.9 77.8 70.8 75.3 73.2 66.4 75.2 ckpt log

The models are first pre-trained on COCO dataset, and then fine-tuned on PoseTrack18.

Read the Docs v: latest
Versions
latest
1.x
v1.0.0rc0
v0.29.0
v0.28.0
v0.27.0
dev-1.x
Downloads
pdf
html
epub
On Read the Docs
Project Home
Builds

Free document hosting provided by Read the Docs.