ITOP Dataset

Dataset · Open Access

Haque, Albert; Peng, Boya; Luo, Zelun; Alahi, Alexandre; Yeung, Serena; Fei-Fei, Li (Stanford University)
**Files**

All files are served from the record's file bucket at https://zenodo.org/api/files/aefee483-2fb7-4f1c-a24e-7fe82399c5f4/<filename>. Sizes refer to the compressed downloads; MD5 checksums are provided for integrity checks.

| File | Size | MD5 checksum |
|---|---|---|
| ITOP_side_test_depth_map.h5.gz | 245.1 MB | 65f431c9f7540db6118d99bc9bae7576 |
| ITOP_side_test_images.h5.gz | 258.0 MB | 1803c50e44746dca7ccf03c2d46c466e |
| ITOP_side_test_labels.h5.gz | 3.7 MB | 7205b0ba47f76892742ded774754d7a1 |
| ITOP_side_test_point_cloud.h5.gz | 2.1 GB | 3f5227d6f260011b19f325fffde08a65 |
| ITOP_side_train_depth_map.h5.gz | 926.2 MB | 80736f716b0e83f7cc73ec85bb13effc |
| ITOP_side_train_images.h5.gz | 1.0 GB | e325ed23ed962f86594b70f17c048a30 |
| ITOP_side_train_labels.h5.gz | 16.8 MB | e62a67678d5cddc13e07cfdd1eb0a176 |
| ITOP_side_train_point_cloud.h5.gz | 7.8 GB | 6ca457e8471e7514222624e937e11a9c |
| ITOP_top_test_depth_map.h5.gz | 245.5 MB | d8ad31ecbbcd13ee5e1f02874c0cb3d0 |
| ITOP_top_test_images.h5.gz | 246.7 MB | 21f702e3ce0e5602340957e6cae6148a |
| ITOP_top_test_labels.h5.gz | 9.3 MB | 6a9c5d7845dc7fdf6d168ee4dd356afd |
| ITOP_top_test_point_cloud.h5.gz | 2.0 GB | 3ac977488864e27ac13e8cf17d03f8c7 |
| ITOP_top_train_depth_map.h5.gz | 917.9 MB | 159a8694f653f5b639252de84469f7b9 |
| ITOP_top_train_images.h5.gz | 923.9 MB | 6e2daf5be0f0bf6eddf611913e718417 |
| ITOP_top_train_labels.h5.gz | 32.2 MB | 95776e7beeb9a769bef25eb336afb5bd |
| ITOP_top_train_point_cloud.h5.gz | 7.6 GB | f5fd64240296be0bfff5318beca19884 |
| sample_front.jpg | 20.5 kB | 86d7be54b61841fe22b27949fffc042d |
| sample_front_labeled.jpg | 22.9 kB | 25aaef40a70ad75f452438824a2bb71f |
| sample_top.jpg | 18.7 kB | 0afbd5971faee803d14969e4c2a71267 |
| sample_top_labeled.jpg | 17.5 kB | 5d6c045333e9f520c24d335f57e0422e |

**Summary**

The ITOP dataset (Invariant Top View) contains 100K depth images from side and top views of a person in a scene. For each image, the locations of 15 human body parts are labeled with 3-dimensional (x, y, z) coordinates relative to the sensor's position. Read the full paper for more context [pdf](https://arxiv.org/pdf/1603.07076.pdf).

**Getting Started**

Download, then decompress the h5.gz file:

```bash
gunzip ITOP_side_test_depth_map.h5.gz
```

Using Python and [h5py](https://www.h5py.org/) (`pip install h5py` or `conda install h5py`), we can load the contents:

```python
import h5py
import numpy as np

f = h5py.File('ITOP_side_test_depth_map.h5', 'r')
data, ids = f.get('data'), f.get('id')
data, ids = np.asarray(data), np.asarray(ids)

print(data.shape, ids.shape)
# (10501, 240, 320) (10501,)
```

**Note:** For any of the *_images.h5.gz files, the underlying file is a tar archive, not an h5 file. Rename the extension from h5.gz to tar.gz before opening. The following commands will work:

```bash
mv ITOP_side_test_images.h5.gz ITOP_side_test_images.tar.gz
tar xf ITOP_side_test_images.tar.gz
```

**Metadata**

File sizes for images, depth maps, point clouds, and labels refer to the uncompressed size.

| View | Split | Frames | People | Images | Depth Map | Point Cloud | Labels |
|---|---|---|---|---|---|---|---|
| Side | Train | 39,795 | 16 | 1.1 GiB | 5.7 GiB | 18 GiB | 2.9 GiB |
| Side | Test | 10,501 | 4 | 276 MiB | 1.6 GiB | 4.6 GiB | 771 MiB |
| Top | Train | 39,795 | 16 | 974 MiB | 5.7 GiB | 18 GiB | 2.9 GiB |
| Top | Test | 10,501 | 4 | 261 MiB | 1.6 GiB | 4.6 GiB | 771 MiB |

**Data Schema**

Each file contains several HDF5 datasets at the root level. Dimensions, attributes, and data types are listed below. The key refers to the (HDF5) dataset name. Let \(n\) denote the number of images.

**Transformation**

To convert from point clouds to a \(240 \times 320\) image, the following transformations were used. Let \(x_{\textrm{img}}\) and \(y_{\textrm{img}}\) denote the \((x, y)\) coordinate in the image plane.
Using the raw point cloud \((x, y, z)\) real-world coordinates, we compute the depth map as follows: \(x_{\textrm{img}} = \frac{x}{Cz} + 160\) and \(y_{\textrm{img}} = -\frac{y}{Cz} + 120\), where \(C \approx 3.50 \times 10^{-3} = 0.0035\) is the intrinsic camera calibration parameter. This yields the depth map \((x_{\textrm{img}}, y_{\textrm{img}}, z)\). A worked example of this projection is included after the data schema below.

**Joint ID (Index) Mapping**

```
joint_id_to_name = {
    0: 'Head',        8: 'Torso',
    1: 'Neck',        9: 'R Hip',
    2: 'R Shoulder', 10: 'L Hip',
    3: 'L Shoulder', 11: 'R Knee',
    4: 'R Elbow',    12: 'L Knee',
    5: 'L Elbow',    13: 'R Foot',
    6: 'R Hand',     14: 'L Foot',
    7: 'L Hand',
}
```

**Depth Maps**

- *Key:* id
  - *Dimensions:* \((n,)\)
  - *Data Type:* uint8
  - *Description:* Frame identifier in the form XX_YYYYY, where XX is the person's ID number and YYYYY is the frame number.
- *Key:* data
  - *Dimensions:* \((n, 240, 320)\)
  - *Data Type:* float16
  - *Description:* Depth map (i.e. mesh) corresponding to a single frame. Depth values are in real-world meters (m).

**Point Clouds**

- *Key:* id
  - *Dimensions:* \((n,)\)
  - *Data Type:* uint8
  - *Description:* Frame identifier in the form XX_YYYYY, where XX is the person's ID number and YYYYY is the frame number.
- *Key:* data
  - *Dimensions:* \((n, 76800, 3)\)
  - *Data Type:* float16
  - *Description:* Point cloud containing 76,800 points (240 × 320). Each point is a 3D tuple measured in real-world meters (m).

**Labels**

- *Key:* id
  - *Dimensions:* \((n,)\)
  - *Data Type:* uint8
  - *Description:* Frame identifier in the form XX_YYYYY, where XX is the person's ID number and YYYYY is the frame number.
- *Key:* is_valid
  - *Dimensions:* \((n,)\)
  - *Data Type:* uint8
  - *Description:* Flag corresponding to the result of the human labeling effort. This is a boolean value (represented by an integer): a one (1) denotes clean, human-approved data, and a zero (0) denotes noisy human body part labels. If is_valid is zero, you should not use any of the provided human joint locations for that frame (a loading example that applies this filter follows the schema).
- *Key:* visible_joints
  - *Dimensions:* \((n, 15)\)
  - *Data Type:* int16
  - *Description:* Binary mask indicating whether each human joint is visible or occluded, denoted by \(\alpha\) in the paper. If \(\alpha_j = 1\), the \(j\)-th joint is visible (i.e. not occluded); if \(\alpha_j = 0\), the \(j\)-th joint is occluded.
- *Key:* image_coordinates
  - *Dimensions:* \((n, 15, 2)\)
  - *Data Type:* int16
  - *Description:* Two-dimensional \((x, y)\) points giving the location of each joint in the depth image or depth map.
- *Key:* real_world_coordinates
  - *Dimensions:* \((n, 15, 3)\)
  - *Data Type:* float16
  - *Description:* Three-dimensional \((x, y, z)\) points giving the location of each joint in real-world meters (m).
- *Key:* segmentation
  - *Dimensions:* \((n, 240, 320)\)
  - *Data Type:* int8
  - *Description:* Pixel-wise assignment of body part labels. The background class (i.e. no body part) is denoted by −1.
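As a worked example of the projection in the Transformation section, the sketch below maps one point-cloud frame onto a \(240 \times 320\) depth map. This is a minimal sketch rather than official dataset code: the function name and the choice of 0 as the fill value for pixels that receive no point are assumptions.

```python
import numpy as np

C = 0.0035  # intrinsic camera calibration parameter from the Transformation section


def point_cloud_to_depth_map(point_cloud, height=240, width=320):
    """Project one (76800, 3) point-cloud frame onto a (height, width) depth map.

    Empty pixels are left at 0; that fill value is an assumption, not part
    of the dataset specification.
    """
    x, y, z = point_cloud[:, 0], point_cloud[:, 1], point_cloud[:, 2]
    valid = z > 0                       # skip points with no depth reading
    x, y, z = x[valid], y[valid], z[valid]

    # Apply the transformation: x_img = x / (C z) + 160, y_img = -y / (C z) + 120
    x_img = np.round(x / (C * z) + 160).astype(int)
    y_img = np.round(-y / (C * z) + 120).astype(int)

    depth_map = np.zeros((height, width), dtype=np.float32)
    inside = (x_img >= 0) & (x_img < width) & (y_img >= 0) & (y_img < height)
    depth_map[y_img[inside], x_img[inside]] = z[inside]
    return depth_map
```

Applied to a frame from one of the *_point_cloud.h5 files, the output should roughly reproduce the corresponding frame in the matching *_depth_map.h5 file.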
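To make the label schema concrete, here is a minimal loading sketch that keeps only human-approved frames (is_valid equal to 1) and prints the visible joints of the first such frame by name. The file name is just one of the label files listed above; the dataset keys follow the schema, while the printing format is an illustrative choice.

```python
import h5py
import numpy as np

# Joint index mapping from the "Joint ID (Index) Mapping" section above.
joint_id_to_name = {
    0: 'Head', 1: 'Neck', 2: 'R Shoulder', 3: 'L Shoulder', 4: 'R Elbow',
    5: 'L Elbow', 6: 'R Hand', 7: 'L Hand', 8: 'Torso', 9: 'R Hip',
    10: 'L Hip', 11: 'R Knee', 12: 'L Knee', 13: 'R Foot', 14: 'L Foot',
}

with h5py.File('ITOP_side_test_labels.h5', 'r') as f:
    is_valid = np.asarray(f['is_valid'])              # (n,)
    visible = np.asarray(f['visible_joints'])         # (n, 15)
    coords = np.asarray(f['real_world_coordinates'])  # (n, 15, 3), meters

# Keep only frames whose labels were approved by human annotators.
valid_frames = np.flatnonzero(is_valid == 1)
first = valid_frames[0]

# Print the real-world position of every visible (non-occluded) joint.
for j in range(15):
    if visible[first, j] == 1:
        x, y, z = coords[first, j]
        print(f'{joint_id_to_name[j]:>10}: ({x:.3f}, {y:.3f}, {z:.3f}) m')
```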
**Citation**

If you would like to cite our work, please use the following.

**Haque A, Peng B, Luo Z, Alahi A, Yeung S, Fei-Fei L. (2016). Towards Viewpoint Invariant 3D Human Pose Estimation. European Conference on Computer Vision. Amsterdam, Netherlands. Springer.**

```
@inproceedings{haque2016viewpoint,
  title     = {Towards Viewpoint Invariant 3D Human Pose Estimation},
  author    = {Haque, Albert and Peng, Boya and Luo, Zelun and Alahi, Alexandre and Yeung, Serena and Fei-Fei, Li},
  booktitle = {European Conference on Computer Vision},
  month     = {October},
  year      = {2016}
}
```

**Record Metadata**

- DOI: 10.5281/zenodo.3932973 (all versions: 10.5281/zenodo.3932972)
- License: CC BY 4.0
- Resource type: Dataset, version 1.0
- Publication date: 2016-10-08
- Language: English
- Keywords: depth sensor, human pose estimation, computer vision, 3D vision
- Related identifier: arXiv:1603.07076 (preprint)
**Usage Statistics**

| | All versions | This version |
|---|---|---|
| Views | 1,592 | 1,592 |
| Downloads | 6,614 | 6,614 |
| Data volume | 18.8 TB | 18.8 TB |
| Unique views | 1,320 | 1,320 |
| Unique downloads | 1,012 | 1,012 |