{
  "access": {
    "embargo": {
      "active": false,
      "reason": null
    },
    "files": "public",
    "record": "public",
    "status": "open"
  },
  "created": "2026-04-17T10:16:57.417139+00:00",
  "custom_fields": {
    "code:programmingLanguage": [
      {
        "id": "python",
        "title": {
          "en": "Python"
        }
      }
    ],
    "journal:journal": {
      "title": "Journal of Archaeological Science"
    }
  },
  "deletion_status": {
    "is_deleted": false,
    "status": "P"
  },
  "files": {
    "count": 27,
    "enabled": true,
    "entries": {
      ".dockerignore": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:f236076006e1a92b3592a4bead60354b",
        "ext": "bin",
        "id": "819f4314-adde-4597-8ab2-25b6436b2e17",
        "key": ".dockerignore",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/.dockerignore/content",
          "self": "https://zenodo.org/api/records/19628745/files/.dockerignore"
        },
        "metadata": {},
        "mimetype": "application/octet-stream",
        "size": 757,
        "storage_class": "L"
      },
      "DOCKER_GUIDE.md": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:d0256e83ac8202fee6aa7890b0f3242d",
        "ext": "md",
        "id": "b222f500-0d0c-42a4-a4d3-7c909139cdd6",
        "key": "DOCKER_GUIDE.md",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/DOCKER_GUIDE.md/content",
          "self": "https://zenodo.org/api/records/19628745/files/DOCKER_GUIDE.md"
        },
        "metadata": {},
        "mimetype": "application/octet-stream",
        "size": 7732,
        "storage_class": "L"
      },
      "Dockerfile": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:3c345f6a43bfa286ab771f7340317d77",
        "ext": "bin",
        "id": "de7db62a-5a29-4c82-b693-8f58ae0d627f",
        "key": "Dockerfile",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/Dockerfile/content",
          "self": "https://zenodo.org/api/records/19628745/files/Dockerfile"
        },
        "metadata": {},
        "mimetype": "application/octet-stream",
        "size": 4635,
        "storage_class": "L"
      },
      "Machine Learning Framework for Chronological Classification of Archaeological Samples Based on Lithic Typology Distributions.docx": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:3a05dc4ff81239da99218276308967dd",
        "ext": "docx",
        "id": "5d70efad-5a79-4800-96e9-30812cd1e3b6",
        "key": "Machine Learning Framework for Chronological Classification of Archaeological Samples Based on Lithic Typology Distributions.docx",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/Machine%20Learning%20Framework%20for%20Chronological%20Classification%20of%20Archaeological%20Samples%20Based%20on%20Lithic%20Typology%20Distributions.docx/content",
          "self": "https://zenodo.org/api/records/19628745/files/Machine%20Learning%20Framework%20for%20Chronological%20Classification%20of%20Archaeological%20Samples%20Based%20on%20Lithic%20Typology%20Distributions.docx"
        },
        "metadata": {},
        "mimetype": "application/octet-stream",
        "size": 28852,
        "storage_class": "L"
      },
      "Puntas_TODOS.ods": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:eae25d7c69463288ca1a0b365983eb63",
        "ext": "ods",
        "id": "152404d5-663f-413b-a8fb-0d780a4ff8dc",
        "key": "Puntas_TODOS.ods",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/Puntas_TODOS.ods/content",
          "self": "https://zenodo.org/api/records/19628745/files/Puntas_TODOS.ods"
        },
        "metadata": {},
        "mimetype": "application/octet-stream",
        "size": 28110,
        "storage_class": "L"
      },
      "README_DOCKER.md": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:fc6a00d0f4ac098b0389b67526fa0b63",
        "ext": "md",
        "id": "c86e7df9-05e2-46c4-98f3-bbfa2467ad7e",
        "key": "README_DOCKER.md",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/README_DOCKER.md/content",
          "self": "https://zenodo.org/api/records/19628745/files/README_DOCKER.md"
        },
        "metadata": {},
        "mimetype": "application/octet-stream",
        "size": 10164,
        "storage_class": "L"
      },
      "Source.mat": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:9622d6f05b17dbaff62730e73410a220",
        "ext": "mat",
        "id": "8320c2ae-6cd7-4ced-b962-cc9549d17083",
        "key": "Source.mat",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/Source.mat/content",
          "self": "https://zenodo.org/api/records/19628745/files/Source.mat"
        },
        "metadata": {},
        "mimetype": "application/octet-stream",
        "size": 9930,
        "storage_class": "L"
      },
      "aposteriori_predictions_wide.csv": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:3dc1a5c67494a3d04cbbdcdc2cabdf1f",
        "ext": "csv",
        "id": "970d9e2d-acdf-42ea-9006-b761112b0323",
        "key": "aposteriori_predictions_wide.csv",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/aposteriori_predictions_wide.csv/content",
          "self": "https://zenodo.org/api/records/19628745/files/aposteriori_predictions_wide.csv"
        },
        "metadata": {},
        "mimetype": "text/csv",
        "size": 51331,
        "storage_class": "L"
      },
      "augmentation.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:a2822a5984d3c49d76d2f735c99bd486",
        "ext": "py",
        "id": "afe81aa7-2bb0-4d38-a340-6266cc14871d",
        "key": "augmentation.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/augmentation.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/augmentation.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 2450,
        "storage_class": "L"
      },
      "benchmark_runner.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:6196d16b3e3811297488557ea00ec785",
        "ext": "py",
        "id": "70731378-60de-403f-bff0-ba90d142a816",
        "key": "benchmark_runner.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/benchmark_runner.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/benchmark_runner.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 5498,
        "storage_class": "L"
      },
      "collect_predictions_by_id.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:0ba57539b262886b83ba4f242adb1f59",
        "ext": "py",
        "id": "8eee7de0-7cb5-403f-9bac-ab4a743894ad",
        "key": "collect_predictions_by_id.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/collect_predictions_by_id.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/collect_predictions_by_id.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 4995,
        "storage_class": "L"
      },
      "data_loader.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:c5702c7a2a946c6d5a02f6e3463335b8",
        "ext": "py",
        "id": "928eb6d3-8948-4a57-90bb-086da5890f49",
        "key": "data_loader.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/data_loader.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/data_loader.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 7024,
        "storage_class": "L"
      },
      "data_pipeline.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:f6beb6805d0705dbeeea3bbd6397df01",
        "ext": "py",
        "id": "d4315890-f456-48d0-a9f4-058ed462f715",
        "key": "data_pipeline.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/data_pipeline.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/data_pipeline.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 2741,
        "storage_class": "L"
      },
      "dirichlet_pipeline.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:43da751543df4c4fb751abd6fb175190",
        "ext": "py",
        "id": "9f54ac85-3be2-4cd5-9c70-6103867c9e61",
        "key": "dirichlet_pipeline.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/dirichlet_pipeline.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/dirichlet_pipeline.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 7936,
        "storage_class": "L"
      },
      "dirichlet_predictive_model.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:b8bf56f76f29dc5718a0be883e03c2c7",
        "ext": "py",
        "id": "49f1b83a-b8b9-49ca-b880-981e01fd0429",
        "key": "dirichlet_predictive_model.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/dirichlet_predictive_model.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/dirichlet_predictive_model.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 10341,
        "storage_class": "L"
      },
      "ensemble.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:5e278da55caf37e724d7045493b1ef06",
        "ext": "py",
        "id": "647627e4-7a81-4c9f-94b9-6d27648215a9",
        "key": "ensemble.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/ensemble.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/ensemble.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 3914,
        "storage_class": "L"
      },
      "generate_figures.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:69e16fe6f359906964c32b200f0fdef7",
        "ext": "py",
        "id": "911b38c0-9637-4962-8d95-e59a89b2af9e",
        "key": "generate_figures.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/generate_figures.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/generate_figures.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 9746,
        "storage_class": "L"
      },
      "generate_global_training_report.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:13aa71828ae8dd27ee24fc763a3fcd05",
        "ext": "py",
        "id": "72327b7d-23a6-437b-8729-e52dfb44bd2d",
        "key": "generate_global_training_report.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/generate_global_training_report.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/generate_global_training_report.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 2575,
        "storage_class": "L"
      },
      "io_paths.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:c4e40b5407859dd11f97cb61e03086ca",
        "ext": "py",
        "id": "b863ce43-95d3-4647-b629-e4689e79f360",
        "key": "io_paths.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/io_paths.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/io_paths.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 798,
        "storage_class": "L"
      },
      "main.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:1e64c3ab075f6f507cf2126251d45651",
        "ext": "py",
        "id": "52f5aaeb-3ef9-4fe4-bf3c-b2dec8e36fce",
        "key": "main.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/main.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/main.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 47986,
        "storage_class": "L"
      },
      "model_loader.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:ec32727565230a0502f487eb6a18392a",
        "ext": "py",
        "id": "e4549895-4749-454d-b412-11aa0cf6992a",
        "key": "model_loader.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/model_loader.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/model_loader.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 754,
        "storage_class": "L"
      },
      "predictions_summary.csv": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:aed28557250045a0ac2cccc82e020aad",
        "ext": "csv",
        "id": "8f43710c-aa98-4a3f-866d-f9d4fda811a9",
        "key": "predictions_summary.csv",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/predictions_summary.csv/content",
          "self": "https://zenodo.org/api/records/19628745/files/predictions_summary.csv"
        },
        "metadata": {},
        "mimetype": "text/csv",
        "size": 80497,
        "storage_class": "L"
      },
      "report_html.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:ed2889ee0404a1bb89c37f4880351719",
        "ext": "py",
        "id": "d35d5c6b-c7db-49ea-8676-5ac6cc9b2b26",
        "key": "report_html.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/report_html.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/report_html.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 24836,
        "storage_class": "L"
      },
      "requirements.txt": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:b221831699a65c976aec7cc135b7d641",
        "ext": "txt",
        "id": "15a7edcb-41b8-4de4-bdcf-a6556d7a8897",
        "key": "requirements.txt",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/requirements.txt/content",
          "self": "https://zenodo.org/api/records/19628745/files/requirements.txt"
        },
        "metadata": {},
        "mimetype": "text/plain",
        "size": 780,
        "storage_class": "L"
      },
      "run_pipeline.sh": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:5e03646405ba248ca8e7badfc213a9a2",
        "ext": "sh",
        "id": "7c729013-63e2-41fb-a8b8-beb49e794954",
        "key": "run_pipeline.sh",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/run_pipeline.sh/content",
          "self": "https://zenodo.org/api/records/19628745/files/run_pipeline.sh"
        },
        "metadata": {},
        "mimetype": "application/x-sh",
        "size": 3688,
        "storage_class": "L"
      },
      "siamese_coincidencia_por_yacimiento.csv": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:782dc160b3f8fd8cb582ab3ee031ba99",
        "ext": "csv",
        "id": "cb6980df-2d18-4f5b-8f2f-eddb19e1d9d5",
        "key": "siamese_coincidencia_por_yacimiento.csv",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/siamese_coincidencia_por_yacimiento.csv/content",
          "self": "https://zenodo.org/api/records/19628745/files/siamese_coincidencia_por_yacimiento.csv"
        },
        "metadata": {},
        "mimetype": "text/csv",
        "size": 12649,
        "storage_class": "L"
      },
      "train_validate.py": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:d47a00a76f80e10693a188d7b134311b",
        "ext": "py",
        "id": "87b59da7-aca8-4a9c-b6f6-3fd9bfe3ddc1",
        "key": "train_validate.py",
        "links": {
          "content": "https://zenodo.org/api/records/19628745/files/train_validate.py/content",
          "self": "https://zenodo.org/api/records/19628745/files/train_validate.py"
        },
        "metadata": {},
        "mimetype": "text/x-python",
        "size": 43325,
        "storage_class": "L"
      }
    },
    "order": [],
    "total_bytes": 414044
  },
  "id": "19628745",
  "is_draft": false,
  "is_published": true,
  "links": {
    "access": "https://zenodo.org/api/records/19628745/access",
    "access_grants": "https://zenodo.org/api/records/19628745/access/grants",
    "access_links": "https://zenodo.org/api/records/19628745/access/links",
    "access_request": "https://zenodo.org/api/records/19628745/access/request",
    "access_users": "https://zenodo.org/api/records/19628745/access/users",
    "archive": "https://zenodo.org/api/records/19628745/files-archive",
    "archive_media": "https://zenodo.org/api/records/19628745/media-files-archive",
    "communities": "https://zenodo.org/api/records/19628745/communities",
    "communities-suggestions": "https://zenodo.org/api/records/19628745/communities-suggestions",
    "doi": "https://doi.org/10.5281/zenodo.19628745",
    "draft": "https://zenodo.org/api/records/19628745/draft",
    "file_modification": "https://zenodo.org/api/records/19628745/file-modification",
    "files": "https://zenodo.org/api/records/19628745/files",
    "latest": "https://zenodo.org/api/records/19628745/versions/latest",
    "latest_html": "https://zenodo.org/records/19628745/latest",
    "media_files": "https://zenodo.org/api/records/19628745/media-files",
    "parent": "https://zenodo.org/api/records/19628744",
    "parent_doi": "https://doi.org/10.5281/zenodo.19628744",
    "parent_doi_html": "https://zenodo.org/doi/10.5281/zenodo.19628744",
    "parent_html": "https://zenodo.org/records/19628744",
    "preview_html": "https://zenodo.org/records/19628745?preview=1",
    "quota_increase": "https://zenodo.org/api/records/19628745/quota-increase",
    "request_deletion": "https://zenodo.org/api/records/19628745/request-deletion",
    "requests": "https://zenodo.org/api/records/19628745/requests",
    "reserve_doi": "https://zenodo.org/api/records/19628745/draft/pids/doi",
    "self": "https://zenodo.org/api/records/19628745",
    "self_doi": "https://doi.org/10.5281/zenodo.19628745",
    "self_doi_html": "https://zenodo.org/doi/10.5281/zenodo.19628745",
    "self_html": "https://zenodo.org/records/19628745",
    "self_iiif_manifest": "https://zenodo.org/api/iiif/record:19628745/manifest",
    "self_iiif_sequence": "https://zenodo.org/api/iiif/record:19628745/sequence/default",
    "versions": "https://zenodo.org/api/records/19628745/versions"
  },
  "media_files": {
    "count": 0,
    "enabled": false,
    "entries": {},
    "order": [],
    "total_bytes": 0
  },
  "metadata": {
    "additional_descriptions": [
      {
        "description": "<p>Computational Reproducibility Guide</p>\n<p><strong>Article:</strong> <em>A Machine Learning Framework for Chronological Classification of Archaeological Samples Based on Lithic Typology Distributions</em> <strong>Journal:</strong> Journal of Archaeological Science <strong>Complies with:</strong> JAS Data &amp; Code Availability / Transparency &amp; Replicability Policy</p>\n\n\n<h2 class=\"atx\">Overview</h2>\n<p>This repository contains all code and data required to fully reproduce the quantitative results reported in the article. The pipeline is containerised with <a href=\"https://www.docker.com/\">Docker</a> so that any researcher can obtain identical outputs on any operating system (Windows, macOS, Linux) without manual dependency installation.</p>\n\n\n<h2 class=\"atx\">Repository structure</h2>\n<pre><code class=\"fenced-code-block\">scripts/\n\u251c\u2500\u2500 Dockerfile                        &larr; Container definition (pinned base image + dependencies)\n\u251c\u2500\u2500 requirements.txt                  &larr; Exact Python package versions\n\u251c\u2500\u2500 run_pipeline.sh                   &larr; Entrypoint: runs training &rarr; validation &rarr; predictions\n\u251c\u2500\u2500 .dockerignore                     &larr; Files excluded from the Docker build context\n\u2502\n\u251c\u2500\u2500 main.py                           &larr; Pipeline orchestrator (CLI entry point)\n\u251c\u2500\u2500 train_validate.py                 &larr; Model training &amp; validation (all classifiers)\n\u251c\u2500\u2500 benchmark_runner.py               &larr; Runs all model &times; augmentation combinations\n\u251c\u2500\u2500 dirichlet_predictive_model.py     &larr; Bayesian Dirichlet-multinomial classifier\n\u251c\u2500\u2500 dirichlet_pipeline.py             &larr; Dirichlet feature extraction pipeline\n\u251c\u2500\u2500 data_loader.py                    &larr; ODS / CSV data loading &amp; cleaning\n\u251c\u2500\u2500 data_pipeline.py     
             &larr; Unified train / valid / predict data bundle\n\u251c\u2500\u2500 augmentation.py                   &larr; Bootstrap &amp; Poisson jitter augmentation\n\u251c\u2500\u2500 ensemble.py                       &larr; Ensemble consensus &amp; King-model selection\n\u251c\u2500\u2500 model_loader.py                   &larr; Deserialise saved joblib models\n\u251c\u2500\u2500 collect_predictions_by_id.py      &larr; Aggregate per-model CSVs into wide table\n\u251c\u2500\u2500 report_html.py                    &larr; HTML report generation\n\u251c\u2500\u2500 generate_figures.py               &larr; Generates publication figures (Sankey, confusion matrix, type portrait, stacked bar)\n\u251c\u2500\u2500 generate_global_training_report.py&larr; Cross-experiment global summary\n\u251c\u2500\u2500 io_paths.py                       &larr; Output directory layout helpers\n\u2502\n\u2514\u2500\u2500 data/\n    \u2514\u2500\u2500 Puntas_TODOS.ods              &larr; Input dataset (APRIORI / MODEL_CHECKING / APOSTERIORI sheets)</code></pre>\n\n\n<h2 class=\"atx\">Prerequisites</h2>\n<table>\n\n<tbody><tr>\n<th>Software</th>\n<th>Minimum version</th>\n<th>Download</th>\n</tr>\n\n</tbody><tbody>\n<tr>\n<td>Docker Desktop (Windows / macOS)</td>\n<td>24.x</td>\n<td><a href=\"https://www.docker.com/products/docker-desktop\">https://www.docker.com/products/docker-desktop</a></td>\n</tr>\n<tr>\n<td>Docker Engine (Linux)</td>\n<td>24.x</td>\n<td><a href=\"https://docs.docker.com/engine/install/\">https://docs.docker.com/engine/install/</a></td>\n</tr>\n</tbody>\n</table>\n<p>No Python installation is required on the host machine.</p>\n\n\n<h2 class=\"atx\">Step-by-step reproduction</h2>\n<h3 class=\"atx\">1 &mdash; Clone / download the code</h3>\n<pre><code class=\"fenced-code-block language-bash\"># If distributed via a repository:\ngit clone &lt;repository-url&gt;\ncd &lt;repository&gt;/scripts\n\n# Or simply unzip the supplementary archive and enter the scripts/ 
folder.</code></pre>\n<h3 class=\"atx\">2 &mdash; Build the Docker image</h3>\n<p>Run this command from inside the <code>scripts/</code> directory (where the <code>Dockerfile</code> lives):</p>\n<pre><code class=\"fenced-code-block language-bash\">docker build -t puntas-ml .</code></pre>\n<p>This will:</p>\n<ul>\n<li>Pull the pinned base image (<code>python:3.11.9-slim-bookworm</code>)</li>\n<li>Install all Python packages at their exact pinned versions (see <code>requirements.txt</code>)</li>\n<li>Copy all source files and the input dataset into the image</li>\n</ul>\n<p>Expected build time: <strong>2&ndash;5 minutes</strong> (depends on internet speed; packages are ~500 MB).</p>\n<h3 class=\"atx\">3 &mdash; Create an output directory on your machine</h3>\n<p><strong>Windows (PowerShell):</strong></p>\n<pre><code class=\"fenced-code-block language-powershell\">mkdir output</code></pre>\n<p><strong>macOS / Linux:</strong></p>\n<pre><code class=\"fenced-code-block language-bash\">mkdir output</code></pre>\n<h3 class=\"atx\">4 &mdash; Run the full pipeline</h3>\n<p><strong>Windows (PowerShell):</strong></p>\n<pre><code class=\"fenced-code-block language-powershell\">docker run --rm -v \"${PWD}\\output:/app/output\" puntas-ml</code></pre>\n<p><strong>macOS / Linux:</strong></p>\n<pre><code class=\"fenced-code-block language-bash\">docker run --rm -v \"$(pwd)/output:/app/output\" puntas-ml</code></pre>\n<p>The container will execute three sequential steps:</p>\n<ol>\n<li><strong>Training &amp; validation</strong> &mdash; trains all classifiers (Logistic Regression, SVM, Random Forest, MLP, Prototypical, four Siamese variants) under all augmentation strategies (none, bootstrap, bootstrap+Poisson) and both cross-validation modes (holdout, LOOCV), with and without Dirichlet features.</li>\n<li><strong>Consensus prediction</strong> &mdash; aggregates per-model predictions into a wide table with majority-vote consensus and King-model selection (by 
F1-macro).</li>\n<li><strong>Output collection</strong> &mdash; copies all artefacts to <code>/app/output</code> (bound to your local <code>output/</code> folder).</li>\n</ol>\n<p>Expected runtime: <strong>20&ndash;60 minutes</strong> on a modern laptop (CPU only; no GPU required).</p>\n<h3 class=\"atx\">5 &mdash; Inspect the results</h3>\n<p>After the run completes, the <code>output/</code> directory will contain:</p>\n<pre><code class=\"fenced-code-block\">output/\n\u251c\u2500\u2500 ml_train/\n\u2502   \u251c\u2500\u2500 index.html                            &larr; Interactive benchmark overview\n\u2502   \u251c\u2500\u2500 benchmark_report_*.html               &larr; Per-strategy HTML reports\n\u2502   \u251c\u2500\u2500 summary_*.csv                         &larr; Metrics table (accuracy, F1, etc.)\n\u2502   \u251c\u2500\u2500 &lt;run_id&gt;/\n\u2502   \u2502   \u251c\u2500\u2500 models/model.joblib               &larr; Serialised trained model\n\u2502   \u2502   \u251c\u2500\u2500 metrics/metrics.json              &larr; Validation metrics (JSON)\n\u2502   \u2502   \u251c\u2500\u2500 confusion_matrix.png              &larr; Confusion matrix plot\n\u2502   \u2502   \u2514\u2500\u2500 roc_curve.png                     &larr; ROC curve plot\n\u2502   \u2514\u2500\u2500 ...\n\u251c\u2500\u2500 predictions/\n\u2502   \u251c\u2500\u2500 &lt;run_id&gt;.csv                          &larr; Per-model APOSTERIORI predictions\n\u2502   \u251c\u2500\u2500 aposteriori_predictions_wide.csv      &larr; Wide consensus table\n\u2502   \u251c\u2500\u2500 aposteriori_predictions_wide.html     &larr; Human-readable consensus report\n\u2502   \u2514\u2500\u2500 predictions_summary.csv              &larr; King + Matched + per-model columns\n\u2514\u2500\u2500 figures/\n    \u251c\u2500\u2500 fig_sankey_600dpi.png                 &larr; Sankey: expert type &rarr; model prediction\n    \u251c\u2500\u2500 fig_confusion_norm_600dpi.png         &larr; Normalised confusion matrix\n    
\u251c\u2500\u2500 fig_breakdown_stacked_600dpi.png      &larr; Stacked bar prediction breakdown\n    \u2514\u2500\u2500 fig_type_portrait_600dpi.png          &larr; Per-type prediction portrait\n    \u251c\u2500\u2500 predictions/\n    \u2502   \u251c\u2500\u2500 dirichlet_posteriors_train.csv\n    \u2502   \u251c\u2500\u2500 dirichlet_posteriors_valid.csv\n    \u2502   \u2514\u2500\u2500 dirichlet_posteriors_aposteriori.csv\n    \u2514\u2500\u2500 plots/\n        \u2514\u2500\u2500 Dirichlet_*.svg                   &larr; Per-site Dirichlet probability plots</code></pre>\n\n\n<h2 class=\"atx\">Reproducibility guarantees</h2>\n<table>\n\n<tbody><tr>\n<th>Mechanism</th>\n<th>Implementation</th>\n</tr>\n\n</tbody><tbody>\n<tr>\n<td><strong>Fixed random seed</strong></td>\n<td><code>--random-state 42</code> passed to all stochastic estimators</td>\n</tr>\n<tr>\n<td><strong>Pinned Python version</strong></td>\n<td><code>python:3.11.9-slim-bookworm</code> (Dockerfile <code>FROM</code>)</td>\n</tr>\n<tr>\n<td><strong>Pinned library versions</strong></td>\n<td>All packages fixed in <code>requirements.txt</code></td>\n</tr>\n<tr>\n<td><strong>Disabled thread-level parallelism</strong></td>\n<td><code>OMP_NUM_THREADS=1</code>, <code>OPENBLAS_NUM_THREADS=1</code>, <code>MKL_NUM_THREADS=1</code></td>\n</tr>\n<tr>\n<td><strong>Disabled Python hash randomisation</strong></td>\n<td><code>PYTHONHASHSEED=0</code></td>\n</tr>\n<tr>\n<td><strong>Headless matplotlib backend</strong></td>\n<td><code>MPLBACKEND=Agg</code> &mdash; no display required</td>\n</tr>\n<tr>\n<td><strong>Non-root container execution</strong></td>\n<td>Runs as unprivileged <code>archaeo</code> user</td>\n</tr>\n</tbody>\n</table>\n<blockquote>\n<p><strong>Note on hardware variation:</strong> Floating-point arithmetic is deterministic given the same CPU instruction set. 
Minor numerical differences (&lt; 1e-10) may appear if comparing results between x86-64 and ARM64 (Apple Silicon) hosts, but these do not affect any classification outcome or reported metric.</p>\n</blockquote>\n\n\n<h2 class=\"atx\">Running individual scripts (advanced)</h2>\n<p>You can also enter the container interactively and run scripts manually:</p>\n<pre><code class=\"fenced-code-block language-bash\">docker run --rm -it -v \"$(pwd)/output:/app/output\" puntas-ml bash</code></pre>\n<p>Inside the container:</p>\n<pre><code class=\"fenced-code-block language-bash\"># Train only specific models\npython main.py data/Puntas_TODOS.ods \\\n    --models rf svm \\\n    --augmentations none \\\n    --outdir ml_train \\\n    --preddir predictions \\\n    --random-state 42\n\n# Run Dirichlet model only\npython dirichlet_predictive_model.py data/Puntas_TODOS.ods \\\n    --plot --verbose\n\n# Collect and summarise predictions\npython collect_predictions_by_id.py \\\n    --pred-dir predictions \\\n    --outdir ml_train \\\n    --metric f1_macro</code></pre>\n\n\n<h2 class=\"atx\">Software dependencies</h2>\n<p>All packages are installed from <a href=\"https://pypi.org/\">PyPI</a> at the exact versions listed below:</p>\n<table>\n\n<tbody><tr>\n<th>Package</th>\n<th>Version</th>\n<th>Purpose</th>\n</tr>\n\n</tbody><tbody>\n<tr>\n<td><code>numpy</code></td>\n<td>1.26.4</td>\n<td>Array operations</td>\n</tr>\n<tr>\n<td><code>pandas</code></td>\n<td>2.2.2</td>\n<td>Tabular data / ODS reading</td>\n</tr>\n<tr>\n<td><code>scipy</code></td>\n<td>1.13.1</td>\n<td>Statistical functions (Dirichlet log-likelihood)</td>\n</tr>\n<tr>\n<td><code>scikit-learn</code></td>\n<td>1.5.1</td>\n<td>Classical ML classifiers, CV, metrics</td>\n</tr>\n<tr>\n<td><code>joblib</code></td>\n<td>1.4.2</td>\n<td>Model serialisation &amp; parallelism control</td>\n</tr>\n<tr>\n<td><code>torch</code></td>\n<td>2.3.1</td>\n<td>Deep Siamese network (CPU 
build)</td>\n</tr>\n<tr>\n<td><code>odfpy</code></td>\n<td>1.4.1</td>\n<td>ODS spreadsheet engine</td>\n</tr>\n<tr>\n<td><code>openpyxl</code></td>\n<td>3.1.5</td>\n<td>Excel/ODS compatibility layer</td>\n</tr>\n<tr>\n<td><code>matplotlib</code></td>\n<td>3.9.1</td>\n<td>Figure generation</td>\n</tr>\n</tbody>\n</table>\n\n\n<h2 class=\"atx\">Input data</h2>\n<table>\n\n<tbody><tr>\n<th>File</th>\n<th>Format</th>\n<th>Description</th>\n</tr>\n\n</tbody><tbody>\n<tr>\n<td><code>data/Puntas_TODOS.ods</code></td>\n<td>OpenDocument Spreadsheet</td>\n<td>Lithic assemblage dataset with three sheets: <code>APRIORI</code> (training set of dated assemblages), <code>MODEL_CHECKING</code> (validation set), <code>APOSTERIORI</code> (undated assemblages for chronological prediction)</td>\n</tr>\n</tbody>\n</table>\n<p><strong>Feature columns used:</strong> <code>Type.1</code>, <code>Type.2</code>, <code>Type.3</code>, <code>Type.4</code>, <code>Type.5</code>, <code>Type.7</code>, <code>Metal</code>, <code>Campanif</code> <strong>Target column:</strong> <code>Phase</code> (integer, chronological period) <strong>Identifier column:</strong> <code>Yacimiento</code> (site name, used as observation ID)</p>\n<blockquote>\n<p><code>Type.6</code> is intentionally excluded from all analyses &mdash; see Methods section of the article.</p>\n</blockquote>\n\n\n<h2 class=\"atx\">Licence</h2>\n<p>The source code is released under the <strong>MIT Licence</strong> (see <code>LICENSE</code> file if included). The dataset (<code>Puntas_TODOS.ods</code>) is released under <a href=\"https://creativecommons.org/licenses/by/4.0/\">CC BY 4.0</a>.</p>\n\n\n<h2 class=\"atx\">Citation</h2>\n<p>If you use this code or data, please cite the original article:</p>\n<pre><code class=\"fenced-code-block\">[Author et al. (in press). A Machine Learning Framework for Chronological \nClassification of Archaeological Samples Based on Lithic Typology Distributions. \nJournal of Archaeological Science. 
DOI: XXXX]</code></pre>\n\n\n<h2 class=\"atx\">Contact</h2>\n<p>For questions about the code or data, please open an issue in the repository or contact the authors.</p>",
        "lang": {
          "id": "eng",
          "title": {
            "en": "English"
          }
        },
        "type": {
          "id": "notes",
          "title": {
            "de": "Anmerkungen",
            "en": "Notes"
          }
        }
      }
    ],
    "creators": [
      {
        "person_or_org": {
          "family_name": "Jim\u00e9nez-Puerto",
          "given_name": "Joaqu\u00edn",
          "identifiers": [
            {
              "identifier": "0000-0001-9760-9602",
              "scheme": "orcid"
            }
          ],
          "name": "Jim\u00e9nez-Puerto, Joaqu\u00edn",
          "type": "personal"
        }
      },
      {
        "affiliations": [
          {
            "id": "01460j859",
            "identifiers": [
              {
                "identifier": "01460j859",
                "scheme": "ror"
              },
              {
                "identifier": "grid.157927.f",
                "scheme": "grid"
              },
              {
                "identifier": "0000 0004 1770 5832",
                "scheme": "isni"
              }
            ],
            "name": "Universitat Polit\u00e8cnica de Val\u00e8ncia"
          }
        ],
        "person_or_org": {
          "family_name": "Trull",
          "given_name": "Oscar",
          "identifiers": [
            {
              "identifier": "0000-0003-2896-8606",
              "scheme": "orcid"
            }
          ],
          "name": "Trull, Oscar",
          "type": "personal"
        }
      },
      {
        "person_or_org": {
          "family_name": "Troncoso",
          "given_name": "Alicia",
          "identifiers": [
            {
              "identifier": "0000-0002-9801-7999",
              "scheme": "orcid"
            }
          ],
          "name": "Troncoso, Alicia",
          "type": "personal"
        }
      },
      {
        "affiliations": [
          {
            "id": "01r9z8p25",
            "identifiers": [
              {
                "identifier": "01r9z8p25",
                "scheme": "ror"
              },
              {
                "identifier": "grid.10041.34",
                "scheme": "grid"
              },
              {
                "identifier": "0000 0001 2106 0879",
                "scheme": "isni"
              }
            ],
            "name": "Universidad de La Laguna"
          }
        ],
        "person_or_org": {
          "family_name": "Pardo-Gord\u00f3",
          "given_name": "Salvador",
          "identifiers": [
            {
              "identifier": "0000-0002-1060-1526",
              "scheme": "orcid"
            }
          ],
          "name": "Pardo-Gord\u00f3, Salvador",
          "type": "personal"
        }
      }
    ],
    "description": "<h1><span>Abstract</span></h1>\n<p class=\"MsoNormal\"><span>We present a similarity-based framework for the chronological attribution of undated archaeological assemblages, grounded in Siamese Neural Networks (SNNs) and specifically designed for the small sample sizes, compositional uncertainty, and diffuse phase boundaries that characterize most prehistoric datasets. Unlike conventional supervised classifiers, which require sufficient per-class examples to define stable decision boundaries, SNNs reformulate chronological inference as a pairwise relational problem: the model estimates the probability that two assemblages belong to the same chronological phase, mimicking the comparative reasoning implicit in typological analysis while rendering it reproducible and transferable. The quadratic growth of training pairs with sample size substantially amplifies the effective training set without additional radiocarbon data, a critical advantage in data-scarce contexts.</span></p>\n<p class=\"MsoNormal\"><span>The framework is evaluated on 185 radiocarbon-dated bifacial flint arrowhead assemblages from eastern Iberia (ca. 3500&ndash;1900 cal. BC), organized into six chronological phases spanning the Late Neolithic to the Early Bronze Age. Multiple Siamese configurations &mdash; logistic regression, MLP, random forest, and deep learning encoders &mdash; are compared against standard MLP and SVM baselines. The best-performing configuration (DL with bootstrap augmentation and Dirichlet-multinomial compositional variables) achieves a macro F1 of 21.4% and balanced accuracy of 21.8%, representing a consistent improvement over both baselines and over a stratified random classifier (expected macro F1: ~17%) across all class-balanced metrics. 
Per-type analysis reveals that predictive accuracy correlates with morphological distinctiveness: foliaceous types reach 96.5% agreement with expert assignments, while pedunculate forms &mdash; whose typological boundaries are inherently contested &mdash; present the greatest classification challenge, a pattern that mirrors the gradient of archaeological interpretive confidence.</span></p>\n<p class=\"MsoNormal\"><span>The input structure of the framework &mdash; assemblage-level frequency vectors of artifact types &mdash; is standard across the main material categories used for chronological inference in prehistoric archaeology, including ceramics, lithic industries, and faunal assemblages. The approach is therefore directly transferable to other material traditions and regional sequences, and its outputs are formally compatible with Bayesian chronological modelling pipelines. The framework requires only typological frequency data routinely collected during excavation and post-excavation analysis, involves negligible computational cost relative to radiometric methods, and can be deployed at early stages of site investigation to generate provisional chronological attributions &mdash; providing an operational basis for decisions about sampling strategy, resource allocation, and the prioritization of contexts for absolute dating. As radiocarbon coverage and regional reference databases expand, the framework scales accordingly, functioning as a cost-effective first-pass instrument within a broader chronological workflow rather than as a replacement for higher-resolution analytical methods.</span></p>",
    "languages": [
      {
        "id": "eng",
        "title": {
          "en": "English"
        }
      }
    ],
    "publication_date": "2026-04-17",
    "publisher": "Zenodo",
    "resource_type": {
      "id": "dataset",
      "title": {
        "de": "Datensatz",
        "en": "Dataset"
      }
    },
    "rights": [
      {
        "description": {
          "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited."
        },
        "icon": "cc-by-icon",
        "id": "cc-by-4.0",
        "props": {
          "scheme": "spdx",
          "url": "https://creativecommons.org/licenses/by/4.0/legalcode"
        },
        "title": {
          "en": "Creative Commons Attribution 4.0 International"
        }
      }
    ],
    "subjects": [
      {
        "subject": "Similarity-based classification"
      },
      {
        "subject": "Copper Age"
      },
      {
        "subject": "Small sample machine learning"
      },
      {
        "subject": "Bayesian chronological modelling"
      }
    ],
    "title": "SUPPLEMENTARY MATERIAL: Pairwise Similarity Learning for Chronological Attribution of Archaeological Assemblages: A Siamese Neural Network Approach",
    "version": "1.0"
  },
  "parent": {
    "access": {
      "owned_by": {
        "user": "46830"
      },
      "settings": {
        "accept_conditions_text": null,
        "allow_guest_requests": false,
        "allow_user_requests": false,
        "secret_link_expiration": 0
      }
    },
    "communities": {
      "default": "f466a75b-69ca-4b60-abb5-22e87ae9d073",
      "entries": [
        {
          "access": {
            "member_policy": "open",
            "members_visibility": "public",
            "record_submission_policy": "open",
            "review_policy": "open",
            "visibility": "public"
          },
          "children": {
            "allow": false
          },
          "created": "2022-07-20T09:51:14.532802+00:00",
          "custom_fields": {},
          "deletion_status": {
            "is_deleted": false,
            "status": "P"
          },
          "id": "f466a75b-69ca-4b60-abb5-22e87ae9d073",
          "links": {},
          "metadata": {
            "curation_policy": "<p>The aim of this community is to collect research data published in Zenodo&nbsp; by members of Universitat de Val&egrave;ncia.&nbsp;</p>\r\n",
            "page": "<p>Datasets collection from Universitat de Val&egrave;ncia researchers, managed&nbsp;and curated by the&nbsp;Libraries and Documentation&nbsp; Service.</p>\r\n\r\n<p><strong>&nbsp;</strong></p>\r\n\r\n<pre>\r\n&nbsp;</pre>",
            "title": "Universitat de Val\u00e8ncia (UV) Research Data"
          },
          "revision_id": 0,
          "slug": "universitat_valencia",
          "updated": "2022-07-20T10:17:44.029695+00:00"
        }
      ],
      "ids": [
        "f466a75b-69ca-4b60-abb5-22e87ae9d073"
      ]
    },
    "id": "19628744",
    "pids": {
      "doi": {
        "client": "datacite",
        "identifier": "10.5281/zenodo.19628744",
        "provider": "datacite"
      }
    }
  },
  "pids": {
    "doi": {
      "client": "datacite",
      "identifier": "10.5281/zenodo.19628745",
      "provider": "datacite"
    },
    "oai": {
      "identifier": "oai:zenodo.org:19628745",
      "provider": "oai"
    }
  },
  "revision_id": 4,
  "stats": {
    "all_versions": {
      "data_volume": 0.0,
      "downloads": 0,
      "unique_downloads": 0,
      "unique_views": 32,
      "views": 35
    },
    "this_version": {
      "data_volume": 0.0,
      "downloads": 0,
      "unique_downloads": 0,
      "unique_views": 32,
      "views": 35
    }
  },
  "status": "published",
  "swh": {},
  "updated": "2026-04-17T10:16:58.985956+00:00",
  "versions": {
    "index": 1,
    "is_latest": true
  }
}