# =============================================================================
# Dockerfile
# Article: "A Machine Learning Framework for Chronological Classification of
#           Archaeological Samples Based on Lithic Typology Distributions"
# Journal: Journal of Archaeological Science
# Complies with: JAS Data & Code Availability / Transparency & Replicability
# =============================================================================

# ── Base image ────────────────────────────────────────────────────────────────
# Pinned to an exact Python patch release (3.11.9) on a fixed Debian release
# (bookworm), slim variant.
# NOTE(review): the reference below carries no @sha256 digest, so the image a
# tag resolves to can still change if the tag is rebuilt upstream. For
# bit-for-bit reproducibility append the digest, i.e.
#   python:3.11.9-slim-bookworm@sha256:<digest>
FROM python:3.11.9-slim-bookworm

# ── Metadata ──────────────────────────────────────────────────────────────────
# Descriptive image labels, one key per line in alphabetical order.
LABEL description="Reproducible ML pipeline for lithic typology chronological classification" \
      maintainer="Joaquín Jiménez-Puerto" \
      version="1.0"

# ── Environment variables ─────────────────────────────────────────────────────
# Grouped by purpose; together they aim at deterministic, headless runs.

# Python interpreter: fixed hash seed, no .pyc files written, unbuffered
# stdout/stderr so log lines appear immediately.
ENV PYTHONHASHSEED=0 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Numerical back-ends (OpenMP, OpenBLAS, MKL): limited to one thread each.
ENV OMP_NUM_THREADS=1 \
    OPENBLAS_NUM_THREADS=1 \
    MKL_NUM_THREADS=1

# Matplotlib: non-interactive Agg backend, no display server required.
ENV MPLBACKEND=Agg

# ── System dependencies ───────────────────────────────────────────────────────
# Packages are listed alphabetically, one per line:
#   libglib2.0-0 → GLib (listed as an indirect matplotlib/pillow dependency on
#                  slim; NOTE(review): confirm it is still required)
#   libgomp1     → OpenMP runtime (required by scikit-learn / joblib)
# No GUI libraries are installed; the Agg backend renders in software.
# The apt package index is deleted in the same layer that downloaded it, so it
# never ends up in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        libglib2.0-0 \
        libgomp1 \
 && rm -rf /var/lib/apt/lists/*

# ── Working directory ─────────────────────────────────────────────────────────
# Every later relative path in this file (COPY destinations, RUN commands and
# the entrypoint script) resolves against /app. WORKDIR creates the directory
# if it does not exist yet.
WORKDIR /app

# ── Python dependencies ───────────────────────────────────────────────────────
# The dependency manifest is copied on its own, ahead of the source files, so
# the (slow) pip install layer is reused from the build cache whenever only
# scripts or data have changed.
COPY requirements.txt ./

# Install the Python packages in three steps inside one layer:
#   1. pip itself, pinned. An unpinned `--upgrade pip` installs whatever
#      release is newest on the build date, so two builds of this same file
#      could resolve dependencies with different pip versions.
#   2. PyTorch 2.3.1 from the CPU-only wheel index
#      (download.pytorch.org/whl/cpu), installed before the rest to avoid
#      pulling CUDA extras.
#   3. Everything else from requirements.txt.
# NOTE(review): if requirements.txt also lists torch, it should pin the same
# version (2.3.1); a different pin may make step 3 replace the CPU wheel with
# a build from the default index.
RUN pip install --no-cache-dir pip==24.0 && \
    pip install --no-cache-dir \
        torch==2.3.1 \
        --index-url https://download.pytorch.org/whl/cpu && \
    pip install --no-cache-dir -r requirements.txt

# ── Non-root user ─────────────────────────────────────────────────────────────
# Running as an unprivileged user is a security best-practice for containers.
# The account and the directories it must own are created BEFORE the project
# files are copied: at this point /app holds only requirements.txt, so the
# recursive chown is cheap. Running `chown -R /app` after the COPY steps would
# rewrite every copied file into a new layer and store the dataset twice.
#
# /app/output is the bind-mount point exposed to the host. NOTE: when a host
# directory is mounted over it, the host directory's ownership applies and the
# sub-directories created here are hidden by the mount.
# /app/data is pre-created so that the directory itself is owned by archaeo
# regardless of how the builder treats implicitly created COPY destinations.
RUN useradd --create-home --shell /bin/bash archaeo && \
    mkdir -p /app/data \
             /app/output/ml_train \
             /app/output/predictions \
             /app/output/figures && \
    chown -R archaeo:archaeo /app

# ── Source code & data ────────────────────────────────────────────────────────
# Scripts are copied before the dataset so that a dataset change rebuilds only
# the final COPY layer; the dependency layers above are reused in either case.
# --chown assigns ownership at copy time, so no later chown pass is needed.
# The *.py glob can match several files, so the destination is written as a
# directory path with a trailing slash.
COPY --chown=archaeo:archaeo *.py ./
COPY --chown=archaeo:archaeo run_pipeline.sh ./

# Mark the pipeline script executable. The entrypoint invokes it through bash,
# so the bit only matters when the script is started directly.
RUN chmod +x run_pipeline.sh

COPY --chown=archaeo:archaeo data/ data/

# Drop privileges: the container process runs as archaeo.
USER archaeo

# ── Entrypoint ────────────────────────────────────────────────────────────────
# run_pipeline.sh executes the three sequential steps:
#   1. Training & validation  (main.py)
#   2. Prediction aggregation (collect_predictions_by_id.py)
#   3. Figure generation      (generate_figures.py)
# The relative script path resolves against WORKDIR (/app).
ENTRYPOINT ["bash", "run_pipeline.sh"]