"""SEC EDGAR XBRL client.

Pulls structured fundamentals from the SEC EDGAR XBRL Company Facts API:

    https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json

Cache-first. Every fetched JSON is persisted to ``data/raw_sec_filings/``.
Subsequent runs read from the cache deterministically. This is the layer
that breaks the pipeline's network dependency once data has been pulled
once.

SEC policy requires a polite User-Agent identifying the requester. The
client refuses to be constructed without one (even in offline mode), so
no network call can ever go out unidentified. Rate limit: 10 req/s.

Run model
---------
This module's network code runs on the user's machine (see APPROACH_MEMO
§ 7b). The Claude sandbox cannot reach SEC.gov. All pipeline code
downstream of this module reads from the local cache and is sandbox-safe.
"""

from __future__ import annotations

import json
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

# requests is imported lazily inside the network methods so unit tests that
# only exercise the parsing layer can run without it installed.


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Host for the XBRL JSON API; ``SECClient.get_facts`` appends the
# ``/api/xbrl/companyfacts/CIK{cik}.json`` path to it.
SEC_BASE = "https://data.sec.gov"
# Ticker -> CIK lookup table downloaded by ``SECClient._load_ticker_map``.
# Note that it is served from www.sec.gov, not from SEC_BASE.
TICKERS_URL = "https://www.sec.gov/files/company_tickers.json"
# Default minimum spacing, in seconds, that ``SECClient._download`` keeps
# between consecutive requests.
RATE_LIMIT_SLEEP_S = 0.11  # ~9 req/s, just under SEC's 10 req/s cap


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class XBRLFact:
    """One periodic value of one XBRL concept for one company.

    Immutable record; ``parse_company_facts`` builds one instance per
    usable entry of the Company Facts JSON. All dates are stored as
    strings, not ``date`` objects, so ordering relies on the ISO
    ``YYYY-MM-DD`` layout sorting lexicographically.
    """

    concept: str           # e.g. "OperatingIncomeLoss"
    value: float
    unit: str              # e.g. "USD" or "shares"
    period_end: str        # ISO date "2022-01-30"
    fiscal_year: int       # the entry's ``fy`` tag, e.g. 2022; can differ from period_end's year
    fiscal_period: str     # "FY", "Q1", "Q2", "Q3", "Q4"
    form: str              # e.g. "10-K"
    filed: str             # ISO date
    period_start: str = "" # ISO date for flow facts; "" for instant facts


@dataclass(frozen=True)
class CompanyFacts:
    """Parsed XBRL facts for one company.

    ``facts`` maps an XBRL concept name (e.g. ``"OperatingIncomeLoss"``)
    to every parsed value of that concept, across all units, periods and
    filings. Use ``get_annual`` to pick the single annual value for one
    fiscal year.
    """

    ticker: str
    cik: int
    entity_name: str
    facts: dict[str, list[XBRLFact]]  # concept name -> list of values

    @staticmethod
    def _period_year(fact: XBRLFact) -> int | None:
        """Return the year of ``fact.period_end``, or ``None`` if unparseable."""
        head = (fact.period_end or "")[:4]
        if len(head) == 4 and head.isascii() and head.isdigit():
            return int(head)
        return None

    @staticmethod
    def _duration_days(fact: XBRLFact) -> int:
        """Return the length of the fact's period in days.

        Returns 0 when either date is missing or is not a valid ISO date,
        so such facts sort behind every fact with a real duration.
        """
        # Local import: the module has no top-level ``datetime`` import.
        from datetime import date

        if not fact.period_start or not fact.period_end:
            return 0
        try:
            start = date.fromisoformat(fact.period_start)
            end = date.fromisoformat(fact.period_end)
        except ValueError:
            return 0
        return (end - start).days

    def get_annual(
        self,
        concept: str,
        fiscal_year: int,
        unit: str = "USD",
        prefer_oldest_filed: bool = False,
    ) -> XBRLFact | None:
        """Return the FY value of ``concept`` for ``fiscal_year``.

        Selection rules:
        1. Filter to ``unit``, ``form ∈ {10-K, 10-K/A}`` and
           ``fiscal_period == "FY"``.
        2. Match on the value's own period: the *year of ``period_end``*
           equals ``fiscal_year``. This is robust to issuers that only
           started tagging a concept in later filings (e.g. NVDA's
           ``PaymentsToAcquireProductiveAssets`` for FY2022 lives in the
           FY2024 10-K with fy=2024, end=2022-01-30). Matching on the
           filing's ``fy`` tag misses these; matching on the period's
           year captures them.
        3. Disambiguate what is left:
           * FLOW facts (non-empty ``period_start``) take precedence when
             any are present. Order: longest duration, then latest
             ``period_end``, then ``filed``.
           * INSTANT facts (empty ``period_start``): earliest
             ``period_end``, then ``filed``.
           In both cases ``filed`` is latest-first (the most recent
           restatement) by default and oldest-first (the original filing)
           when ``prefer_oldest_filed`` is true.
        4. Returns ``None`` if no matching fact exists.

        Note for non-calendar fiscal years: this works because the year
        component of ``period_end`` always equals the fiscal-year label
        for every issuer in the v1.0 ground-truth universe (NVDA Jan,
        MSFT/AAPL/COST/NKE non-calendar all included). Issuers that
        label their FY differently from their period_end year (rare;
        some retailers do) would need a per-issuer override.
        """
        candidates = [
            f for f in self.facts.get(concept, [])
            if f.unit == unit
            and f.fiscal_period == "FY"
            and f.form in {"10-K", "10-K/A"}
            and self._period_year(f) == fiscal_year
        ]
        if not candidates:
            return None

        # ``list.sort`` is stable, so sorting by the least significant key
        # first and the most significant key last yields a multi-key order
        # in which each key can have its own direction.
        newest_first = not prefer_oldest_filed

        # FLOW issue: some issuers (COST) tag quarterly periods under
        # fp=FY in their 10-K filings. Within the same period_end, the
        # annual figure has the longest duration (≈365 days); quarterlies
        # span 80–120 days. Prefer the longest duration so the annual
        # figure wins, then break ties on the filing date.
        #
        # The filing-date preference is honoured here as well:
        # split-sensitive fallbacks such as the diluted weighted-average
        # share count are flow facts, and later 10-Ks restate them to a
        # post-split basis exactly like the instant share counts.
        flow_candidates = [f for f in candidates if f.period_start]
        if flow_candidates:
            flow_candidates.sort(key=lambda f: f.filed, reverse=newest_first)
            flow_candidates.sort(
                key=lambda f: (self._duration_days(f), f.period_end),
                reverse=True,
            )
            return flow_candidates[0]

        # INSTANT issue: some issuers (CPRT) tag a "stub day" balance one
        # day past the real FY-end (end=2018-08-01 vs the real
        # 2018-07-31), often with a different value reflecting a fresh
        # capital structure. The real FY-end balance is what we want, so
        # prefer the *earliest* period_end within the fiscal year.
        #
        # ``prefer_oldest_filed=True`` is for split-sensitive concepts like
        # ``shares_outstanding``: later 10-Ks retroactively restate prior-
        # period share counts to a post-split basis, but the pipeline's
        # cached prices are unadjusted (Run 3 fix). To keep mcap = price ×
        # shares consistent, we must use the original-filing share count,
        # not the restated one.
        candidates.sort(key=lambda f: f.filed, reverse=newest_first)
        candidates.sort(key=lambda f: f.period_end)
        return candidates[0]


# ---------------------------------------------------------------------------
# Parsing — pure, sandbox-safe
# ---------------------------------------------------------------------------


def parse_company_facts(raw: dict[str, Any], ticker: str) -> CompanyFacts:
    """Convert an SEC Company Facts JSON blob into a typed CompanyFacts.

    ``raw`` is walked as ``raw["facts"][namespace][concept]["units"][unit]``,
    each leaf being a list of entry dicts. Facts for the same concept name
    appearing under several namespaces are merged into one list.

    An entry is skipped when it lacks ``fp``, when ``val`` or ``fy`` is
    missing or ``null``, or when ``val`` / ``fy`` cannot be converted to a
    number. Every text field (``end``, ``start``, ``fp``, ``form``,
    ``filed``) is stored as a string, with a missing or ``null`` value
    becoming ``""``. That keeps ``XBRLFact.period_start`` falsy for instant
    facts and guarantees ``period_end`` is always a string.

    ``ticker`` is stored on the result as given.
    """

    def _text(value: Any) -> str:
        # ``str(None)`` would yield the truthy string "None", which would
        # make an instant fact look like a flow fact downstream.
        return "" if value is None else str(value)

    facts: dict[str, list[XBRLFact]] = {}

    for ns_concepts in (raw.get("facts") or {}).values():
        for concept_name, concept_data in ns_concepts.items():
            collected: list[XBRLFact] = []
            for unit, entries in (concept_data.get("units") or {}).items():
                for entry in entries:
                    if (
                        "fp" not in entry
                        or entry.get("fy") is None
                        or entry.get("val") is None
                    ):
                        continue
                    try:
                        value = float(entry["val"])
                        fiscal_year = int(entry["fy"])
                    except (TypeError, ValueError):
                        # Non-numeric payload: drop the entry, keep parsing.
                        continue
                    collected.append(
                        XBRLFact(
                            concept=concept_name,
                            value=value,
                            unit=unit,
                            period_end=_text(entry.get("end")),
                            fiscal_year=fiscal_year,
                            fiscal_period=_text(entry["fp"]),
                            form=_text(entry.get("form")),
                            filed=_text(entry.get("filed")),
                            period_start=_text(entry.get("start")),
                        )
                    )
            if collected:
                facts.setdefault(concept_name, []).extend(collected)

    return CompanyFacts(
        ticker=ticker,
        cik=int(raw.get("cik") or 0),
        entity_name=_text(raw.get("entityName")),
        facts=facts,
    )


# ---------------------------------------------------------------------------
# Concept tag fallbacks
# ---------------------------------------------------------------------------
# Each logical field maps to a tuple of (tag, unit) pairs. The pipeline tries
# each pair in order and uses the first hit. Using (tag, unit) together is
# safer than (tag, "USD") because XBRL non-monetary facts (shares, ratios)
# live under different unit names.

# Fields where the original 10-K's value is preferred over later
# restatements. Later filings retroactively split-adjust share counts
# (e.g. GOOGL's CommonStockSharesOutstanding for end-2021 was 662M in
# the FY2021 10-K but 13,242M in the FY2022 10-K after the 20:1 split).
# The cached price series is intentionally unadjusted (see
# fetch_prices.py:_download_yfinance), so to keep market cap = price ×
# shares internally consistent we use the original-filing share count.
#
# Consumed by ``first_available``, which passes
# ``prefer_oldest_filed=True`` to ``CompanyFacts.get_annual`` for every
# field named here. Entries are logical field names (keys of
# ``CONCEPT_TAGS``), not XBRL tags.
SPLIT_SENSITIVE_FIELDS: frozenset[str] = frozenset({"shares_outstanding"})

# Logical field name -> ordered (XBRL tag, unit) alternatives.
# ``first_available`` walks each tuple in order and returns the first pair
# that yields a fact, so the entries of one field are *alternatives*, never
# addends: put the preferred tag first.
CONCEPT_TAGS: dict[str, tuple[tuple[str, str], ...]] = {
    "ebit": (
        ("OperatingIncomeLoss", "USD"),
        # NKE and a handful of consumer / apparel issuers don't tag a
        # standalone Operating Income line; their cleanest equivalent is the
        # pretax income from continuing operations. For low-debt issuers
        # this approximates EBIT closely enough for the Gap signal.
        ("IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments", "USD"),
        ("IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest", "USD"),
        ("IncomeBeforeIncomeTaxes", "USD"),
    ),
    "tax_rate": (
        # Looked up under the "pure" unit rather than "USD".
        # ``extract_inputs`` normalises percentage-style values to fractions.
        ("EffectiveIncomeTaxRateContinuingOperations", "pure"),
    ),
    "net_ppe": (
        ("PropertyPlantAndEquipmentNet", "USD"),
        # Post-ASC 842 issuers (META and others) consolidate net PP&E with the
        # finance-lease right-of-use asset under this composite tag.
        ("PropertyPlantAndEquipmentAndFinanceLeaseRightOfUseAssetAfterAccumulatedDepreciationAndAmortization", "USD"),
    ),
    "goodwill": (("Goodwill", "USD"),),
    "intangibles_ex_goodwill": (
        ("IntangibleAssetsNetExcludingGoodwill", "USD"),
        ("FiniteLivedIntangibleAssetsNet", "USD"),
    ),
    "accounts_receivable": (
        ("AccountsReceivableNetCurrent", "USD"),
        ("ReceivablesNetCurrent", "USD"),
    ),
    "inventory": (("InventoryNet", "USD"),),
    "accounts_payable": (
        ("AccountsPayableCurrent", "USD"),
        # META and a handful of issuers report only the Trade-AP split.
        ("AccountsPayableTradeCurrent", "USD"),
    ),
    "accrued_liabilities": (("AccruedLiabilitiesCurrent", "USD"),),
    "cash_and_equivalents": (
        ("CashAndCashEquivalentsAtCarryingValue", "USD"),
        ("CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents", "USD"),
    ),
    "short_term_investments": (
        # Order matters: prefer the broader marketable-securities tag first
        # (GOOGL, META, NVDA all use this), then the textbook
        # ShortTermInvestments (COST), then the AFS sub-aggregate.
        ("MarketableSecuritiesCurrent", "USD"),
        ("ShortTermInvestments", "USD"),
        ("AvailableForSaleSecuritiesCurrent", "USD"),
    ),
    "long_term_investments": (
        # Most issuers carry no long-term marketable securities; for those
        # that do (AAPL, GOOGL), the broad operating-IC formula requires
        # subtracting them so non-operating financial assets are excluded.
        ("MarketableSecuritiesNoncurrent", "USD"),
        ("LongTermInvestments", "USD"),
        ("AvailableForSaleSecuritiesNoncurrent", "USD"),
    ),
    "total_assets": (("Assets", "USD"),),
    "short_term_debt": (
        # NOTE(review): first hit wins, so an issuer that tags both lines
        # contributes only ShortTermBorrowings; the current portion of
        # long-term debt is not added on top. Confirm this is intended.
        ("ShortTermBorrowings", "USD"),
        ("LongTermDebtCurrent", "USD"),
    ),
    "long_term_debt": (
        ("LongTermDebtNoncurrent", "USD"),
        ("LongTermDebt", "USD"),
    ),
    "shares_outstanding": (
        ("CommonStockSharesOutstanding", "shares"),
        ("EntityCommonStockSharesOutstanding", "shares"),
        # For multi-class issuers (META, GOOGL pre-2022, V) the consolidated
        # tags above are class-specific and undercount the total. Diluted
        # weighted-average is the closest all-class count available from the
        # face of the 10-K; it's a slight time-weighted approximation of the
        # point-in-time end-of-period count.
        ("WeightedAverageNumberOfDilutedSharesOutstanding", "shares"),
    ),
    "revenue": (
        ("Revenues", "USD"),
        ("RevenueFromContractWithCustomerExcludingAssessedTax", "USD"),
        ("SalesRevenueNet", "USD"),
    ),
    "capex": (
        ("PaymentsToAcquirePropertyPlantAndEquipment", "USD"),
        # NVDA (post-2012) tags capex as PaymentsToAcquireProductiveAssets.
        ("PaymentsToAcquireProductiveAssets", "USD"),
        # ADP files capex under the "Other" variant of PP&E. Without
        # this, all ADP rows fail INCOMPLETE on RR (Capex required).
        ("PaymentsToAcquireOtherPropertyPlantAndEquipment", "USD"),
        # VZ (FY2019+) migrated from PaymentsToAcquireProductiveAssets
        # to PaymentsToAcquireOtherProductiveAssets. Without this, VZ
        # post-2018 rows fail INCOMPLETE.
        ("PaymentsToAcquireOtherProductiveAssets", "USD"),
    ),
    "depreciation_amortization": (
        ("DepreciationDepletionAndAmortization", "USD"),
        ("DepreciationAndAmortization", "USD"),
        # Walmart and similar issuers switched to this composite tag
        # post-ASC 842 (the "Accretion" is for finance-lease ROU
        # asset accretion). Without this fallback WMT FY2020+ rows
        # are INCOMPLETE on D&A.
        ("DepreciationAmortizationAndAccretionNet", "USD"),
        # GOOGL, MSFT, META, NKE, FAST, TXN report D&A under the plain
        # ``Depreciation`` tag (with intangibles amortization tracked
        # separately if material).
        ("Depreciation", "USD"),
    ),
    "cfo": (("NetCashProvidedByUsedInOperatingActivities", "USD"),),
    "buybacks": (("PaymentsForRepurchaseOfCommonStock", "USD"),),
    "net_income": (
        # Required for Owner Earnings (paper § 2.1). Prefer the bottom-line
        # NetIncomeLoss; fall back to the alternative for issuers (e.g. GE
        # post-restructuring) that tag NetIncomeLossAvailableToCommonStockholdersBasic.
        ("NetIncomeLoss", "USD"),
        ("NetIncomeLossAvailableToCommonStockholdersBasic", "USD"),
        ("ProfitLoss", "USD"),
    ),
    "total_equity": (
        ("StockholdersEquity", "USD"),
        # Issuers with non-controlling interests use the longer tag for the
        # consolidated total. Fall back to it when the cleaner tag is absent.
        ("StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest", "USD"),
    ),
    "acquisitions": (
        # Cash-flow-statement line item; positive value = cash outflow.
        # Used in Net Reinvestment per v1.0 paper § 3.2.
        ("PaymentsToAcquireBusinessesNetOfCashAcquired", "USD"),
        ("PaymentsToAcquireBusinessesGross", "USD"),
        ("PaymentsToAcquireBusinessesAndInterestInAffiliates", "USD"),
    ),
}


def first_available(
    facts: CompanyFacts,
    field: str,
    fiscal_year: int,
) -> XBRLFact | None:
    """Resolve a logical ``field`` to its annual fact for ``fiscal_year``.

    The ``(tag, unit)`` alternatives registered under ``CONCEPT_TAGS[field]``
    are queried in order through ``facts.get_annual``; the first one that
    produces a fact is returned, and ``None`` is returned when none does.

    Fields named in ``SPLIT_SENSITIVE_FIELDS`` are queried with
    ``prefer_oldest_filed=True`` so the original-filing value is chosen
    over later restatements (avoids split-adjusted share counts being
    matched against unadjusted prices).

    Raises ``KeyError`` when ``field`` has no entry in ``CONCEPT_TAGS``.
    """
    tag_chain = CONCEPT_TAGS.get(field)
    if tag_chain is None:
        raise KeyError(f"Unknown field {field!r}; add to CONCEPT_TAGS")

    keep_original_filing = field in SPLIT_SENSITIVE_FIELDS
    # Generator, so lookups stop as soon as one alternative hits.
    lookups = (
        facts.get_annual(
            tag,
            fiscal_year,
            unit=unit,
            prefer_oldest_filed=keep_original_filing,
        )
        for tag, unit in tag_chain
    )
    return next((hit for hit in lookups if hit is not None), None)


# ---------------------------------------------------------------------------
# I/O — cache layer, network calls
# ---------------------------------------------------------------------------


class SECClient:
    """Cache-first client for SEC EDGAR XBRL Company Facts.

    Construct once; ``get_facts(ticker)`` returns a parsed CompanyFacts.
    The first call for a ticker hits the network; subsequent calls read
    from the cache on disk. With ``offline=True`` the client never
    touches the network and raises ``RuntimeError`` on a cache miss.
    """

    def __init__(
        self,
        cache_dir: Path,
        user_agent: str,
        rate_limit_sleep_s: float = RATE_LIMIT_SLEEP_S,
        offline: bool = False,
    ) -> None:
        """Create the client and make sure ``cache_dir`` exists.

        ``user_agent`` must contain an ``@`` and be at least 10 characters
        long, otherwise ``ValueError`` is raised. The check applies in
        offline mode too. ``rate_limit_sleep_s`` is the minimum spacing
        between two downloads.
        """
        if "@" not in user_agent or len(user_agent) < 10:
            raise ValueError(
                "SEC requires a polite User-Agent like "
                "'My Project name@example.com'. Got: " + repr(user_agent)
            )
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.user_agent = user_agent
        self.rate_limit_sleep_s = rate_limit_sleep_s
        self.offline = offline
        # Loaded lazily on the first ``cik_for_ticker`` call.
        self._ticker_map: dict[str, int] | None = None
        # ``time.monotonic()`` reading taken after the last request.
        self._last_request_at: float = 0.0

    # ---- Ticker → CIK -----------------------------------------------------

    def cik_for_ticker(self, ticker: str) -> int:
        """Return the CIK for ``ticker`` (case-insensitive).

        Raises ``KeyError`` if the ticker is not in the SEC ticker map.
        """
        if self._ticker_map is None:
            self._ticker_map = self._load_ticker_map()
        ticker_u = ticker.upper()
        if ticker_u not in self._ticker_map:
            raise KeyError(f"Ticker {ticker!r} not found in SEC ticker map")
        return self._ticker_map[ticker_u]

    def _load_ticker_map(self) -> dict[str, int]:
        """Load the ticker → CIK table, downloading it on a cache miss."""
        path = self.cache_dir / "company_tickers.json"
        if not path.exists():
            if self.offline:
                raise RuntimeError(
                    f"Ticker map missing at {path} and client is offline. "
                    "Run scripts/populate_cache.py first."
                )
            self._download(TICKERS_URL, path)
        raw = self._read_json(path)
        return {row["ticker"].upper(): int(row["cik_str"]) for row in raw.values()}

    # ---- Company facts ----------------------------------------------------

    def get_facts(self, ticker: str) -> CompanyFacts:
        """Return parsed Company Facts for ``ticker``, cache-first.

        Raises ``KeyError`` for an unknown ticker and ``RuntimeError`` when
        the file is not cached and the client is offline, or when the
        download fails.
        """
        cik = self.cik_for_ticker(ticker)
        path = self.cache_dir / f"CIK{cik:010d}.json"
        if not path.exists():
            if self.offline:
                raise RuntimeError(
                    f"Company facts missing at {path} and client is offline. "
                    "Run scripts/populate_cache.py first."
                )
            url = f"{SEC_BASE}/api/xbrl/companyfacts/CIK{cik:010d}.json"
            self._download(url, path)
        raw = self._read_json(path)
        return parse_company_facts(raw, ticker.upper())

    # ---- Cache primitives -------------------------------------------------

    @staticmethod
    def _read_json(path: Path) -> Any:
        """Parse a cached JSON file.

        The file is handed to ``json.loads`` as bytes so the decoder picks
        the encoding (UTF-8/16/32) itself instead of relying on the
        machine's locale default, which ``Path.read_text()`` would use.
        """
        return json.loads(path.read_bytes())

    # ---- Network primitives ----------------------------------------------

    def _download(self, url: str, dest: Path) -> None:
        """GET ``url`` and store the response body at ``dest``.

        Sleeps first if the previous request was less than
        ``rate_limit_sleep_s`` ago. Raises ``RuntimeError`` on any non-200
        status; nothing is written in that case.
        """
        import requests  # noqa: PLC0415

        elapsed = time.monotonic() - self._last_request_at
        if elapsed < self.rate_limit_sleep_s:
            time.sleep(self.rate_limit_sleep_s - elapsed)
        headers = {"User-Agent": self.user_agent, "Accept-Encoding": "gzip, deflate"}
        try:
            response = requests.get(url, headers=headers, timeout=30)
        finally:
            # A request that timed out or errored still counts against the
            # rate limit, so record the timestamp either way.
            self._last_request_at = time.monotonic()
        if response.status_code != 200:
            raise RuntimeError(
                f"SEC fetch failed: GET {url} -> {response.status_code} "
                f"{response.text[:200]!r}"
            )
        # Write to a sibling temp file and rename into place. The cache is
        # trusted blindly on later runs, so an interrupted write must never
        # leave a truncated file under the final name.
        partial = dest.with_name(dest.name + ".part")
        partial.write_bytes(response.content)
        partial.replace(dest)


# ---------------------------------------------------------------------------
# High-level extractor: returns the flat input bag the pipeline needs.
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class TickerYearInputs:
    """Flat bag of XBRL-derived inputs for one ticker-year.

    Built by ``extract_inputs``. Apart from ``ticker`` and
    ``fiscal_year``, every field defaults to ``None``, which
    ``extract_inputs`` uses to mean "no matching fact was found".
    """

    ticker: str
    fiscal_year: int

    # Income statement.
    ebit: float | None = None
    # Stored as a fraction; ``extract_inputs`` rescales percentage-style values.
    effective_tax_rate: float | None = None

    # Long-lived operating assets.
    net_ppe: float | None = None
    goodwill: float | None = None
    intangibles_ex_goodwill: float | None = None

    # Working capital.
    accounts_receivable: float | None = None
    inventory: float | None = None
    accounts_payable: float | None = None
    accrued_liabilities: float | None = None

    # Financial assets, debt and the balance-sheet total.
    # ``cash`` is filled from the "cash_and_equivalents" field.
    cash: float | None = None
    short_term_investments: float | None = None
    long_term_investments: float | None = None
    short_term_debt: float | None = None
    long_term_debt: float | None = None
    total_assets: float | None = None

    # Share count plus the ``filed`` date of the fact it came from.
    shares_outstanding: float | None = None
    shares_outstanding_filed: str | None = None
    # Remaining income-statement and cash-flow items.
    revenue: float | None = None
    capex: float | None = None
    depreciation_amortization: float | None = None
    cfo: float | None = None
    buybacks: float | None = None
    net_income: float | None = None

    total_equity: float | None = None
    acquisitions: float | None = None

    # Provenance: ``period_end`` and ``form`` of the fact chosen for ``ebit``.
    period_end: str | None = None
    source_form: str | None = None


def extract_inputs(facts: CompanyFacts, fiscal_year: int) -> TickerYearInputs:
    """Pull every input the math layer needs from one CompanyFacts.

    Each logical field is resolved with ``first_available``; a field with
    no matching fact is left as ``None``. ``period_end`` and
    ``source_form`` are taken from the fact chosen for ``ebit``, so both
    are ``None`` when no EBIT fact exists even if other fields resolve.
    """

    def _val(field: str) -> float | None:
        fact = first_available(facts, field, fiscal_year)
        return fact.value if fact else None

    ebit_fact = first_available(facts, "ebit", fiscal_year)
    period_end = ebit_fact.period_end if ebit_fact else None
    form = ebit_fact.form if ebit_fact else None

    # Effective tax rate may be published as a fraction (0.18) or a
    # percentage (18). The magnitude decides which, so a negative
    # percentage (-18) is rescaled just like a positive one. A genuine
    # fraction beyond ±1.0 is indistinguishable from a percentage and is
    # therefore also rescaled.
    raw_tax = _val("tax_rate")
    tax_rate = None
    if raw_tax is not None:
        tax_rate = raw_tax / 100.0 if abs(raw_tax) > 1.0 else raw_tax

    # Capture the filed date of the chosen shares_outstanding fact. The
    # pipeline needs this to detect splits-after-FY-end: when a split's
    # date falls between FY-end and the filing date, the SEC-reported
    # share count is already post-split and must be divided back to
    # pre-split basis to match the unadjusted cached price.
    shares_fact = first_available(facts, "shares_outstanding", fiscal_year)

    return TickerYearInputs(
        ticker=facts.ticker,
        fiscal_year=fiscal_year,
        ebit=ebit_fact.value if ebit_fact else None,
        effective_tax_rate=tax_rate,
        net_ppe=_val("net_ppe"),
        goodwill=_val("goodwill"),
        intangibles_ex_goodwill=_val("intangibles_ex_goodwill"),
        accounts_receivable=_val("accounts_receivable"),
        inventory=_val("inventory"),
        accounts_payable=_val("accounts_payable"),
        accrued_liabilities=_val("accrued_liabilities"),
        cash=_val("cash_and_equivalents"),
        short_term_investments=_val("short_term_investments"),
        long_term_investments=_val("long_term_investments"),
        short_term_debt=_val("short_term_debt"),
        long_term_debt=_val("long_term_debt"),
        total_assets=_val("total_assets"),
        shares_outstanding=shares_fact.value if shares_fact else None,
        shares_outstanding_filed=shares_fact.filed if shares_fact else None,
        revenue=_val("revenue"),
        capex=_val("capex"),
        depreciation_amortization=_val("depreciation_amortization"),
        cfo=_val("cfo"),
        buybacks=_val("buybacks"),
        net_income=_val("net_income"),
        total_equity=_val("total_equity"),
        acquisitions=_val("acquisitions"),
        period_end=period_end,
        source_form=form,
    )


def available_fiscal_years(
    facts: CompanyFacts,
    concept: str = "OperatingIncomeLoss",
    unit: str = "USD",
    *,
    by_period_end: bool = False,
) -> list[int]:
    """List of FY periods available for ``concept``, ascending.

    Only facts with ``fiscal_period == "FY"``, a 10-K or 10-K/A form and
    the requested ``unit`` are considered. An unknown concept yields an
    empty list.

    By default a year is the fact's ``fiscal_year``, i.e. the ``fy`` tag
    of the filing that reported it. Prior-year comparatives carry the
    *filing's* ``fy``, so they add no years of their own in this mode.

    With ``by_period_end=True`` a year is instead the year component of
    ``period_end``, which is the key ``CompanyFacts.get_annual`` matches
    on. Facts whose ``period_end`` does not start with four digits are
    ignored in this mode.
    """

    def _year(fact: XBRLFact) -> int | None:
        if not by_period_end:
            return fact.fiscal_year
        head = (fact.period_end or "")[:4]
        if len(head) == 4 and head.isascii() and head.isdigit():
            return int(head)
        return None

    years = {
        _year(f)
        for f in facts.facts.get(concept, [])
        if f.fiscal_period == "FY"
        and f.form in {"10-K", "10-K/A"}
        and f.unit == unit
    }
    years.discard(None)  # facts with an unparseable period_end
    return sorted(years)
