"""High-level LLM extractor.

The job is *narrow*: take the raw OCR text plus the partial header that
came back from the regex layer, and return an LLM-derived header that the
caller can merge in. We never let the LLM populate the personnel table —
PP-Structure is more accurate and cheaper for that.
"""

from __future__ import annotations

from datetime import date

from pydantic import BaseModel, Field

from ocr_sprint.llm.client import LLMUnavailableError, OllamaClient
from ocr_sprint.llm.prompts import SYSTEM_HEADER, build_user_prompt
from ocr_sprint.schemas.extraction import HeaderFields
from ocr_sprint.utils.logging import get_logger

_logger = get_logger(__name__)


class LLMHeaderResult(BaseModel):
    """Schema we ask the model to fill. Mirrors ``HeaderFields`` but is
    intentionally separate so we control exactly what the prompt and
    validation surface look like — the public ``HeaderFields`` may grow
    fields later that we don't want the LLM touching.

    Every field carries a default (``None`` or a fresh empty list), so an
    instance can be built from any subset of the fields. ``llm_fill_header``
    hands this class to ``client.chat_json`` and ``_merge`` reads the five
    attributes below.
    """

    # NOTE(review): the field names are Indonesian; the English glosses in
    # the comments below are a literal reading of the names — confirm them
    # against the prompt text before relying on them.

    # Document number ("nomor sprint"); None when not supplied.
    nomor_sprint: str | None = None
    # Document date ("tanggal"), typed as ``datetime.date``; None when not
    # supplied.
    tanggal: date | None = None
    # Issuing unit ("satuan penerbit"); None when not supplied.
    satuan_penerbit: str | None = None
    # Subject line ("perihal"); None when not supplied.
    perihal: str | None = None
    # Basis / reference entries ("dasar"). ``default_factory`` gives each
    # instance its own empty list rather than a shared one.
    dasar: list[str] = Field(default_factory=list)


def llm_fill_header(
    raw_text: str,
    regex_header: HeaderFields,
    *,
    client: OllamaClient | None = None,
) -> HeaderFields | None:
    """Ask the LLM for header fields and fold them into ``regex_header``.

    ``raw_text`` and a JSON-mode dump of ``regex_header`` are passed to
    ``build_user_prompt``; the resulting prompt is sent together with
    ``SYSTEM_HEADER`` through ``client.chat_json``, which is asked for an
    ``LLMHeaderResult``. When no (truthy) ``client`` is supplied, a new
    ``OllamaClient`` is constructed for the call.

    On success the return value is ``_merge(regex_header, <llm result>)``.
    If the client raises ``LLMUnavailableError`` a warning is logged and
    ``None`` is returned, leaving the decision to the caller (typically:
    keep the regex result and emit a fallback review flag).
    """
    if not client:
        client = OllamaClient()

    regex_partial = regex_header.model_dump(mode="json")
    user_prompt = build_user_prompt(
        raw_text=raw_text,
        regex_partial=regex_partial,
    )

    try:
        llm_header = client.chat_json(SYSTEM_HEADER, user_prompt, LLMHeaderResult)
    except LLMUnavailableError as exc:
        # Model is down / unreachable: report it and let the caller fall back.
        _logger.warning("llm.unavailable", error=str(exc))
        return None
    else:
        return _merge(regex_header, llm_header)


def _merge(regex: HeaderFields, llm: LLMHeaderResult) -> HeaderFields:
    """Merge LLM output into the regex result.

    Policy: regex wins for any field it already filled. The LLM only fills
    the *gaps*. This keeps deterministic / verifiable extractions for the
    fields where regex is reliable and prevents the LLM from "correcting"
    a value that happens to look unusual but is in fact correct.

    A field counts as a gap when the regex value is falsy (``None``, ``""``
    or an empty list); ``tanggal`` is the exception and is only a gap when
    it is ``None``. A gap is filled only when the LLM supplied a non-empty
    value for it.

    ``regex`` is not mutated: the merge works on a deep copy, and the
    ``dasar`` list is copied so the result does not alias ``llm.dasar``.
    Returns the merged copy.
    """
    merged = regex.model_copy(deep=True)

    # Text fields: an empty string is "not filled", same as None. All three
    # use the same falsiness test so an empty regex match never blocks the
    # LLM value (previously ``nomor_sprint`` alone required ``is None``).
    if not merged.nomor_sprint and llm.nomor_sprint:
        merged.nomor_sprint = llm.nomor_sprint

    # Dates have no "empty" value, so the gap check is an explicit None test.
    if merged.tanggal is None and llm.tanggal is not None:
        merged.tanggal = llm.tanggal

    if not merged.satuan_penerbit and llm.satuan_penerbit:
        merged.satuan_penerbit = llm.satuan_penerbit

    if not merged.perihal and llm.perihal:
        merged.perihal = llm.perihal

    if not merged.dasar and llm.dasar:
        # Copy so later edits to the merged header don't touch the LLM result.
        merged.dasar = list(llm.dasar)

    return merged