"""High-level LLM extractor. The job is *narrow*: take the raw OCR text plus the partial header that came back from the regex layer, and return an LLM-derived header that the caller can merge in. We never let the LLM populate the personnel table — PP-Structure is more accurate and cheaper for that. """ from __future__ import annotations from datetime import date from pydantic import BaseModel, Field from ocr_sprint.llm.client import LLMUnavailableError, OllamaClient from ocr_sprint.llm.prompts import SYSTEM_HEADER, build_user_prompt from ocr_sprint.schemas.extraction import HeaderFields from ocr_sprint.utils.logging import get_logger _logger = get_logger(__name__) class LLMHeaderResult(BaseModel): """Schema we ask the model to fill. Mirrors ``HeaderFields`` but is intentionally separate so we control exactly what the prompt and validation surface look like — the public ``HeaderFields`` may grow fields later that we don't want the LLM touching. """ nomor_sprint: str | None = None tanggal: date | None = None satuan_penerbit: str | None = None perihal: str | None = None dasar: list[str] = Field(default_factory=list) def llm_fill_header( raw_text: str, regex_header: HeaderFields, *, client: OllamaClient | None = None, ) -> HeaderFields | None: """Run the LLM extractor and return a *merged* HeaderFields. Returns ``None`` if the model is unavailable so the caller can decide what to do (typically: keep the regex result and emit a fallback review flag). """ client = client or OllamaClient() user = build_user_prompt( raw_text=raw_text, regex_partial=regex_header.model_dump(mode="json"), ) try: llm = client.chat_json(SYSTEM_HEADER, user, LLMHeaderResult) except LLMUnavailableError as exc: _logger.warning("llm.unavailable", error=str(exc)) return None return _merge(regex_header, llm) def _merge(regex: HeaderFields, llm: LLMHeaderResult) -> HeaderFields: """Merge LLM output into the regex result. Policy: regex wins for any field it already filled. The LLM only fills the *gaps*. This keeps deterministic / verifiable extractions for the fields where regex is reliable and prevents the LLM from "correcting" a value that happens to look unusual but is in fact correct. """ merged = regex.model_copy(deep=True) if merged.nomor_sprint is None and llm.nomor_sprint: merged.nomor_sprint = llm.nomor_sprint if merged.tanggal is None and llm.tanggal is not None: merged.tanggal = llm.tanggal if not merged.satuan_penerbit and llm.satuan_penerbit: merged.satuan_penerbit = llm.satuan_penerbit if not merged.perihal and llm.perihal: merged.perihal = llm.perihal if not merged.dasar and llm.dasar: merged.dasar = list(llm.dasar) return merged