"""Unit tests for the hybrid LLM header extractor / merger.""" from __future__ import annotations from datetime import date import pytest from pydantic import BaseModel from ocr_sprint.llm.client import LLMUnavailableError, OllamaClient from ocr_sprint.llm.extractor import LLMHeaderResult, _merge, llm_fill_header from ocr_sprint.schemas.extraction import HeaderFields class _StubClient(OllamaClient): """Test double that bypasses HTTP entirely.""" def __init__(self, payload: LLMHeaderResult | Exception) -> None: # Skip the real __init__ — we don't need any real config. self._payload = payload def chat_json( # type: ignore[override] self, system: str, user: str, schema_cls: type[BaseModel] ) -> BaseModel: if isinstance(self._payload, Exception): raise self._payload return self._payload def test_merge_keeps_regex_when_present() -> None: regex = HeaderFields(nomor_sprint="Sprin/123/IV/2025/Reskrim", tanggal=date(2025, 4, 21)) llm = LLMHeaderResult(nomor_sprint="HALLUCINATED", tanggal=date(1999, 1, 1), perihal="ok") out = _merge(regex, llm) assert out.nomor_sprint == "Sprin/123/IV/2025/Reskrim" assert out.tanggal == date(2025, 4, 21) # Gaps get filled. assert out.perihal == "ok" def test_merge_fills_gaps() -> None: regex = HeaderFields() # all None llm = LLMHeaderResult( nomor_sprint="Sprin/9/IX/2024", tanggal=date(2024, 9, 1), satuan_penerbit="Polres Bandung", perihal="Penyelidikan", dasar=["UU 2/2002", "Perkap 6/2017"], ) out = _merge(regex, llm) assert out.nomor_sprint == "Sprin/9/IX/2024" assert out.tanggal == date(2024, 9, 1) assert out.satuan_penerbit == "Polres Bandung" assert out.perihal == "Penyelidikan" assert out.dasar == ["UU 2/2002", "Perkap 6/2017"] def test_llm_fill_header_returns_merged_when_client_succeeds() -> None: regex = HeaderFields(nomor_sprint="Sprin/1/I/2025") # has nomor, missing rest stub = _StubClient( LLMHeaderResult( satuan_penerbit="Polres Bandung", perihal="Penyelidikan", dasar=["UU 2/2002"], ) ) out = llm_fill_header(raw_text="...", regex_header=regex, client=stub) assert out is not None assert out.nomor_sprint == "Sprin/1/I/2025" assert out.satuan_penerbit == "Polres Bandung" assert out.perihal == "Penyelidikan" assert out.dasar == ["UU 2/2002"] def test_llm_fill_header_returns_none_when_unavailable() -> None: stub = _StubClient(LLMUnavailableError("server down")) out = llm_fill_header(raw_text="...", regex_header=HeaderFields(), client=stub) assert out is None def test_merge_does_not_overwrite_dasar_when_regex_has_it() -> None: regex = HeaderFields(dasar=["UU 2/2002"]) llm = LLMHeaderResult(dasar=["something else", "more"]) out = _merge(regex, llm) assert out.dasar == ["UU 2/2002"] def test_llm_extractor_unused_argument_kept_silent() -> None: # A trivial sanity check that the public function signature accepts # keyword-only `client` — this matches how the orchestrator calls it. pytest.importorskip("ocr_sprint.llm.extractor")