"""Orchestrator-level tests for the Phase 5 hybrid LLM wiring. These tests stub out the heavy stages (ingest / preprocess / OCR / table) so we can verify the *branching* behaviour around the LLM step without booting Paddle. """ from __future__ import annotations from datetime import date import pytest from ocr_sprint.pipeline import orchestrator as orch_module from ocr_sprint.pipeline.orchestrator import _header_has_gaps, run_pipeline from ocr_sprint.schemas.document import SourceKind from ocr_sprint.schemas.extraction import HeaderFields, ReviewFlag, Signatory def test_header_has_gaps_detects_missing_fields() -> None: full = HeaderFields( nomor_sprint="Sprin/1/I/2025", tanggal=date(2025, 1, 1), satuan_penerbit="Polres X", perihal="ok", dasar=["UU 2/2002"], ) assert _header_has_gaps(full) is False assert _header_has_gaps(HeaderFields()) is True assert _header_has_gaps(full.model_copy(update={"perihal": None})) is True assert _header_has_gaps(full.model_copy(update={"dasar": []})) is True def _stub_pipeline_stages( monkeypatch: pytest.MonkeyPatch, *, raw_text: str, regex_header: HeaderFields, ) -> None: """Replace ingest -> ocr -> tables with cheap fakes so the orchestrator runs without Paddle / PyMuPDF. """ import numpy as np from ocr_sprint.pipeline import ingest as ingest_module from ocr_sprint.pipeline import ocr as ocr_module from ocr_sprint.pipeline.ingest import IngestedPage img = np.full((100, 100, 3), 255, dtype=np.uint8) fake_page = IngestedPage(image=img, page_index=0) fake_ocr_page = ocr_module.OCRPage( lines=[ ocr_module.OCRLine(text=raw_text, confidence=0.95, box=((0, 0), (1, 0), (1, 1), (0, 1))) ], ) monkeypatch.setattr(orch_module, "detect_source_kind", lambda _: SourceKind.PDF) monkeypatch.setattr(orch_module, "ingest", lambda *a, **k: [fake_page]) monkeypatch.setattr(orch_module, "detect_and_correct", lambda image, _cfg: image) monkeypatch.setattr(orch_module, "preprocess", lambda image, _cfg: image) monkeypatch.setattr(orch_module, "run_ocr", lambda _image: fake_ocr_page) # No tables in these tests. monkeypatch.setattr(orch_module, "run_table_extraction", lambda _img: []) monkeypatch.setattr(orch_module, "extract_personnel", lambda _tables: []) # Header / signatory / validators come from the real implementation # for `extract_header`, but we override to control gap state. monkeypatch.setattr(orch_module, "extract_header", lambda _text: regex_header) monkeypatch.setattr(orch_module, "find_signatory", lambda _text: Signatory()) monkeypatch.setattr(orch_module, "validate_extraction", lambda _result: []) # Keep ingest_module referenced so import isn't dropped. assert ingest_module is not None def test_orchestrator_skips_llm_when_disabled(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("LLM_ENABLED", "false") from ocr_sprint.config import get_settings get_settings.cache_clear() _stub_pipeline_stages( monkeypatch, raw_text="dummy", regex_header=HeaderFields(), # all gaps ) called = {"n": 0} def _trip(*_args: object, **_kwargs: object) -> None: called["n"] += 1 return None monkeypatch.setattr(orch_module, "llm_fill_header", _trip) result = run_pipeline(b"%PDF-1.4\n%fake") assert called["n"] == 0 assert ReviewFlag.LLM_FALLBACK not in result.result.review_flags assert ReviewFlag.LLM_UNAVAILABLE not in result.result.review_flags def test_orchestrator_skips_llm_when_header_complete(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("LLM_ENABLED", "true") from ocr_sprint.config import get_settings get_settings.cache_clear() _stub_pipeline_stages( monkeypatch, raw_text="dummy", regex_header=HeaderFields( nomor_sprint="Sprin/1/I/2025", tanggal=date(2025, 1, 1), satuan_penerbit="Polres X", perihal="ok", dasar=["UU 2/2002"], ), ) called = {"n": 0} def _trip(*_args: object, **_kwargs: object) -> None: called["n"] += 1 return None monkeypatch.setattr(orch_module, "llm_fill_header", _trip) run_pipeline(b"%PDF-1.4\n%fake") assert called["n"] == 0 def test_orchestrator_calls_llm_and_marks_fallback(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("LLM_ENABLED", "true") from ocr_sprint.config import get_settings get_settings.cache_clear() regex_partial = HeaderFields(nomor_sprint="Sprin/1/I/2025") # rest missing _stub_pipeline_stages(monkeypatch, raw_text="dummy text", regex_header=regex_partial) def _llm(_raw: str, header: HeaderFields, **_: object) -> HeaderFields: return header.model_copy( update={ "satuan_penerbit": "Polres Bandung", "perihal": "Penyelidikan", "dasar": ["UU 2/2002"], } ) monkeypatch.setattr(orch_module, "llm_fill_header", _llm) out = run_pipeline(b"%PDF-1.4\n%fake") assert out.result.header.satuan_penerbit == "Polres Bandung" assert out.result.header.perihal == "Penyelidikan" assert ReviewFlag.LLM_FALLBACK in out.result.review_flags assert ReviewFlag.LLM_UNAVAILABLE not in out.result.review_flags def test_orchestrator_marks_unavailable_when_llm_returns_none( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setenv("LLM_ENABLED", "true") from ocr_sprint.config import get_settings get_settings.cache_clear() _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=HeaderFields()) monkeypatch.setattr(orch_module, "llm_fill_header", lambda *_a, **_k: None) out = run_pipeline(b"%PDF-1.4\n%fake") assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names( monkeypatch: pytest.MonkeyPatch, ) -> None: """When PP-Structure produces low-quality rows (e.g. only ``nama`` filled), the orchestrator must run the text fallback against the raw OCR text and raise the ``personnel_text_fallback`` flag. """ monkeypatch.setenv("LLM_ENABLED", "false") from ocr_sprint.config import get_settings get_settings.cache_clear() raw_text = ( "DAFTAR PERSONIL\n" "1.\n" "SRI WAHYUNI\n" "AIPTU / 75070328\n" "INTELKAM POLRES CIMAHI\n" "2.\n" "AGUNG LUKMAN\n" "BRIPTU / 99030245\n" "SAT INTELKAM\n" ) # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on # the real Polres Cimahi document). from ocr_sprint.schemas.personnel import PersonnelEntry pp_structure_low_quality = [ PersonnelEntry(nama="SRI WAHYUNI"), PersonnelEntry(nama="AGUNG LUKMAN"), ] _stub_pipeline_stages( monkeypatch, raw_text=raw_text, regex_header=HeaderFields( nomor_sprint="Sprin/1/I/2025", tanggal=date(2025, 1, 1), satuan_penerbit="Polres Cimahi", perihal="ok", dasar=["UU 2/2002"], ), ) # Override extract_personnel to return the broken PP-Structure rows. monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality) out = run_pipeline(b"%PDF-1.4\n%fake") assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags # Fallback rows must carry pangkat + nrp (the whole point of the path). assert all(r.pangkat and r.nrp for r in out.result.personel) assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"} def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high( monkeypatch: pytest.MonkeyPatch, ) -> None: """Healthy PP-Structure output (rank+nrp present on most rows) must NOT be replaced by the text fallback. """ monkeypatch.setenv("LLM_ENABLED", "false") from ocr_sprint.config import get_settings get_settings.cache_clear() from ocr_sprint.schemas.personnel import PersonnelEntry healthy = [ PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"), PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"), PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"), ] _stub_pipeline_stages( monkeypatch, raw_text="ignored — should not be parsed", regex_header=HeaderFields( nomor_sprint="Sprin/1/I/2025", tanggal=date(2025, 1, 1), satuan_penerbit="Polres X", perihal="ok", dasar=["UU 2/2002"], ), ) monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy) out = run_pipeline(b"%PDF-1.4\n%fake") assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"]