OCR-SPRIN-SERVICE/tests/unit/test_orchestrator_llm.py

"""Orchestrator-level tests for the Phase 5 hybrid LLM wiring.

These tests stub out the heavy stages (ingest / preprocess / OCR / table)
so we can verify the *branching* behaviour around the LLM step without
booting Paddle.
"""

from __future__ import annotations

from datetime import date

import pytest

from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import _header_has_gaps, run_pipeline
from ocr_sprint.schemas.document import SourceKind
from ocr_sprint.schemas.extraction import HeaderFields, ReviewFlag, Signatory


def test_header_has_gaps_detects_missing_fields() -> None:
    """A fully populated header reports no gaps; incomplete variants do.

    The incomplete variants are an empty header, one with ``perihal`` set
    to ``None`` and one with an empty ``dasar`` list.
    """
    complete_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    assert _header_has_gaps(complete_header) is False

    incomplete_headers = (
        HeaderFields(),
        complete_header.model_copy(update={"perihal": None}),
        complete_header.model_copy(update={"dasar": []}),
    )
    for incomplete in incomplete_headers:
        assert _header_has_gaps(incomplete) is True


def _stub_pipeline_stages(
    monkeypatch: pytest.MonkeyPatch,
    *,
    raw_text: str,
    regex_header: HeaderFields,
) -> None:
    """Replace the orchestrator's pipeline stages with cheap fakes.

    Every stage the orchestrator module looks up by name is patched so the
    pipeline runs without Paddle / PyMuPDF.

    Args:
        monkeypatch: pytest fixture used to install (and later undo) the stubs.
        raw_text: Text of the single OCR line returned by the fake ``run_ocr``.
        regex_header: Header returned by the fake ``extract_header``; tests use
            it to control whether the header has gaps.
    """
    import numpy as np

    from ocr_sprint.pipeline import ocr as ocr_module
    from ocr_sprint.pipeline.ingest import IngestedPage

    # One blank white page carrying a single OCR line with the caller's text.
    blank_image = np.full((100, 100, 3), 255, dtype=np.uint8)
    fake_page = IngestedPage(image=blank_image, page_index=0)
    fake_ocr_page = ocr_module.OCRPage(
        lines=[
            ocr_module.OCRLine(
                text=raw_text,
                confidence=0.95,
                box=((0, 0), (1, 0), (1, 1), (0, 1)),
            )
        ],
    )

    # Ingest / orientation / preprocessing / OCR: pass the fake page through.
    monkeypatch.setattr(orch_module, "detect_source_kind", lambda _: SourceKind.PDF)
    monkeypatch.setattr(orch_module, "ingest", lambda *a, **k: [fake_page])
    monkeypatch.setattr(orch_module, "detect_and_correct", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "preprocess", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "run_ocr", lambda _image: fake_ocr_page)
    # No tables in these tests; individual tests may re-patch extract_personnel.
    monkeypatch.setattr(orch_module, "run_table_extraction", lambda _img: [])
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _tables: [])
    # Header, signatory and validation are stubbed as well: the header is the
    # caller-supplied one (to control gap state), the signatory is empty and
    # validation returns an empty list.
    monkeypatch.setattr(orch_module, "extract_header", lambda _text: regex_header)
    monkeypatch.setattr(orch_module, "find_signatory", lambda _text: Signatory())
    monkeypatch.setattr(orch_module, "validate_extraction", lambda _result: [])


def test_orchestrator_skips_llm_when_disabled(monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``LLM_ENABLED=false`` the LLM filler is never called, even when
    the regex header is completely empty, and no LLM review flag is set.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Drop cached settings so they are rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(),  # all gaps
        )

        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            called["n"] += 1
            return None

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        result = run_pipeline(b"%PDF-1.4\n%fake")
        assert called["n"] == 0
        assert ReviewFlag.LLM_FALLBACK not in result.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in result.result.review_flags
    finally:
        # monkeypatch restores the environment after the test, but settings
        # cached during the run would otherwise leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_skips_llm_when_header_complete(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the LLM enabled but a fully populated regex header, the LLM
    filler is never called.
    """
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop cached settings so they are rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(
                nomor_sprint="Sprin/1/I/2025",
                tanggal=date(2025, 1, 1),
                satuan_penerbit="Polres X",
                perihal="ok",
                dasar=["UU 2/2002"],
            ),
        )

        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            called["n"] += 1
            return None

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        run_pipeline(b"%PDF-1.4\n%fake")
        assert called["n"] == 0
    finally:
        # monkeypatch restores the environment after the test, but settings
        # cached during the run would otherwise leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_calls_llm_and_marks_fallback(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the LLM enabled and a partial regex header, the values returned
    by the LLM filler end up in the result and ``LLM_FALLBACK`` is flagged.
    """
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop cached settings so they are rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        regex_partial = HeaderFields(nomor_sprint="Sprin/1/I/2025")  # rest missing
        _stub_pipeline_stages(monkeypatch, raw_text="dummy text", regex_header=regex_partial)

        def _llm(_raw: str, header: HeaderFields, **_: object) -> HeaderFields:
            # Fake LLM: fill three of the missing fields on top of the regex header.
            return header.model_copy(
                update={
                    "satuan_penerbit": "Polres Bandung",
                    "perihal": "Penyelidikan",
                    "dasar": ["UU 2/2002"],
                }
            )

        monkeypatch.setattr(orch_module, "llm_fill_header", _llm)

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert out.result.header.satuan_penerbit == "Polres Bandung"
        assert out.result.header.perihal == "Penyelidikan"
        assert ReviewFlag.LLM_FALLBACK in out.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in out.result.review_flags
    finally:
        # monkeypatch restores the environment after the test, but settings
        # cached during the run would otherwise leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_marks_unavailable_when_llm_returns_none(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When the LLM filler returns ``None`` for a gappy header, the result is
    flagged ``LLM_UNAVAILABLE`` and not ``LLM_FALLBACK``.
    """
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop cached settings so they are rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=HeaderFields())
        monkeypatch.setattr(orch_module, "llm_fill_header", lambda *_a, **_k: None)

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
        assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
    finally:
        # monkeypatch restores the environment after the test, but settings
        # cached during the run would otherwise leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When PP-Structure produces low-quality rows (e.g. only ``nama`` filled),
    the orchestrator must run the text fallback against the raw OCR text and
    raise the ``personnel_text_fallback`` flag.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Drop cached settings so they are rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        raw_text = (
            "DAFTAR PERSONIL\n"
            "1.\n"
            "SRI WAHYUNI\n"
            "AIPTU / 75070328\n"
            "INTELKAM POLRES CIMAHI\n"
            "2.\n"
            "AGUNG LUKMAN\n"
            "BRIPTU / 99030245\n"
            "SAT INTELKAM\n"
        )

        # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on
        # the real Polres Cimahi document).
        from ocr_sprint.schemas.personnel import PersonnelEntry

        pp_structure_low_quality = [
            PersonnelEntry(nama="SRI WAHYUNI"),
            PersonnelEntry(nama="AGUNG LUKMAN"),
        ]
        _stub_pipeline_stages(
            monkeypatch,
            raw_text=raw_text,
            regex_header=HeaderFields(
                nomor_sprint="Sprin/1/I/2025",
                tanggal=date(2025, 1, 1),
                satuan_penerbit="Polres Cimahi",
                perihal="ok",
                dasar=["UU 2/2002"],
            ),
        )
        # Override extract_personnel to return the broken PP-Structure rows.
        monkeypatch.setattr(
            orch_module, "extract_personnel", lambda _t: pp_structure_low_quality
        )

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags
        # Fallback rows must carry pangkat + nrp (the whole point of the path).
        assert all(r.pangkat and r.nrp for r in out.result.personel)
        assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"}
    finally:
        # monkeypatch restores the environment after the test, but settings
        # cached during the run would otherwise leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Healthy PP-Structure output (rank+nrp present on most rows) must NOT
    be replaced by the text fallback.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Drop cached settings so they are rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        from ocr_sprint.schemas.personnel import PersonnelEntry

        healthy = [
            PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
            PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
            PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
        ]
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="ignored — should not be parsed",
            regex_header=HeaderFields(
                nomor_sprint="Sprin/1/I/2025",
                tanggal=date(2025, 1, 1),
                satuan_penerbit="Polres X",
                perihal="ok",
                dasar=["UU 2/2002"],
            ),
        )
        # Override extract_personnel to return the healthy rows as-is.
        monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy)

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags
        # The rows must come through unchanged and in the original order.
        assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"]
    finally:
        # monkeypatch restores the environment after the test, but settings
        # cached during the run would otherwise leak into later tests.
        get_settings.cache_clear()