# OCR-SPRIN-SERVICE/tests/unit/test_orchestrator_llm.py

"""Orchestrator-level tests for the Phase 5 hybrid LLM wiring.

These tests stub out the heavy stages (ingest / preprocess / OCR / table)
so we can verify the *branching* behaviour around the LLM step without
booting Paddle.
"""

from __future__ import annotations

from datetime import date

import pytest

from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import _header_has_gaps, run_pipeline
from ocr_sprint.schemas.document import SourceKind
from ocr_sprint.schemas.extraction import HeaderFields, ReviewFlag, Signatory


def test_header_has_gaps_detects_missing_fields() -> None:
    """Check ``_header_has_gaps`` on one populated and three incomplete headers.

    A header built with the five fields below must report no gaps. An empty
    header, one whose ``perihal`` is ``None`` and one whose ``dasar`` list is
    empty must each report gaps.
    """
    populated = {
        "nomor_sprint": "Sprin/1/I/2025",
        "tanggal": date(2025, 1, 1),
        "satuan_penerbit": "Polres X",
        "perihal": "ok",
        "dasar": ["UU 2/2002"],
    }
    complete = HeaderFields(**populated)
    assert _header_has_gaps(complete) is False

    incomplete_variants = (
        HeaderFields(),
        complete.model_copy(update={"perihal": None}),
        complete.model_copy(update={"dasar": []}),
    )
    for variant in incomplete_variants:
        assert _header_has_gaps(variant) is True


def _stub_pipeline_stages(
    monkeypatch: pytest.MonkeyPatch,
    *,
    raw_text: str,
    regex_header: HeaderFields,
) -> None:
    """Replace the orchestrator's stage functions with cheap fakes.

    Every stage name patched below is swapped on the orchestrator module so
    ``run_pipeline`` can run without Paddle / PyMuPDF.

    Args:
        monkeypatch: pytest fixture used to install (and later undo) the fakes.
        raw_text: Text of the single OCR line returned by the fake OCR stage.
        regex_header: Header returned by the fake ``extract_header``; tests use
            it to control whether the header has gaps.
    """
    import numpy as np

    from ocr_sprint.pipeline import ocr as ocr_module
    from ocr_sprint.pipeline.ingest import IngestedPage

    # One 100x100 three-channel page filled with 255.
    img = np.full((100, 100, 3), 255, dtype=np.uint8)
    fake_page = IngestedPage(image=img, page_index=0)
    fake_ocr_page = ocr_module.OCRPage(
        lines=[
            ocr_module.OCRLine(text=raw_text, confidence=0.95, box=((0, 0), (1, 0), (1, 1), (0, 1)))
        ],
    )

    # Stage name on the orchestrator module -> fake installed in its place.
    stubs = {
        "detect_source_kind": lambda _: SourceKind.PDF,
        "ingest": lambda *a, **k: [fake_page],
        "detect_and_correct": lambda image, _cfg: image,
        "preprocess": lambda image, _cfg: image,
        "run_ocr": lambda _image: fake_ocr_page,
        # No tables in these tests.
        "run_table_extraction": lambda _img: [],
        "extract_personnel": lambda _tables: [],
        # Header extraction is stubbed so each test controls the gap state;
        # signatory lookup and validation are stubbed to empty results.
        "extract_header": lambda _text: regex_header,
        "find_signatory": lambda _text: Signatory(),
        "validate_extraction": lambda _result: [],
    }
    for stage_name, fake in stubs.items():
        monkeypatch.setattr(orch_module, stage_name, fake)


def test_orchestrator_skips_llm_when_disabled(monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``LLM_ENABLED=false`` the pipeline must not call ``llm_fill_header``.

    The stubbed regex header is left empty, and the test asserts that the
    LLM helper is never invoked and that neither LLM review flag is set.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment is re-read.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(),  # all gaps
        )

        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            called["n"] += 1

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        result = run_pipeline(b"%PDF-1.4\n%fake")
        assert called["n"] == 0
        assert ReviewFlag.LLM_FALLBACK not in result.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in result.result.review_flags
    finally:
        # monkeypatch restores the env var on teardown but not the settings
        # cache; clear it so settings built here do not leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_skips_llm_when_header_complete(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the LLM enabled but a fully populated header, the LLM is not called.

    The stubbed regex header has every field set that the gap test in this
    module treats as complete, so ``llm_fill_header`` must never be invoked.
    """
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment is re-read.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(
                nomor_sprint="Sprin/1/I/2025",
                tanggal=date(2025, 1, 1),
                satuan_penerbit="Polres X",
                perihal="ok",
                dasar=["UU 2/2002"],
            ),
        )

        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            called["n"] += 1

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        run_pipeline(b"%PDF-1.4\n%fake")
        assert called["n"] == 0
    finally:
        # monkeypatch restores the env var on teardown but not the settings
        # cache; clear it so settings built here do not leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_calls_llm_and_marks_fallback(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the LLM enabled and a partial header, LLM output is used and flagged.

    The fake ``llm_fill_header`` returns a copy of the header it receives with
    ``satuan_penerbit``, ``perihal`` and ``dasar`` filled in. The test asserts
    those values reach the result and that ``LLM_FALLBACK`` (but not
    ``LLM_UNAVAILABLE``) is among the review flags.
    """
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment is re-read.
    get_settings.cache_clear()
    try:
        regex_partial = HeaderFields(nomor_sprint="Sprin/1/I/2025")  # rest missing
        _stub_pipeline_stages(monkeypatch, raw_text="dummy text", regex_header=regex_partial)

        def _llm(_raw: str, header: HeaderFields, **_: object) -> HeaderFields:
            return header.model_copy(
                update={
                    "satuan_penerbit": "Polres Bandung",
                    "perihal": "Penyelidikan",
                    "dasar": ["UU 2/2002"],
                }
            )

        monkeypatch.setattr(orch_module, "llm_fill_header", _llm)

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert out.result.header.satuan_penerbit == "Polres Bandung"
        assert out.result.header.perihal == "Penyelidikan"
        assert ReviewFlag.LLM_FALLBACK in out.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in out.result.review_flags
    finally:
        # monkeypatch restores the env var on teardown but not the settings
        # cache; clear it so settings built here do not leak into later tests.
        get_settings.cache_clear()


def test_orchestrator_marks_unavailable_when_llm_returns_none(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With the LLM enabled, a ``None`` reply is flagged as ``LLM_UNAVAILABLE``.

    The stubbed regex header is empty and the fake ``llm_fill_header`` always
    returns ``None``. The test asserts ``LLM_UNAVAILABLE`` is set and
    ``LLM_FALLBACK`` is not.
    """
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment is re-read.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=HeaderFields())
        monkeypatch.setattr(orch_module, "llm_fill_header", lambda *_a, **_k: None)

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
        assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
    finally:
        # monkeypatch restores the env var on teardown but not the settings
        # cache; clear it so settings built here do not leak into later tests.
        get_settings.cache_clear()