Adds a small Ollama HTTP client (httpx-based, no extra runtime deps),
prompt builders, and a hybrid header extractor that runs *after* the
deterministic regex layer. The merger never overwrites a regex-filled
field — the LLM only fills gaps. If LLM_ENABLED=false (the default), or
the Ollama server is unreachable, the pipeline degrades gracefully:
- LLM_ENABLED=false -> no LLM call at all, no flag.
- LLM_ENABLED=true, header complete -> no LLM call.
- LLM_ENABLED=true, header has gaps, LLM responded ok -> merge + LLM_FALLBACK flag (review hint).
- LLM_ENABLED=true, header has gaps, LLM unavailable -> keep regex result + LLM_UNAVAILABLE flag.
Default model qwen2.5:1.5b on http://localhost:11434 — chosen for CPU
throughput (~5-15s per call) at acceptable accuracy. The LLM only fills
the *header* (nomor, tanggal, satuan, perihal, dasar). Personnel rows
stay with PP-Structure since that's more accurate and doesn't need LLM.
Tests:
- test_llm_client.py: httpx MockTransport-driven tests for the wire
format, error paths (HTTP 5xx, malformed JSON, missing envelope,
ConnectError), and request shape.
- test_llm_extractor.py: merge policy + None-on-unavailable behaviour.
- test_orchestrator_llm.py: end-to-end orchestrator wiring with stubs
for ingest/preprocess/OCR/table — verifies LLM is skipped when
disabled, skipped when header is complete, called and flagged when
gaps exist, and marked unavailable when the client returns None.
162 unit tests pass total (was 146).
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
172 lines
5.9 KiB
Python
172 lines
5.9 KiB
Python
"""Orchestrator-level tests for the Phase 5 hybrid LLM wiring.
|
|
|
|
These tests stub out the heavy stages (ingest / preprocess / OCR / table)
|
|
so we can verify the *branching* behaviour around the LLM step without
|
|
booting Paddle.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
|
|
import pytest
|
|
|
|
from ocr_sprint.pipeline import orchestrator as orch_module
|
|
from ocr_sprint.pipeline.orchestrator import _header_has_gaps, run_pipeline
|
|
from ocr_sprint.schemas.document import SourceKind
|
|
from ocr_sprint.schemas.extraction import HeaderFields, ReviewFlag, Signatory
|
|
|
|
|
|
def test_header_has_gaps_detects_missing_fields() -> None:
    """Check ``_header_has_gaps`` on a complete header and three gapped ones."""
    complete = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    # Every field populated: nothing for a fallback to fill.
    assert _header_has_gaps(complete) is False

    # A default-constructed header, then the complete header with a single
    # field knocked out (``None`` scalar, empty list).
    gapped_headers = (
        HeaderFields(),
        complete.model_copy(update={"perihal": None}),
        complete.model_copy(update={"dasar": []}),
    )
    for candidate in gapped_headers:
        assert _header_has_gaps(candidate) is True
|
|
|
|
|
|
def _stub_pipeline_stages(
    monkeypatch: pytest.MonkeyPatch,
    *,
    raw_text: str,
    regex_header: HeaderFields,
) -> None:
    """Replace the orchestrator's heavy stages with cheap in-memory fakes.

    Every patch is applied to names on the orchestrator module, so
    ``run_pipeline`` can run without the real ingest / preprocess / OCR /
    table stages. The pipeline sees one all-white 100x100 page whose OCR
    output is a single line containing ``raw_text``, no tables and no
    personnel rows. Header extraction is pinned to ``regex_header`` so each
    test decides exactly which header fields are missing; the signatory
    lookup returns an empty ``Signatory`` and validation returns no flags.

    Args:
        monkeypatch: pytest fixture used to install the patches.
        raw_text: Text of the single fake OCR line.
        regex_header: Header returned by the stubbed ``extract_header``.
    """
    import numpy as np

    from ocr_sprint.pipeline import ocr as ocr_module
    from ocr_sprint.pipeline.ingest import IngestedPage

    img = np.full((100, 100, 3), 255, dtype=np.uint8)
    fake_page = IngestedPage(image=img, page_index=0)
    fake_ocr_page = ocr_module.OCRPage(
        lines=[
            ocr_module.OCRLine(
                text=raw_text,
                confidence=0.95,
                box=((0, 0), (1, 0), (1, 1), (0, 1)),
            )
        ],
    )

    # Ingest + image preprocessing: one fake page, images passed through.
    monkeypatch.setattr(orch_module, "detect_source_kind", lambda _: SourceKind.PDF)
    monkeypatch.setattr(orch_module, "ingest", lambda *a, **k: [fake_page])
    monkeypatch.setattr(orch_module, "detect_and_correct", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "preprocess", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "run_ocr", lambda _image: fake_ocr_page)

    # No tables in these tests.
    monkeypatch.setattr(orch_module, "run_table_extraction", lambda _img: [])
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _tables: [])

    # Header, signatory and validation are overridden too (not the real
    # implementations) so the header gap state is fully controlled by the
    # caller and no validator adds review flags of its own.
    monkeypatch.setattr(orch_module, "extract_header", lambda _text: regex_header)
    monkeypatch.setattr(orch_module, "find_signatory", lambda _text: Signatory())
    monkeypatch.setattr(orch_module, "validate_extraction", lambda _result: [])
|
|
|
|
|
|
def test_orchestrator_skips_llm_when_disabled(monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``LLM_ENABLED=false`` the LLM step is never called and sets no flag."""
    from ocr_sprint.config import get_settings

    monkeypatch.setenv("LLM_ENABLED", "false")
    # get_settings is memoised (it exposes cache_clear); drop any cached
    # value so the next call is rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(),  # all gaps
        )

        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            # Any invocation at all means the orchestrator ignored the switch.
            called["n"] += 1
            return None

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        result = run_pipeline(b"%PDF-1.4\n%fake")
        assert called["n"] == 0
        assert ReviewFlag.LLM_FALLBACK not in result.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in result.result.review_flags
    finally:
        # monkeypatch restores the env var at teardown but cannot evict the
        # memoised settings object; clear it so later tests do not inherit
        # this test's LLM_ENABLED value.
        get_settings.cache_clear()
|
|
|
|
|
|
def test_orchestrator_skips_llm_when_header_complete(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the LLM enabled but no header gaps, the LLM step is never called."""
    from ocr_sprint.config import get_settings

    monkeypatch.setenv("LLM_ENABLED", "true")
    # get_settings is memoised (it exposes cache_clear); drop any cached
    # value so the next call is rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(
                nomor_sprint="Sprin/1/I/2025",
                tanggal=date(2025, 1, 1),
                satuan_penerbit="Polres X",
                perihal="ok",
                dasar=["UU 2/2002"],
            ),
        )

        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            # Any invocation means the LLM ran despite a complete header.
            called["n"] += 1
            return None

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        run_pipeline(b"%PDF-1.4\n%fake")
        assert called["n"] == 0
    finally:
        # monkeypatch restores the env var at teardown but cannot evict the
        # memoised settings object; clear it so later tests do not run with
        # this test's LLM_ENABLED=true still in effect.
        get_settings.cache_clear()
|
|
|
|
|
|
def test_orchestrator_calls_llm_and_marks_fallback(monkeypatch: pytest.MonkeyPatch) -> None:
    """With gaps and a responding LLM, filled fields land in the result and
    the run is flagged ``LLM_FALLBACK`` (and not ``LLM_UNAVAILABLE``).
    """
    from ocr_sprint.config import get_settings

    monkeypatch.setenv("LLM_ENABLED", "true")
    # get_settings is memoised (it exposes cache_clear); drop any cached
    # value so the next call is rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        regex_partial = HeaderFields(nomor_sprint="Sprin/1/I/2025")  # rest missing
        _stub_pipeline_stages(monkeypatch, raw_text="dummy text", regex_header=regex_partial)

        def _llm(_raw: str, header: HeaderFields, **_: object) -> HeaderFields:
            # Stand-in for the LLM extractor: return the header it was given
            # with three of the missing fields filled in.
            return header.model_copy(
                update={
                    "satuan_penerbit": "Polres Bandung",
                    "perihal": "Penyelidikan",
                    "dasar": ["UU 2/2002"],
                }
            )

        monkeypatch.setattr(orch_module, "llm_fill_header", _llm)

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert out.result.header.satuan_penerbit == "Polres Bandung"
        assert out.result.header.perihal == "Penyelidikan"
        assert ReviewFlag.LLM_FALLBACK in out.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in out.result.review_flags
    finally:
        # monkeypatch restores the env var at teardown but cannot evict the
        # memoised settings object; clear it so later tests do not run with
        # this test's LLM_ENABLED=true still in effect.
        get_settings.cache_clear()
|
|
|
|
|
|
def test_orchestrator_marks_unavailable_when_llm_returns_none(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With gaps and an LLM step that returns ``None``, the run is flagged
    ``LLM_UNAVAILABLE`` and not ``LLM_FALLBACK``.
    """
    from ocr_sprint.config import get_settings

    monkeypatch.setenv("LLM_ENABLED", "true")
    # get_settings is memoised (it exposes cache_clear); drop any cached
    # value so the next call is rebuilt under the patched environment.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=HeaderFields())
        # ``None`` is the stubbed extractor's "could not get an answer" result.
        monkeypatch.setattr(orch_module, "llm_fill_header", lambda *_a, **_k: None)

        out = run_pipeline(b"%PDF-1.4\n%fake")
        assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
        assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
    finally:
        # monkeypatch restores the env var at teardown but cannot evict the
        # memoised settings object; clear it so later tests do not run with
        # this test's LLM_ENABLED=true still in effect.
        get_settings.cache_clear()
|