Phase 5: hybrid LLM extraction (Ollama) for header gaps

Adds a small Ollama HTTP client (httpx-based, no extra runtime deps),
prompt builders, and a hybrid header extractor that runs *after* the
deterministic regex layer. The merger never overwrites a regex-filled
field — the LLM only fills gaps. If LLM_ENABLED=false (the default), or
the Ollama server is unreachable, the pipeline degrades gracefully:

  - LLM_ENABLED=false  ->  no LLM call at all, no flag.
  - LLM_ENABLED=true,
    header complete    ->  no LLM call.
  - LLM_ENABLED=true,
    header has gaps,
    LLM responded ok   ->  merge + LLM_FALLBACK flag (review hint).
  - LLM_ENABLED=true,
    header has gaps,
    LLM unavailable    ->  keep regex result + LLM_UNAVAILABLE flag.

The default model is qwen2.5:1.5b on http://localhost:11434, chosen for
CPU throughput (~5-15s per call) at acceptable accuracy. The LLM only
fills the *header* (nomor, tanggal, satuan, perihal, dasar). Personnel
rows stay with PP-Structure, since that path is more accurate and does
not need an LLM.

Tests:
 - test_llm_client.py: httpx MockTransport-driven tests for the wire
   format, error paths (HTTP 5xx, malformed JSON, missing envelope,
   ConnectError), and request shape.
 - test_llm_extractor.py: merge policy + None-on-unavailable behaviour.
 - test_orchestrator_llm.py: end-to-end orchestrator wiring with stubs
   for ingest/preprocess/OCR/table — verifies LLM is skipped when
   disabled, skipped when header is complete, called and flagged when
   gaps exist, and marked unavailable when the client returns None.

162 unit tests pass in total (was 146).

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
Devin AI
2026-04-25 16:56:43 +00:00
parent 2112023b6e
commit 45fbfdabb7
9 changed files with 646 additions and 1 deletions

View File

@@ -0,0 +1,108 @@
"""Unit tests for the Ollama HTTP client wrapper.
We swap ``httpx.Client`` inside ``ocr_sprint.llm.client`` for a builder that
returns a real ``httpx.Client`` wrapping a ``MockTransport``. Capturing the
original constructor *before* patching avoids infinite recursion in the
patched callable.
"""
from __future__ import annotations
from typing import Any
import httpx
import pytest
from pydantic import BaseModel
import ocr_sprint.llm.client as llm_client_module
from ocr_sprint.llm.client import LLMUnavailableError, OllamaClient
class _Schema(BaseModel):
    # Minimal two-field model used as the validation target for the
    # ``chat_json`` calls in this file: one string field, one int field.
    foo: str
    bar: int
def _ollama_envelope(content: str) -> dict[str, object]:
"""Mimic the shape Ollama's /api/chat returns."""
return {"message": {"role": "assistant", "content": content}, "done": True}
def _patch_transport(
    monkeypatch: pytest.MonkeyPatch,
    handler: Any,
) -> None:
    """Make every ``httpx.Client`` built via the client module use a mock transport.

    ``handler`` is handed to ``httpx.MockTransport`` and therefore receives
    each outgoing request. The replacement constructor ignores positional
    arguments and forwards all keyword arguments except ``transport``.
    """
    # Hold on to the genuine class first: the replacement below calls it, and
    # looking it up after patching would make the factory call itself.
    original_ctor = httpx.Client
    mock_transport = httpx.MockTransport(handler)

    def _build_client(*_args: object, **kwargs: object) -> httpx.Client:
        # Any transport the caller asked for is discarded in favour of the mock.
        forwarded = {key: value for key, value in kwargs.items() if key != "transport"}
        return original_ctor(transport=mock_transport, **forwarded)

    monkeypatch.setattr(llm_client_module.httpx, "Client", _build_client)
def test_chat_json_returns_validated_model(monkeypatch: pytest.MonkeyPatch) -> None:
    """A well-formed reply is parsed into the schema; the request hits /api/chat."""
    seen: dict[str, object] = {}

    def _record_and_reply(request: httpx.Request) -> httpx.Response:
        # Keep the URL and raw body so the request shape can be checked below.
        seen.update(url=str(request.url), body=request.read())
        return httpx.Response(200, json=_ollama_envelope('{"foo": "x", "bar": 7}'))

    _patch_transport(monkeypatch, _record_and_reply)
    ollama = OllamaClient(base_url="http://ollama:11434", model="m", timeout_s=5)

    parsed = ollama.chat_json("system msg", "user msg", _Schema)

    assert parsed == _Schema(foo="x", bar=7)
    assert seen["url"] == "http://ollama:11434/api/chat"
    payload = seen["body"]
    assert isinstance(payload, bytes)
    # The serialized request must ask for JSON output and carry the system prompt.
    assert b'"format":"json"' in payload
    assert b'"system msg"' in payload
def test_chat_json_raises_on_http_error(monkeypatch: pytest.MonkeyPatch) -> None:
    """A 500 response from the server is reported as ``LLMUnavailableError``."""
    _patch_transport(monkeypatch, lambda _request: httpx.Response(500, text="boom"))
    ollama = OllamaClient(base_url="http://x", model="m", timeout_s=5)

    with pytest.raises(LLMUnavailableError, match="Ollama request failed"):
        ollama.chat_json("s", "u", _Schema)
def test_chat_json_raises_on_invalid_json(monkeypatch: pytest.MonkeyPatch) -> None:
    """Assistant content that is not JSON is reported as a schema failure."""
    _patch_transport(
        monkeypatch,
        lambda _request: httpx.Response(200, json=_ollama_envelope("this is not json")),
    )
    ollama = OllamaClient(base_url="http://x", model="m", timeout_s=5)

    with pytest.raises(LLMUnavailableError, match="schema"):
        ollama.chat_json("s", "u", _Schema)
def test_chat_json_raises_on_missing_envelope(monkeypatch: pytest.MonkeyPatch) -> None:
    """A 200 reply without the message/content envelope is reported as unavailable."""
    _patch_transport(
        monkeypatch,
        lambda _request: httpx.Response(200, json={"oops": True}),
    )
    ollama = OllamaClient(base_url="http://x", model="m", timeout_s=5)

    # The error text is expected to name the missing ``message.content`` path.
    with pytest.raises(LLMUnavailableError, match=r"message\.content"):
        ollama.chat_json("s", "u", _Schema)
def test_chat_json_raises_on_connection_error(monkeypatch: pytest.MonkeyPatch) -> None:
    """A transport-level connect failure is reported as ``LLMUnavailableError``."""

    def _refuse(request: httpx.Request) -> httpx.Response:
        raise httpx.ConnectError("nobody home", request=request)

    _patch_transport(monkeypatch, _refuse)
    ollama = OllamaClient(base_url="http://x", model="m", timeout_s=1)

    with pytest.raises(LLMUnavailableError):
        ollama.chat_json("s", "u", _Schema)

View File

@@ -0,0 +1,90 @@
"""Unit tests for the hybrid LLM header extractor / merger."""
from __future__ import annotations
from datetime import date
import pytest
from pydantic import BaseModel
from ocr_sprint.llm.client import LLMUnavailableError, OllamaClient
from ocr_sprint.llm.extractor import LLMHeaderResult, _merge, llm_fill_header
from ocr_sprint.schemas.extraction import HeaderFields
class _StubClient(OllamaClient):
    """In-memory stand-in for ``OllamaClient`` that never performs HTTP.

    It is built with either a canned ``LLMHeaderResult`` to hand back or an
    exception instance to raise from ``chat_json``.
    """

    def __init__(self, payload: LLMHeaderResult | Exception) -> None:
        # The parent initializer is intentionally not called; the stub needs
        # none of the real client's configuration.
        self._payload = payload

    def chat_json(  # type: ignore[override]
        self, system: str, user: str, schema_cls: type[BaseModel]
    ) -> BaseModel:
        """Return the canned payload, or raise it when it is an exception."""
        canned = self._payload
        if not isinstance(canned, Exception):
            return canned
        raise canned
def test_merge_keeps_regex_when_present() -> None:
    """Fields already set by the regex layer win over conflicting LLM values."""
    regex_nomor = "Sprin/123/IV/2025/Reskrim"
    regex_tanggal = date(2025, 4, 21)
    from_regex = HeaderFields(nomor_sprint=regex_nomor, tanggal=regex_tanggal)
    from_llm = LLMHeaderResult(
        nomor_sprint="HALLUCINATED",
        tanggal=date(1999, 1, 1),
        perihal="ok",
    )

    merged = _merge(from_regex, from_llm)

    assert merged.nomor_sprint == regex_nomor
    assert merged.tanggal == regex_tanggal
    # perihal was empty on the regex side, so the LLM value is taken.
    assert merged.perihal == "ok"
def test_merge_fills_gaps() -> None:
    """With an entirely empty regex header, every LLM-provided field is adopted."""
    llm_values = {
        "nomor_sprint": "Sprin/9/IX/2024",
        "tanggal": date(2024, 9, 1),
        "satuan_penerbit": "Polres Bandung",
        "perihal": "Penyelidikan",
        "dasar": ["UU 2/2002", "Perkap 6/2017"],
    }
    empty_header = HeaderFields()  # nothing extracted by the regex layer

    merged = _merge(empty_header, LLMHeaderResult(**llm_values))

    assert merged.nomor_sprint == "Sprin/9/IX/2024"
    assert merged.tanggal == date(2024, 9, 1)
    assert merged.satuan_penerbit == "Polres Bandung"
    assert merged.perihal == "Penyelidikan"
    assert merged.dasar == ["UU 2/2002", "Perkap 6/2017"]
def test_llm_fill_header_returns_merged_when_client_succeeds() -> None:
    """A successful client reply is merged into the partially filled regex header."""
    llm_reply = LLMHeaderResult(
        satuan_penerbit="Polres Bandung",
        perihal="Penyelidikan",
        dasar=["UU 2/2002"],
    )
    # Only nomor_sprint comes from the regex layer; every other field is a gap.
    partial = HeaderFields(nomor_sprint="Sprin/1/I/2025")

    merged = llm_fill_header(
        raw_text="...",
        regex_header=partial,
        client=_StubClient(llm_reply),
    )

    assert merged is not None
    assert merged.nomor_sprint == "Sprin/1/I/2025"
    assert merged.satuan_penerbit == "Polres Bandung"
    assert merged.perihal == "Penyelidikan"
    assert merged.dasar == ["UU 2/2002"]
def test_llm_fill_header_returns_none_when_unavailable() -> None:
    """When the client raises ``LLMUnavailableError`` the extractor yields ``None``."""
    failing_client = _StubClient(LLMUnavailableError("server down"))

    result = llm_fill_header(
        raw_text="...",
        regex_header=HeaderFields(),
        client=failing_client,
    )

    assert result is None
def test_merge_does_not_overwrite_dasar_when_regex_has_it() -> None:
    """A non-empty regex ``dasar`` list is kept even if the LLM proposes another."""
    regex_dasar = ["UU 2/2002"]
    from_regex = HeaderFields(dasar=regex_dasar)
    from_llm = LLMHeaderResult(dasar=["something else", "more"])

    merged = _merge(from_regex, from_llm)

    assert merged.dasar == ["UU 2/2002"]
def test_llm_extractor_unused_argument_kept_silent() -> None:
    """Check that ``llm_fill_header`` accepts the keyword call shape used in this file.

    Previously this test only called ``pytest.importorskip`` on a module that
    is already imported at the top of the file, so it could never fail. It now
    binds the keyword arguments against the real signature, which raises
    ``TypeError`` if ``raw_text``, ``regex_header`` or ``client`` stops being
    accepted by keyword.
    """
    import inspect

    extractor = pytest.importorskip("ocr_sprint.llm.extractor")
    signature = inspect.signature(extractor.llm_fill_header)

    # ``bind`` only checks argument names and kinds, never values, so the
    # placeholder ``None`` client is enough here.
    bound = signature.bind(raw_text="...", regex_header=HeaderFields(), client=None)
    assert "client" in bound.arguments

View File

@@ -0,0 +1,171 @@
"""Orchestrator-level tests for the Phase 5 hybrid LLM wiring.
These tests stub out the heavy stages (ingest / preprocess / OCR / table)
so we can verify the *branching* behaviour around the LLM step without
booting Paddle.
"""
from __future__ import annotations
from datetime import date
import pytest
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import _header_has_gaps, run_pipeline
from ocr_sprint.schemas.document import SourceKind
from ocr_sprint.schemas.extraction import HeaderFields, ReviewFlag, Signatory
def test_header_has_gaps_detects_missing_fields() -> None:
    """``_header_has_gaps`` is False only when every header field is populated."""
    complete = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )

    assert _header_has_gaps(complete) is False
    assert _header_has_gaps(HeaderFields()) is True

    # Blanking a single field counts as a gap, whether it becomes None or an
    # empty list.
    without_perihal = complete.model_copy(update={"perihal": None})
    without_dasar = complete.model_copy(update={"dasar": []})
    assert _header_has_gaps(without_perihal) is True
    assert _header_has_gaps(without_dasar) is True
def _stub_pipeline_stages(
    monkeypatch: pytest.MonkeyPatch,
    *,
    raw_text: str,
    regex_header: HeaderFields,
) -> None:
    """Replace ingest -> ocr -> tables with cheap fakes so the orchestrator
    runs without Paddle / PyMuPDF.

    Args:
        monkeypatch: pytest fixture used to swap attributes on the
            orchestrator module.
        raw_text: Text of the single OCR line the fake OCR stage returns.
        regex_header: Header the stubbed ``extract_header`` returns; callers
            use it to control whether the header has gaps.
    """
    import numpy as np

    from ocr_sprint.pipeline import ocr as ocr_module
    from ocr_sprint.pipeline.ingest import IngestedPage

    # One blank white 100x100 RGB page is all the stubbed stages need.
    blank_image = np.full((100, 100, 3), 255, dtype=np.uint8)
    fake_page = IngestedPage(image=blank_image, page_index=0)
    fake_ocr_page = ocr_module.OCRPage(
        lines=[
            ocr_module.OCRLine(
                text=raw_text,
                confidence=0.95,
                box=((0, 0), (1, 0), (1, 1), (0, 1)),
            )
        ],
    )

    # Ingest / preprocessing / OCR: always one PDF page, images pass through
    # untouched, OCR yields the single canned line.
    monkeypatch.setattr(orch_module, "detect_source_kind", lambda _: SourceKind.PDF)
    monkeypatch.setattr(orch_module, "ingest", lambda *a, **k: [fake_page])
    monkeypatch.setattr(orch_module, "detect_and_correct", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "preprocess", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "run_ocr", lambda _image: fake_ocr_page)

    # No tables in these tests.
    monkeypatch.setattr(orch_module, "run_table_extraction", lambda _img: [])
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _tables: [])

    # Header, signatory and validation are stubbed as well: ``extract_header``
    # returns the caller-supplied header so each test controls the gap state,
    # the signatory is empty, and validation reports nothing.
    monkeypatch.setattr(orch_module, "extract_header", lambda _text: regex_header)
    monkeypatch.setattr(orch_module, "find_signatory", lambda _text: Signatory())
    monkeypatch.setattr(orch_module, "validate_extraction", lambda _result: [])
def test_orchestrator_skips_llm_when_disabled(monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``LLM_ENABLED=false`` the LLM hook is never called and no LLM flag is set."""
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Settings are cached; drop the cache so the patched env var is re-read.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(),  # all gaps
        )
        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            called["n"] += 1
            return None

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        result = run_pipeline(b"%PDF-1.4\n%fake")

        assert called["n"] == 0
        assert ReviewFlag.LLM_FALLBACK not in result.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in result.result.review_flags
    finally:
        # Drop the settings cached under the patched environment so later
        # tests do not inherit them once monkeypatch restores the env var.
        get_settings.cache_clear()
def test_orchestrator_skips_llm_when_header_complete(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the LLM enabled but a complete regex header, the LLM hook is not called."""
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Settings are cached; drop the cache so the patched env var is re-read.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(
            monkeypatch,
            raw_text="dummy",
            regex_header=HeaderFields(
                nomor_sprint="Sprin/1/I/2025",
                tanggal=date(2025, 1, 1),
                satuan_penerbit="Polres X",
                perihal="ok",
                dasar=["UU 2/2002"],
            ),
        )
        called = {"n": 0}

        def _trip(*_args: object, **_kwargs: object) -> None:
            called["n"] += 1
            return None

        monkeypatch.setattr(orch_module, "llm_fill_header", _trip)

        run_pipeline(b"%PDF-1.4\n%fake")

        assert called["n"] == 0
    finally:
        # Without this, the cached settings built with LLM_ENABLED=true would
        # outlive the test and could switch the LLM on for unrelated tests.
        get_settings.cache_clear()
def test_orchestrator_calls_llm_and_marks_fallback(monkeypatch: pytest.MonkeyPatch) -> None:
    """With gaps and a responsive LLM, the filled header is used and LLM_FALLBACK is set."""
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Settings are cached; drop the cache so the patched env var is re-read.
    get_settings.cache_clear()
    try:
        regex_partial = HeaderFields(nomor_sprint="Sprin/1/I/2025")  # rest missing
        _stub_pipeline_stages(monkeypatch, raw_text="dummy text", regex_header=regex_partial)

        def _llm(_raw: str, header: HeaderFields, **_: object) -> HeaderFields:
            # Stand-in for the real LLM step: fill three of the missing fields.
            return header.model_copy(
                update={
                    "satuan_penerbit": "Polres Bandung",
                    "perihal": "Penyelidikan",
                    "dasar": ["UU 2/2002"],
                }
            )

        monkeypatch.setattr(orch_module, "llm_fill_header", _llm)

        out = run_pipeline(b"%PDF-1.4\n%fake")

        assert out.result.header.satuan_penerbit == "Polres Bandung"
        assert out.result.header.perihal == "Penyelidikan"
        assert ReviewFlag.LLM_FALLBACK in out.result.review_flags
        assert ReviewFlag.LLM_UNAVAILABLE not in out.result.review_flags
    finally:
        # Without this, the cached settings built with LLM_ENABLED=true would
        # outlive the test and could switch the LLM on for unrelated tests.
        get_settings.cache_clear()
def test_orchestrator_marks_unavailable_when_llm_returns_none(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With gaps and an LLM step that returns ``None``, LLM_UNAVAILABLE is flagged."""
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Settings are cached; drop the cache so the patched env var is re-read.
    get_settings.cache_clear()
    try:
        _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=HeaderFields())
        monkeypatch.setattr(orch_module, "llm_fill_header", lambda *_a, **_k: None)

        out = run_pipeline(b"%PDF-1.4\n%fake")

        assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
        assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
    finally:
        # Without this, the cached settings built with LLM_ENABLED=true would
        # outlive the test and could switch the LLM on for unrelated tests.
        get_settings.cache_clear()