Adds a small Ollama HTTP client (httpx-based, no extra runtime deps),
prompt builders, and a hybrid header extractor that runs *after* the
deterministic regex layer. The merger never overwrites a regex-filled
field — the LLM only fills gaps. If LLM_ENABLED=false (the default), or
the Ollama server is unreachable, the pipeline degrades gracefully:
- LLM_ENABLED=false -> no LLM call at all, no flag.
- LLM_ENABLED=true, header complete -> no LLM call.
- LLM_ENABLED=true, header has gaps, LLM responded ok -> merge + LLM_FALLBACK flag (review hint).
- LLM_ENABLED=true, header has gaps, LLM unavailable -> keep regex result + LLM_UNAVAILABLE flag.
Default model qwen2.5:1.5b on http://localhost:11434 — chosen for CPU
throughput (~5-15s per call) at acceptable accuracy. The LLM only fills
the *header* (nomor, tanggal, satuan, perihal, dasar). Personnel rows
stay with PP-Structure since that's more accurate and doesn't need LLM.
Tests:
- test_llm_client.py: httpx MockTransport-driven tests for the wire
format, error paths (HTTP 5xx, malformed JSON, missing envelope,
ConnectError), and request shape.
- test_llm_extractor.py: merge policy + None-on-unavailable behaviour.
- test_orchestrator_llm.py: end-to-end orchestrator wiring with stubs
for ingest/preprocess/OCR/table — verifies LLM is skipped when
disabled, skipped when header is complete, called and flagged when
gaps exist, and marked unavailable when the client returns None.
162 unit tests pass total (was 146).
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
91 lines
3.2 KiB
Python
91 lines
3.2 KiB
Python
"""Unit tests for the hybrid LLM header extractor / merger."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
|
|
import pytest
|
|
from pydantic import BaseModel
|
|
|
|
from ocr_sprint.llm.client import LLMUnavailableError, OllamaClient
|
|
from ocr_sprint.llm.extractor import LLMHeaderResult, _merge, llm_fill_header
|
|
from ocr_sprint.schemas.extraction import HeaderFields
|
|
|
|
|
|
class _StubClient(OllamaClient):
    """In-memory stand-in for ``OllamaClient`` that never performs HTTP.

    The stub is primed with either a ready-made ``LLMHeaderResult`` or an
    exception instance. ``chat_json`` hands back the former and raises the
    latter, ignoring its arguments.
    """

    def __init__(self, payload: LLMHeaderResult | Exception) -> None:
        # The parent initialiser is deliberately not invoked: the stub needs
        # none of the real client's configuration.
        self._payload = payload

    def chat_json(  # type: ignore[override]
        self, system: str, user: str, schema_cls: type[BaseModel]
    ) -> BaseModel:
        """Return the canned payload, or raise it when it is an exception."""
        canned = self._payload
        if not isinstance(canned, Exception):
            return canned
        raise canned
|
|
|
|
|
|
def test_merge_keeps_regex_when_present() -> None:
    """Fields already set by the regex layer win over conflicting LLM values."""
    from_regex = HeaderFields(nomor_sprint="Sprin/123/IV/2025/Reskrim", tanggal=date(2025, 4, 21))
    from_llm = LLMHeaderResult(nomor_sprint="HALLUCINATED", tanggal=date(1999, 1, 1), perihal="ok")

    merged = _merge(from_regex, from_llm)

    # Regex-provided values are untouched by the LLM's differing answers.
    assert merged.nomor_sprint == "Sprin/123/IV/2025/Reskrim"
    assert merged.tanggal == date(2025, 4, 21)
    # A field the regex result left unset is taken from the LLM result.
    assert merged.perihal == "ok"
|
|
|
|
|
|
def test_merge_fills_gaps() -> None:
    """With a default-constructed regex header, every LLM value is adopted."""
    empty_header = HeaderFields()  # nothing supplied by the regex layer
    llm_answer = LLMHeaderResult(
        nomor_sprint="Sprin/9/IX/2024",
        tanggal=date(2024, 9, 1),
        satuan_penerbit="Polres Bandung",
        perihal="Penyelidikan",
        dasar=["UU 2/2002", "Perkap 6/2017"],
    )

    merged = _merge(empty_header, llm_answer)

    assert merged.nomor_sprint == "Sprin/9/IX/2024"
    assert merged.tanggal == date(2024, 9, 1)
    assert merged.satuan_penerbit == "Polres Bandung"
    assert merged.perihal == "Penyelidikan"
    assert merged.dasar == ["UU 2/2002", "Perkap 6/2017"]
|
|
|
|
|
|
def test_llm_fill_header_returns_merged_when_client_succeeds() -> None:
    """A successful client call yields a header merging regex and LLM values."""
    # Only nomor_sprint comes from the regex layer; the rest is left unset.
    partial_header = HeaderFields(nomor_sprint="Sprin/1/I/2025")
    canned_reply = LLMHeaderResult(
        satuan_penerbit="Polres Bandung",
        perihal="Penyelidikan",
        dasar=["UU 2/2002"],
    )
    client = _StubClient(canned_reply)

    filled = llm_fill_header(raw_text="...", regex_header=partial_header, client=client)

    assert filled is not None
    assert filled.nomor_sprint == "Sprin/1/I/2025"
    assert filled.satuan_penerbit == "Polres Bandung"
    assert filled.perihal == "Penyelidikan"
    assert filled.dasar == ["UU 2/2002"]
|
|
|
|
|
|
def test_llm_fill_header_returns_none_when_unavailable() -> None:
    """``llm_fill_header`` returns ``None`` when the client raises ``LLMUnavailableError``."""
    outage = LLMUnavailableError("server down")
    failing_client = _StubClient(outage)

    result = llm_fill_header(raw_text="...", regex_header=HeaderFields(), client=failing_client)

    assert result is None
|
|
|
|
|
|
def test_merge_does_not_overwrite_dasar_when_regex_has_it() -> None:
    """A ``dasar`` list from the regex layer is kept over the LLM's list."""
    from_regex = HeaderFields(dasar=["UU 2/2002"])
    from_llm = LLMHeaderResult(dasar=["something else", "more"])

    merged = _merge(from_regex, from_llm)

    assert merged.dasar == ["UU 2/2002"]
|
|
|
|
|
|
def test_llm_extractor_unused_argument_kept_silent() -> None:
    """Check that ``llm_fill_header`` accepts its arguments by keyword.

    The other tests in this module call the function as
    ``llm_fill_header(raw_text=..., regex_header=..., client=...)``. This
    test pins that calling convention by inspecting the signature, so a
    rename or a switch to positional-only parameters is caught here.
    Previously the test only imported the module and asserted nothing.
    """
    import inspect  # local import: only this test needs it

    # importorskip returns the imported module, so the signature is read
    # from the same object the import check succeeded on.
    extractor = pytest.importorskip("ocr_sprint.llm.extractor")
    params = inspect.signature(extractor.llm_fill_header).parameters

    for name in ("raw_text", "regex_header", "client"):
        assert name in params, f"llm_fill_header has no parameter {name!r}"
        # Positional-only parameters would reject the keyword call style.
        assert params[name].kind is not inspect.Parameter.POSITIONAL_ONLY
|