Files
OCR-SPRIN-SERVICE/tests/unit/test_orchestrator_llm.py
Devin AI 58a2bf2648 Fix personnel extraction + header bugs on real Polres Cimahi sprint
This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related improvements (items 5-6):

1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
   INDONESIA' letterhead line instead of the most-specific issuing unit
   (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
   scan for each level independently and return the deepest available.

2. find_dasar_list dropped numbered items when OCR put the marker on
   its own line ("1.\n Undang-Undang ..."). Refactored into
   _collect_numbered_section that buffers a bare-number line and uses
   the next non-empty line as the body. Also reused for the new
   find_untuk_list which extracts the previously-empty 'untuk' bullets.

3. find_perihal returned None for documents that use 'Pertimbangan'
   (very common in Polres-level sprint), forcing the LLM to guess.
   Added a regex fallback that picks up the first line under a
   'Pertimbangan' label so we keep extraction deterministic.

4. Personnel rows were emitted with only nama populated when
   PP-Structure detected a table but the column mapper degraded.
   Added a text-based fallback (extract_personnel_from_text) that
   scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
   the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
   Documents that take this path are marked for review by raising the new PERSONNEL_TEXT_FALLBACK flag.

5. Validation now flags rows with neither pangkat nor nrp as
   INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
   even when individual nrp/pangkat checks pass on empty values.

6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).

Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-26 05:35:42 +00:00

261 lines
8.9 KiB
Python

"""Orchestrator-level tests for the Phase 5 hybrid LLM wiring.
These tests stub out the heavy stages (ingest / preprocess / OCR / table)
so we can verify the *branching* behaviour around the LLM step without
booting Paddle.
"""
from __future__ import annotations
from datetime import date
import pytest
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import _header_has_gaps, run_pipeline
from ocr_sprint.schemas.document import SourceKind
from ocr_sprint.schemas.extraction import HeaderFields, ReviewFlag, Signatory
def test_header_has_gaps_detects_missing_fields() -> None:
    """``_header_has_gaps`` is False only for a fully populated header.

    An empty header, a header whose ``perihal`` is ``None`` and a header
    whose ``dasar`` list is empty must each be reported as having gaps.
    """
    complete_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )

    # (header under test, expected gap verdict)
    cases = [
        (complete_header, False),
        (HeaderFields(), True),
        (complete_header.model_copy(update={"perihal": None}), True),
        (complete_header.model_copy(update={"dasar": []}), True),
    ]
    for header, expected in cases:
        assert _header_has_gaps(header) is expected
def _stub_pipeline_stages(
    monkeypatch: pytest.MonkeyPatch,
    *,
    raw_text: str,
    regex_header: HeaderFields,
) -> None:
    """Replace the orchestrator's pipeline stages with cheap in-memory fakes.

    Every stage is patched by name on ``orch_module`` so that
    ``run_pipeline`` can be exercised without Paddle / PyMuPDF. The LLM hook
    (``llm_fill_header``) is deliberately left untouched; each test installs
    its own.

    Args:
        monkeypatch: pytest fixture used to install the stubs.
        raw_text: Text of the single OCR line returned by the fake OCR stage.
        regex_header: Header returned by the stubbed ``extract_header``; it
            lets the caller decide which header fields are "missing".
    """
    import numpy as np

    from ocr_sprint.pipeline import ocr as ocr_module
    from ocr_sprint.pipeline.ingest import IngestedPage

    # One all-255 100x100 three-channel image stands in for the ingested page.
    blank_image = np.full((100, 100, 3), 255, dtype=np.uint8)
    fake_page = IngestedPage(image=blank_image, page_index=0)

    # The OCR result is a single line carrying the caller-supplied text.
    fake_ocr_page = ocr_module.OCRPage(
        lines=[
            ocr_module.OCRLine(
                text=raw_text,
                confidence=0.95,
                box=((0, 0), (1, 0), (1, 1), (0, 1)),
            )
        ],
    )

    # Ingest / image-correction / OCR stages: pass the fake page straight through.
    monkeypatch.setattr(orch_module, "detect_source_kind", lambda _: SourceKind.PDF)
    monkeypatch.setattr(orch_module, "ingest", lambda *a, **k: [fake_page])
    monkeypatch.setattr(orch_module, "detect_and_correct", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "preprocess", lambda image, _cfg: image)
    monkeypatch.setattr(orch_module, "run_ocr", lambda _image: fake_ocr_page)

    # No tables (and therefore no personnel rows) by default; tests that need
    # personnel rows re-patch ``extract_personnel`` after calling this helper.
    monkeypatch.setattr(orch_module, "run_table_extraction", lambda _img: [])
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _tables: [])

    # Header extraction, signatory lookup and validation are all stubbed:
    # the header is caller-controlled so each test can choose its gap state,
    # the signatory is empty, and validation reports nothing.
    monkeypatch.setattr(orch_module, "extract_header", lambda _text: regex_header)
    monkeypatch.setattr(orch_module, "find_signatory", lambda _text: Signatory())
    monkeypatch.setattr(orch_module, "validate_extraction", lambda _result: [])
def test_orchestrator_skips_llm_when_disabled(monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``LLM_ENABLED=false`` the LLM hook is never called.

    The stubbed header is completely empty (every field is a gap), so the
    only thing keeping the LLM step from running is the disabled setting.
    Neither LLM-related review flag may be raised.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment variable is re-read.
    get_settings.cache_clear()

    _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=HeaderFields())

    llm_invocations: list[object] = []

    def _record_llm_call(*call_args: object, **_call_kwargs: object) -> None:
        llm_invocations.append(call_args)

    monkeypatch.setattr(orch_module, "llm_fill_header", _record_llm_call)

    outcome = run_pipeline(b"%PDF-1.4\n%fake")

    assert len(llm_invocations) == 0
    for unexpected_flag in (ReviewFlag.LLM_FALLBACK, ReviewFlag.LLM_UNAVAILABLE):
        assert unexpected_flag not in outcome.result.review_flags
def test_orchestrator_skips_llm_when_header_complete(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the LLM enabled but no header gaps, the LLM hook is never called."""
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment variable is re-read.
    get_settings.cache_clear()

    gap_free_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=gap_free_header)

    llm_invocations: list[object] = []

    def _record_llm_call(*call_args: object, **_call_kwargs: object) -> None:
        llm_invocations.append(call_args)

    monkeypatch.setattr(orch_module, "llm_fill_header", _record_llm_call)

    run_pipeline(b"%PDF-1.4\n%fake")

    assert len(llm_invocations) == 0
def test_orchestrator_calls_llm_and_marks_fallback(monkeypatch: pytest.MonkeyPatch) -> None:
    """An enabled LLM that fills header gaps yields ``LLM_FALLBACK``.

    The regex header only carries ``nomor_sprint``; the fake LLM supplies
    ``satuan_penerbit``, ``perihal`` and ``dasar``. The pipeline result must
    expose the LLM-provided values and must not raise ``LLM_UNAVAILABLE``.
    """
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment variable is re-read.
    get_settings.cache_clear()

    # Only the sprint number was recovered by the regex stage.
    header_with_gaps = HeaderFields(nomor_sprint="Sprin/1/I/2025")
    _stub_pipeline_stages(
        monkeypatch,
        raw_text="dummy text",
        regex_header=header_with_gaps,
    )

    def _fake_llm_fill(_raw: str, header: HeaderFields, **_: object) -> HeaderFields:
        llm_supplied = {
            "satuan_penerbit": "Polres Bandung",
            "perihal": "Penyelidikan",
            "dasar": ["UU 2/2002"],
        }
        return header.model_copy(update=llm_supplied)

    monkeypatch.setattr(orch_module, "llm_fill_header", _fake_llm_fill)

    outcome = run_pipeline(b"%PDF-1.4\n%fake")

    assert outcome.result.header.satuan_penerbit == "Polres Bandung"
    assert outcome.result.header.perihal == "Penyelidikan"
    assert ReviewFlag.LLM_FALLBACK in outcome.result.review_flags
    assert ReviewFlag.LLM_UNAVAILABLE not in outcome.result.review_flags
def test_orchestrator_marks_unavailable_when_llm_returns_none(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An LLM hook that returns ``None`` yields ``LLM_UNAVAILABLE``, not ``LLM_FALLBACK``."""
    monkeypatch.setenv("LLM_ENABLED", "true")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment variable is re-read.
    get_settings.cache_clear()

    # An empty header guarantees there are gaps for the LLM step to act on.
    _stub_pipeline_stages(monkeypatch, raw_text="dummy", regex_header=HeaderFields())

    def _llm_returns_nothing(*_a: object, **_k: object) -> None:
        return None

    monkeypatch.setattr(orch_module, "llm_fill_header", _llm_returns_nothing)

    outcome = run_pipeline(b"%PDF-1.4\n%fake")
    raised_flags = outcome.result.review_flags

    assert ReviewFlag.LLM_UNAVAILABLE in raised_flags
    assert ReviewFlag.LLM_FALLBACK not in raised_flags
def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Name-only PP-Structure rows must trigger the text-based fallback.

    When the table stage yields low-quality rows (only ``nama`` filled), the
    orchestrator is expected to re-extract personnel from the raw OCR text
    and raise the ``personnel_text_fallback`` review flag.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment variable is re-read.
    get_settings.cache_clear()

    raw_text = (
        "DAFTAR PERSONIL\n"
        "1.\n"
        "SRI WAHYUNI\n"
        "AIPTU / 75070328\n"
        "INTELKAM POLRES CIMAHI\n"
        "2.\n"
        "AGUNG LUKMAN\n"
        "BRIPTU / 99030245\n"
        "SAT INTELKAM\n"
    )

    from ocr_sprint.schemas.personnel import PersonnelEntry

    # PP-Structure "succeeded" but produced rows with nothing except the
    # name — the failure mode seen on the real Polres Cimahi document.
    name_only_rows = [
        PersonnelEntry(nama=person_name)
        for person_name in ("SRI WAHYUNI", "AGUNG LUKMAN")
    ]

    complete_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres Cimahi",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    _stub_pipeline_stages(monkeypatch, raw_text=raw_text, regex_header=complete_header)

    # Re-patch after the helper so the degraded rows replace its empty default.
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: name_only_rows)

    outcome = run_pipeline(b"%PDF-1.4\n%fake")

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in outcome.result.review_flags
    # Rows coming out of the fallback must carry both rank and NRP.
    assert all(row.pangkat and row.nrp for row in outcome.result.personel)
    assert {row.pangkat for row in outcome.result.personel} == {"AIPTU", "BRIPTU"}
def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Healthy PP-Structure rows must be kept as-is.

    When the table stage returns rows that all carry rank and NRP, the text
    fallback must not replace them and its review flag must not be raised.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Drop any cached settings so the patched environment variable is re-read.
    get_settings.cache_clear()

    from ocr_sprint.schemas.personnel import PersonnelEntry

    # (pangkat, nrp, nama) for each well-formed row.
    row_specs = (
        ("AIPTU", "11111111", "A"),
        ("BRIPTU", "22222222", "B"),
        ("BRIPDA", "33333333", "C"),
    )
    well_formed_rows = [
        PersonnelEntry(pangkat=rank, nrp=number, nama=name)
        for rank, number, name in row_specs
    ]

    complete_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    _stub_pipeline_stages(
        monkeypatch,
        raw_text="ignored — should not be parsed",
        regex_header=complete_header,
    )

    # Re-patch after the helper so the healthy rows replace its empty default.
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: well_formed_rows)

    outcome = run_pipeline(b"%PDF-1.4\n%fake")

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in outcome.result.review_flags
    kept_nrps = [row.nrp for row in outcome.result.personel]
    assert kept_nrps == ["11111111", "22222222", "33333333"]