Fix personnel extraction + header bugs on real Polres Cimahi sprint
This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related hardening changes (items 5-6):
1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
INDONESIA' letterhead line instead of the most-specific issuing unit
(e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
scan for each level independently and return the deepest available.
2. find_dasar_list dropped numbered items when OCR put the marker on
its own line ("1.\n Undang-Undang ..."). Refactored into
_collect_numbered_section that buffers a bare-number line and uses
the next non-empty line as the body. Also reused for the new
find_untuk_list which extracts the previously-empty 'untuk' bullets.
3. find_perihal returned None for documents that use 'Pertimbangan'
(very common in Polres-level sprint), forcing the LLM to guess.
Added a regex fallback that picks up the first line under a
'Pertimbangan' label so we keep extraction deterministic.
4. Personnel rows were emitted with only nama populated when
PP-Structure detected a table but the column mapper degraded.
Added a text-based fallback (extract_personnel_from_text) that
scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
Use of the fallback is surfaced for review by raising the new PERSONNEL_TEXT_FALLBACK flag.
5. Validation now flags rows with neither pangkat nor nrp as
INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
even when individual nrp/pangkat checks pass on empty values.
6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).
Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
@@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none(
|
||||
out = run_pipeline(b"%PDF-1.4\n%fake")
|
||||
assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
|
||||
assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
|
||||
|
||||
|
||||
def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Name-only PP-Structure rows must be replaced via the text fallback.

    The stubbed ``extract_personnel`` returns rows that carry only ``nama``.
    The pipeline is expected to raise ``ReviewFlag.PERSONNEL_TEXT_FALLBACK``
    and to return personnel rows whose ``pangkat`` and ``nrp`` are filled in
    from the raw OCR text.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Presumably drops the cached settings so the env override above is seen.
    get_settings.cache_clear()

    from ocr_sprint.schemas.personnel import PersonnelEntry

    # Raw OCR text in which each person spans several lines: a bare number
    # marker, the name, "<rank> / <NRP>", then the unit.
    ocr_text = (
        "DAFTAR PERSONIL\n"
        "1.\n"
        "SRI WAHYUNI\n"
        "AIPTU / 75070328\n"
        "INTELKAM POLRES CIMAHI\n"
        "2.\n"
        "AGUNG LUKMAN\n"
        "BRIPTU / 99030245\n"
        "SAT INTELKAM\n"
    )

    # PP-Structure "succeeded" but produced rows with nothing except the
    # name (the failure mode seen on the real Polres Cimahi document).
    name_only_rows = [
        PersonnelEntry(nama=person) for person in ("SRI WAHYUNI", "AGUNG LUKMAN")
    ]

    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres Cimahi",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    _stub_pipeline_stages(monkeypatch, raw_text=ocr_text, regex_header=header)

    def _broken_extract_personnel(_t):
        """Stand-in for ``extract_personnel`` that yields the name-only rows."""
        return name_only_rows

    # Installed after the generic stubs so this override is the one in effect.
    monkeypatch.setattr(orch_module, "extract_personnel", _broken_extract_personnel)

    outcome = run_pipeline(b"%PDF-1.4\n%fake")
    people = outcome.result.personel

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in outcome.result.review_flags
    # Every fallback row must carry both pangkat and nrp -- that is the whole
    # point of the fallback path.
    assert all(person.pangkat and person.nrp for person in people)
    assert {person.pangkat for person in people} == {"AIPTU", "BRIPTU"}
|
||||
|
||||
|
||||
def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Complete PP-Structure rows must pass through without the text fallback.

    When the stubbed ``extract_personnel`` returns rows that all have
    ``pangkat`` and ``nrp``, the pipeline must not raise
    ``ReviewFlag.PERSONNEL_TEXT_FALLBACK`` and must return those rows in
    their original order.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    # Presumably drops the cached settings so the env override above is seen.
    get_settings.cache_clear()

    from ocr_sprint.schemas.personnel import PersonnelEntry

    complete_rows = [
        PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
        PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
        PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
    ]

    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    # The raw text is deliberately meaningless: if the fallback ran against
    # it, the nrp assertion below could not hold.
    _stub_pipeline_stages(
        monkeypatch,
        raw_text="ignored — should not be parsed",
        regex_header=header,
    )

    def _healthy_extract_personnel(_t):
        """Stand-in for ``extract_personnel`` that yields the complete rows."""
        return complete_rows

    monkeypatch.setattr(orch_module, "extract_personnel", _healthy_extract_personnel)

    outcome = run_pipeline(b"%PDF-1.4\n%fake")

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in outcome.result.review_flags
    returned_nrps = [person.nrp for person in outcome.result.personel]
    assert returned_nrps == ["11111111", "22222222", "33333333"]
|
||||
|
||||
Reference in New Issue
Block a user