Fix personnel extraction + header bugs on real Polres Cimahi sprint

This makes 6 changes prompted by a real Polres Cimahi SPRIN PDF (four
extraction bug fixes, one new validation flag, one new rank variant):

1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
   INDONESIA' letterhead line instead of the most-specific issuing unit
   (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
   scan for each level independently and return the deepest available.

2. find_dasar_list dropped numbered items when OCR put the marker on
   its own line ("1.\n Undang-Undang ..."). Refactored into
   _collect_numbered_section that buffers a bare-number line and uses
   the next non-empty line as the body. Also reused for the new
   find_untuk_list which extracts the previously-empty 'untuk' bullets.

3. find_perihal returned None for documents that use 'Pertimbangan'
   (very common in Polres-level sprint), forcing the LLM to guess.
   Added a regex fallback that picks up the first line under a
   'Pertimbangan' label so we keep extraction deterministic.

4. Personnel rows were emitted with only nama populated when
   PP-Structure detected a table but the column mapper degraded.
   Added a text-based fallback (extract_personnel_from_text) that
   scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
   the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
   When the fallback is used, the document is marked for review by
   raising the new PERSONNEL_TEXT_FALLBACK flag.

5. Validation now flags rows with neither pangkat nor nrp as
   INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
   even when individual nrp/pangkat checks pass on empty values.

6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).

Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
Devin AI
2026-04-26 05:35:42 +00:00
parent dce77e80e1
commit 58a2bf2648
11 changed files with 747 additions and 39 deletions

View File

@@ -22,7 +22,7 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
# Bintara
"BRIPDA": ("BRIPDA",),
"BRIPTU": ("BRIPTU",),
"BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"),
"BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"),
"BRIPKA": ("BRIPKA",),
"AIPDA": ("AIPDA",),
"AIPTU": ("AIPTU",),

View File

@@ -22,6 +22,14 @@ _FLAG_PENALTY: dict[ReviewFlag, float] = {
ReviewFlag.UNKNOWN_PANGKAT: 0.05,
ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
ReviewFlag.DATE_PARSE_FAILED: 0.10,
# Text-based personnel fallback is a recoverable degradation: rank/NRP
# were extracted via regex from raw OCR rather than from a parsed table
# grid. Worth flagging for review but not catastrophic.
ReviewFlag.PERSONNEL_TEXT_FALLBACK: 0.05,
# An incomplete personnel row (no pangkat AND no nrp) is a strong
# signal something went wrong. Penalise heavily so the document
# routes to needs_review even if the rest of the extraction is fine.
ReviewFlag.INCOMPLETE_PERSONNEL_ROW: 0.15,
}
OCR_WEIGHT = 0.6

View File

@@ -0,0 +1,203 @@
"""Text-based fallback personnel extractor.
PP-Structure (Phase 3) is the primary path for personnel rows because it
preserves the table grid. But PP-Structure can fail in two ways on real
sprint scans:
1. The table is not detected at all (low-quality scan, watermark, atypical
layout) — `extract_personnel` returns an empty list.
2. The table IS detected but the column mapping is too sparse, so each row
collapses to a single ``nama`` cell with all other fields ``None``. This
is what was observed on a real Polres Cimahi sprint where the OCR
produced 24 rows with only ``nama`` populated.
This module provides a regex/heuristic fallback that operates directly on
the flat OCR text. It is deliberately conservative: a row must have BOTH a
recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
generate the kind of "name-only" rows that motivated the fallback in the
first place.
"""
from __future__ import annotations
import re
from ocr_sprint.data.master_pangkat import (
PANGKAT_VARIANTS,
is_valid_pangkat,
normalize_pangkat,
)
from ocr_sprint.schemas.personnel import PersonnelEntry
# Build a single alternation of all known rank tokens (longest first so multi-
# word ranks like "KOMBES POL" win over the single-word "KOMBES").
_RANK_TOKENS: tuple[str, ...] = tuple(
sorted(
{variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
key=lambda v: -len(v),
)
)
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
# A rank token followed by an 8-digit NRP with nothing but separator
# characters between them: '/', '-', '.', ',', ':' or whitespace. Any other
# text between the rank and the NRP prevents a match. Rank token must be
# word-bounded so "BRIPDA" doesn't match inside e.g. "ABRIPDA-style" text.
_RE_RANK_NRP_LINE = re.compile(
rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
re.IGNORECASE,
)
# A bare row number marker like "1." or "12)". OCR often puts it on its own
# line in tabular layouts.
_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
# Lines that should never be interpreted as a personnel name. These are
# section headers, OCR garbage anchors, and column header tokens.
_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = (
"DASAR",
"PERIHAL",
"PERTIMBANGAN",
"DIPERINTAHKAN",
"KEPADA",
"UNTUK",
"TEMBUSAN",
"DIKELUARKAN",
"PADA TANGGAL",
"SELESAI",
"DAFTAR",
"LAMPIRAN",
"NOMOR",
"TANGGAL",
"KEPOLISIAN",
"DAERAH",
"RESOR",
"SEKTOR",
"MABES",
"SURAT PERINTAH",
"NRP",
"NIP",
"PANGKAT",
"JABATAN",
"NAMA",
"KETERANGAN",
"KET",
"NO",
)
# Minimal sanity check for a name: it must contain at least one ASCII
# letter, so pure-numeric or pure-symbol lines are rejected. No other
# character-class restriction is applied by this pattern.
_RE_NAME_OK = re.compile(r"[A-Za-z]")
def _is_plausible_name(line: str) -> bool:
    """Return True iff ``line`` could plausibly be a personnel name.

    A line is rejected when it:

    * is empty or contains no ASCII letter (this already covers bare row
      markers such as ``"1."`` / ``"2)"`` and any other digit/punctuation-only
      line, so no separate check for those is needed);
    * starts with a blocklisted section/column-header keyword *as a whole
      word*; or
    * itself carries a rank + NRP pair.

    The blocklist is matched on a word boundary: the keyword must be followed
    by end-of-line or a non-letter. A plain ``startswith`` would discard real
    names that merely share leading letters with a keyword, e.g.
    ``"NOVITA SARI"`` (``"NO"``) or ``"KETUT ARYA"`` (``"KET"``).
    """
    stripped = line.strip()
    if not stripped or not _RE_NAME_OK.search(stripped):
        return False
    upper = stripped.upper()
    for prefix in _NAME_BLOCKLIST_PREFIXES:
        # The one-character slice is "" at end-of-line, and "".isalpha() is
        # False, so an exact keyword match is still blocked.
        if upper.startswith(prefix) and not upper[len(prefix) : len(prefix) + 1].isalpha():
            return False
    return not _RE_RANK_NRP_LINE.search(stripped)
def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Collect up to three follow-up lines after a rank+NRP line as the jabatan.

    Only ``lines[idx + 1 : idx + 4]`` is inspected. Collection stops at the
    next rank+NRP line, the next bare row-number line, a blank line once
    something has been collected, or a line that starts with a blocklisted
    section/column-header keyword.

    The keyword must match as a whole word (followed by end-of-line or a
    non-letter). A plain ``startswith`` would cut off legitimate positions
    such as ``"KETUA TIM"`` just because they begin with ``"KET"``.

    Returns the collected lines joined with single spaces, or ``None`` when
    nothing was collected.
    """
    parts: list[str] = []
    for raw in lines[idx + 1 : idx + 4]:
        candidate = raw.strip()
        if not candidate:
            # Leading blanks are skipped; a blank after content ends the cell.
            if parts:
                break
            continue
        if _RE_RANK_NRP_LINE.search(candidate) or _RE_ROW_NUMBER.match(candidate):
            break
        upper = candidate.upper()
        if any(
            upper.startswith(p) and not upper[len(p) : len(p) + 1].isalpha()
            for p in _NAME_BLOCKLIST_PREFIXES
        ):
            break
        parts.append(candidate)
    if not parts:
        return None
    # Normalise internal whitespace runs left over from OCR.
    return " ".join(" ".join(parts).split()) or None
def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Recover personnel rows from flat OCR text, anchored on rank+NRP lines.

    For every line on which ``_RE_RANK_NRP_LINE`` finds a rank token and an
    8-digit NRP:

    * the rank is normalised; the line is skipped if the result is empty or
      not a valid pangkat;
    * the name is the nearest plausible name line among the five lines above
      that has not already been assigned to an earlier row (``None`` if there
      is none);
    * the jabatan comes from ``_following_jabatan`` on the lines below.

    Only the first rank+NRP match per line is used, and each name line is
    consumed at most once. ``no``, ``jabatan_sprint`` and ``keterangan`` are
    always ``None``.
    """
    all_lines = raw_text.splitlines()
    used_name_lines: set[int] = set()
    entries: list[PersonnelEntry] = []

    for pos, text_line in enumerate(all_lines):
        hit = _RE_RANK_NRP_LINE.search(text_line.strip())
        if hit is None:
            continue

        rank = normalize_pangkat(hit.group("rank"))
        if not (rank and is_valid_pangkat(rank)):
            continue

        # Walk upwards (nearest first) through at most five preceding lines.
        name_pos = next(
            (
                above
                for above in range(pos - 1, max(pos - 6, -1), -1)
                if above not in used_name_lines
                and _is_plausible_name(all_lines[above].strip())
            ),
            None,
        )
        if name_pos is None:
            person_name: str | None = None
        else:
            used_name_lines.add(name_pos)
            person_name = all_lines[name_pos].strip()

        entries.append(
            PersonnelEntry(
                no=None,
                pangkat=rank,
                nrp=hit.group("nrp"),
                nama=person_name,
                jabatan_dinas=_following_jabatan(all_lines, pos),
                jabatan_sprint=None,
                keterangan=None,
            )
        )

    return entries
def is_low_quality(rows: list[PersonnelEntry]) -> bool:
    """Tell whether the structured table output is too sparse to trust.

    A row counts as informative when it has a truthy ``pangkat`` or ``nrp``.
    The result is ``True`` for an empty list, or when fewer than 30% of the
    rows are informative; exactly 30% is still considered acceptable.
    """
    if not rows:
        return True
    informative = sum(bool(row.pangkat or row.nrp) for row in rows)
    # Below the 30% mark we assume the column mapper collapsed everything
    # into ``nama`` and the caller should retry with the text fallback.
    share = informative / len(rows)
    return share < 0.3

View File

@@ -53,19 +53,52 @@ _RE_TANGGAL_ID = re.compile(
re.IGNORECASE,
)
# Satuan penerbit usually appears in the document letterhead, prefixed by
# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
_RE_SATUAN = re.compile(
r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
r"[^\n]{0,80}",
# Polri letterhead pieces. The full letterhead spans multiple lines that are
# often broken across separate OCR rows like:
#
# KEPOLISIAN NEGARA REPUBLIK INDONESIA
# DAERAH JAWA BARAT
# RESOR CIMAHI
#
# We capture each individual level so we can reconstruct the most-specific
# unit (RESOR / SEKTOR > DAERAH > NEGARA) — a downstream consumer cares
# about *which* unit issued the sprint, not just that some Polri unit did.
_RE_LEVEL_NEGARA = re.compile(
r"KEPOLISIAN\s+NEGARA\s+REPUBLIK\s+INDONESIA",
re.IGNORECASE,
)
_RE_LEVEL_DAERAH = re.compile(
r"(?:KEPOLISIAN\s+)?DAERAH\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
re.IGNORECASE | re.MULTILINE,
)
_RE_LEVEL_RESOR = re.compile(
r"(?:KEPOLISIAN\s+)?RESORT?\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
re.IGNORECASE | re.MULTILINE,
)
_RE_LEVEL_SEKTOR = re.compile(
r"(?:KEPOLISIAN\s+)?SEKTOR\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
re.IGNORECASE | re.MULTILINE,
)
_RE_LEVEL_MABES = re.compile(r"MABES\s+POLRI\b", re.IGNORECASE)
# "Perihal : ...." up to end of line.
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
# Many sprint docs (especially Polres-level) use 'Pertimbangan' as the
# single-paragraph rationale block instead of (or alongside) 'Perihal'.
# When `perihal` is missing we fall back to the first non-empty line under
# 'Pertimbangan :' so the LLM doesn't have to guess and so a downstream
# audit trail still has *something* in the perihal slot.
_RE_PERTIMBANGAN_LABEL = re.compile(r"^\s*PERTIMBANGAN\b", re.IGNORECASE)
# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
# OCR sometimes splits the number from its content across two lines:
# 1.
# Undang-Undang Nomor 2 Tahun 2002 ...
# We detect a bare-number line and merge with the next non-empty line.
_RE_DASAR_BARE_NUMBER = re.compile(r"^\s*(\d+)\s*[.)]\s*$")
# Generic 'untuk' bullet — same shape as a dasar item.
_RE_UNTUK_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
@@ -99,54 +132,159 @@ def find_tanggal(text: str) -> date | None:
return None
def _clean_unit_tail(tail: str) -> str:
"""Strip trailing punctuation/noise from the captured place name."""
return " ".join(tail.split()).strip(" .,;:'\"")
def find_satuan(text: str) -> str | None:
"""Return the first letterhead match (issuing unit), normalized."""
match = _RE_SATUAN.search(text)
if not match:
return None
return " ".join(match.group(0).split())
"""Return the issuing unit, preferring the most-specific letterhead level.
Polri letterheads are hierarchical (Negara > Daerah > Resor/Sektor). The
actual *issuing* unit is the deepest level present in the letterhead, not
the topmost generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' line. We scan
for each level independently and pick the most specific one available;
if only the generic Negara line is present we return that.
Examples
--------
>>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA\\n"
... "DAERAH JAWA BARAT\\nRESOR CIMAHI")
'KEPOLISIAN RESOR CIMAHI'
>>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
'KEPOLISIAN NEGARA REPUBLIK INDONESIA'
"""
# We only look at the document head — letterheads always sit at the
# very top, and constraining the search prevents false positives from
# body text like '... Polres Cimahi ...' deep in a paragraph.
head = "\n".join(text.splitlines()[:25])
sektor = _RE_LEVEL_SEKTOR.search(head)
if sektor:
return f"KEPOLISIAN SEKTOR {_clean_unit_tail(sektor.group(1))}"
resor = _RE_LEVEL_RESOR.search(head)
if resor:
return f"KEPOLISIAN RESOR {_clean_unit_tail(resor.group(1))}"
daerah = _RE_LEVEL_DAERAH.search(head)
if daerah:
return f"KEPOLISIAN DAERAH {_clean_unit_tail(daerah.group(1))}"
if _RE_LEVEL_MABES.search(head):
return "MABES POLRI"
if _RE_LEVEL_NEGARA.search(head):
return "KEPOLISIAN NEGARA REPUBLIK INDONESIA"
return None
def find_perihal(text: str) -> str | None:
    """Return the document subject ('perihal'), or ``None`` when absent.

    Resolution order:

    1. The first ``Perihal: ...`` line with non-blank content, trimmed to
       that line only.
    2. Otherwise the content of the first 'Pertimbangan' label (a common
       variant in Polres-level surat sprint that has no distinct 'Perihal'
       field): the text on the label line itself if there is any
       (``"Pertimbangan : bahwa ..."``), else the first non-empty line among
       the four lines that follow.

    This is kept in regex-land rather than deferred to the LLM because the
    LLM tends to hallucinate perihal content from arbitrary paragraphs.
    """
    lines = text.splitlines()

    for line in lines:
        m = _RE_PERIHAL.search(line)
        if m:
            value = m.group(1).strip()
            # A label followed only by whitespace captures a blank; treat it
            # as "no value" so the Pertimbangan fallback still gets a chance.
            if value:
                return value

    for idx, line in enumerate(lines):
        label = _RE_PERTIMBANGAN_LABEL.match(line)
        if not label:
            continue
        # OCR often keeps the rationale on the same row as its label; without
        # this check the function would return the *second* line of the
        # paragraph instead of its beginning.
        inline = line[label.end() :].strip(" :\t")
        if inline:
            return inline
        for follow in lines[idx + 1 : idx + 5]:
            stripped = follow.strip(" :\t")
            if stripped:
                return stripped
        # Only the first Pertimbangan label is considered.
        break
    return None
def _collect_numbered_section(
    lines: list[str],
    start_idx: int,
    terminators: tuple[str, ...],
) -> list[str]:
    """Gather the numbered list items found from ``lines[start_idx]`` onward.

    Each line is stripped and classified in this order:

    * starts (upper-cased) with one of ``terminators`` -> stop;
    * blank -> skipped; a second consecutive blank stops the scan once at
      least one item exists and no number marker is awaiting its body;
    * bare number marker (``"1."`` / ``"2)"``) -> remember that the next
      content line is an item body;
    * inline item (``"1. text"``) -> new item holding the text after the
      marker;
    * any other line -> a new item if a bare marker is pending, otherwise a
      continuation appended (space-joined) to the latest item; ignored when
      no item exists yet.
    """
    stop_prefixes = tuple(terminators)
    collected: list[str] = []
    awaiting_body = False
    consecutive_blanks = 0

    for raw in lines[start_idx:]:
        content = raw.strip()
        if content.upper().startswith(stop_prefixes):
            break

        if not content:
            consecutive_blanks += 1
            # One blank is tolerated (OCR sprinkles them); two in a row
            # reliably mark the end of the section.
            if consecutive_blanks >= 2 and collected and not awaiting_body:
                break
            continue
        consecutive_blanks = 0

        if _RE_DASAR_BARE_NUMBER.match(content):
            awaiting_body = True
            continue

        inline = _RE_DASAR_ITEM.match(content)
        if inline or awaiting_body:
            collected.append(inline.group(2).strip() if inline else content)
            awaiting_body = False
        elif collected:
            collected[-1] = (collected[-1] + " " + content).strip()

    return collected
def find_dasar_list(text: str) -> list[str]:
"""Extract numbered 'Dasar' items from the text.
Heuristic: locate a line containing 'DASAR' (Indonesian: "DASAR :") and
collect subsequent lines that start with a number. Stops at a blank line
or a line beginning with another section header keyword.
delegate to ``_collect_numbered_section`` which handles three OCR
artefacts:
1. Inline numbered items: ``"1. Undang-Undang ..."``.
2. Bare-number lines (the OCR engine puts the number alone on a line):
``"1.\\n Undang-Undang ..."``.
3. Continuation lines (a line that is the wrapped tail of the previous
item gets appended back onto it).
"""
lines = text.splitlines()
items: list[str] = []
in_dasar = False
section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
for raw_line in lines:
line = raw_line.strip()
if not in_dasar:
if re.match(r"^\s*DASAR\b", line, re.IGNORECASE):
in_dasar = True
continue
if not line:
if items:
break
continue
upper = line.upper()
if any(upper.startswith(term) for term in section_terminators):
break
m = _RE_DASAR_ITEM.match(line)
if m:
items.append(m.group(2).strip())
elif items:
# continuation of the previous dasar item
items[-1] = (items[-1] + " " + line).strip()
return items
for idx, raw_line in enumerate(lines):
if re.match(r"^\s*DASAR\b", raw_line.strip(), re.IGNORECASE):
return _collect_numbered_section(lines, idx + 1, section_terminators)
return []
def find_untuk_list(text: str) -> list[str]:
    """Extract the numbered 'Untuk' task bullets from the text.

    The section is anchored on the first line that starts with the word
    'Untuk' (case-insensitive). Items are gathered by
    ``_collect_numbered_section``, so bare-number lines and wrapped
    continuation lines are handled the same way as for 'Dasar'.

    Text that shares the label line (``"Untuk : 1. melaksanakan ..."``) is
    passed to the collector as the first line of the section. Previously the
    scan began on the line *after* the label and silently dropped such an
    item.

    Returns an empty list when no 'Untuk' line exists.
    """
    lines = text.splitlines()
    # Stop conditions: 'Selesai' (boilerplate), 'Dikeluarkan di' / 'Pada
    # tanggal' (signature block), 'Tembusan' (carbon-copy section).
    terminators = ("SELESAI", "DIKELUARKAN", "TEMBUSAN", "PADA TANGGAL")
    for idx, raw_line in enumerate(lines):
        stripped = raw_line.strip()
        label = re.match(r"UNTUK\b[\s:\-]*", stripped, re.IGNORECASE)
        if not label:
            continue
        inline = stripped[label.end() :]
        section = ([inline] if inline else []) + lines[idx + 1 :]
        return _collect_numbered_section(section, 0, terminators)
    return []
def find_signatory(text: str) -> Signatory:

View File

@@ -30,6 +30,13 @@ def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
flags.append(ReviewFlag.INVALID_NRP)
if entry.pangkat and not is_valid_pangkat(entry.pangkat):
flags.append(ReviewFlag.UNKNOWN_PANGKAT)
# Identification of a personnel row requires at least pangkat OR nrp.
# A row carrying only a name is structurally incomplete - likely a
# mis-aligned table cell or a leaked tembusan/dasar fragment - and must
# be flagged for human review even though pangkat/nrp validation
# individually pass (because they're empty).
if not entry.pangkat and not entry.nrp:
flags.append(ReviewFlag.INCOMPLETE_PERSONNEL_ROW)
return flags

View File

@@ -19,7 +19,15 @@ from ocr_sprint.llm.extractor import llm_fill_header
from ocr_sprint.pipeline.confidence import compute_confidence, route
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
from ocr_sprint.pipeline.extract.personnel import extract_personnel
from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_text,
is_low_quality,
)
from ocr_sprint.pipeline.extract.regex_rules import (
extract_header,
find_signatory,
find_untuk_list,
)
from ocr_sprint.pipeline.extract.validators import validate_extraction
from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
@@ -112,6 +120,7 @@ def run_pipeline(content: bytes) -> PipelineOutput:
header = merged
personel: list[PersonnelEntry] = []
table_flags: list[ReviewFlag] = []
if s.tables_enabled and cleaned_pages:
all_tables: list[DetectedTable] = []
for img in cleaned_pages:
@@ -126,14 +135,33 @@ def run_pipeline(content: bytes) -> PipelineOutput:
personel_rows=len(personel),
)
initial_flags: list[ReviewFlag] = list(llm_flags)
# Text-based fallback: PP-Structure can succeed structurally but emit
# rows with only ``nama`` populated (column mapper degraded), or fail to
# detect the table at all. In both cases the regex fallback that scans
# raw OCR for rank+NRP pairs produces a much more useful result. We
# always run it when the structured path is empty or low-quality, and
# raise a review flag so the operator knows the document didn't go
# through the preferred path.
if is_low_quality(personel):
fallback_rows = extract_personnel_from_text(full_text)
if fallback_rows:
personel = fallback_rows
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
_logger.info(
"pipeline.personnel_text_fallback",
fallback_rows=len(fallback_rows),
)
untuk_items = find_untuk_list(full_text)
initial_flags: list[ReviewFlag] = list(llm_flags) + list(table_flags)
if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
result = ExtractionResult(
header=header,
personel=personel,
untuk=[],
untuk=untuk_items,
ttd=ttd,
raw_text=full_text,
confidence=mean_ocr_conf,

View File

@@ -21,6 +21,8 @@ class ReviewFlag(str, Enum):
DATE_PARSE_FAILED = "date_parse_failed"
LLM_FALLBACK = "llm_fallback"
LLM_UNAVAILABLE = "llm_unavailable"
PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"
class Signatory(BaseModel):

View File

@@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none(
out = run_pipeline(b"%PDF-1.4\n%fake")
assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When PP-Structure produces low-quality rows (e.g. only ``nama`` filled),
    the orchestrator must run the text fallback against the raw OCR text and
    raise the ``personnel_text_fallback`` flag.
    """
    # Disable the LLM stage via the environment, then clear the settings
    # cache so the new value is picked up (presumably ``get_settings`` is
    # memoised; it exposes ``cache_clear``).
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    get_settings.cache_clear()

    # OCR text laid out one table cell per line: row marker, name,
    # "<rank> / <NRP>", then the position.
    raw_text = (
        "DAFTAR PERSONIL\n"
        "1.\n"
        "SRI WAHYUNI\n"
        "AIPTU / 75070328\n"
        "INTELKAM POLRES CIMAHI\n"
        "2.\n"
        "AGUNG LUKMAN\n"
        "BRIPTU / 99030245\n"
        "SAT INTELKAM\n"
    )

    # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on
    # the real Polres Cimahi document).
    from ocr_sprint.schemas.personnel import PersonnelEntry

    pp_structure_low_quality = [
        PersonnelEntry(nama="SRI WAHYUNI"),
        PersonnelEntry(nama="AGUNG LUKMAN"),
    ]

    # NOTE(review): ``_stub_pipeline_stages`` is defined elsewhere in this
    # test module; it is assumed to stub the OCR/header stages so that
    # ``run_pipeline`` sees ``raw_text`` as the document text. Confirm there.
    _stub_pipeline_stages(
        monkeypatch,
        raw_text=raw_text,
        regex_header=HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="Polres Cimahi",
            perihal="ok",
            dasar=["UU 2/2002"],
        ),
    )
    # Override extract_personnel to return the broken PP-Structure rows.
    # This must come after ``_stub_pipeline_stages`` in case that helper
    # patches the same attribute.
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality)

    out = run_pipeline(b"%PDF-1.4\n%fake")

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags
    # Fallback rows must carry pangkat + nrp (the whole point of the path).
    assert all(r.pangkat and r.nrp for r in out.result.personel)
    assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"}
def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Healthy PP-Structure output (rank+nrp present on most rows) must NOT
    be replaced by the text fallback.
    """
    # Disable the LLM stage and clear the settings cache so the env change
    # takes effect (presumably ``get_settings`` is memoised).
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    get_settings.cache_clear()

    from ocr_sprint.schemas.personnel import PersonnelEntry

    # Every row carries both pangkat and nrp, so ``is_low_quality`` is False
    # and the orchestrator should keep these rows untouched.
    healthy = [
        PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
        PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
        PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
    ]

    # The raw text is not used for personnel extraction here because the
    # fallback is skipped. The pipeline still scans it for 'Untuk' items,
    # which this text does not contain.
    _stub_pipeline_stages(
        monkeypatch,
        raw_text="ignored — should not be parsed",
        regex_header=HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="Polres X",
            perihal="ok",
            dasar=["UU 2/2002"],
        ),
    )
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy)

    out = run_pipeline(b"%PDF-1.4\n%fake")

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags
    # Order and content of the structured rows must be preserved.
    assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"]

View File

@@ -0,0 +1,118 @@
"""Tests for the text-based personnel fallback extractor.
Driven by the real Polres Cimahi sprint document where PP-Structure
produced 24 rows with only ``nama`` populated. The fallback should
recover at least the rank + NRP for every row.
"""
from __future__ import annotations
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_text,
is_low_quality,
)
from ocr_sprint.schemas.personnel import PersonnelEntry
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
NAMA
PANGKAT / NRP
JABATAN
KET
BAUR SKCK SAT
1.
SRI WAHYUNI
AIPTU / 75070328
INTELKAM POLRES
CIMAHI
BA PELAKSANA SKCK
2.
CITRA DWI PUTRI R
BRIPTU / 95070659
SAT INTELKAM
POLRES CIMAHI
BA PELAKSANA SKCK
3.
AGUNG LUKMAN AL
BRIPTU / 99030245
SAT INTELKAM
POLRES CIMAHI
BA POLSEK
8.
ARIEF SYAHRUL ZAMAN
BRIGPOL /96030446
MARGAASIH
"""
class TestExtractPersonnelFromText:
    """Behavioural checks for ``extract_personnel_from_text`` on OCR-like text."""

    def test_extracts_rank_nrp_and_name(self) -> None:
        extracted = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(extracted) == 4
        head = extracted[0]
        assert (head.pangkat, head.nrp, head.nama) == ("AIPTU", "75070328", "SRI WAHYUNI")

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        tail = extract_personnel_from_text(_CIMAHI_FIXTURE)[-1]
        # The unspaced OCR spelling 'BRIGPOL' must map to canonical 'BRIGADIR'.
        assert (tail.pangkat, tail.nrp, tail.nama) == (
            "BRIGADIR",
            "96030446",
            "ARIEF SYAHRUL ZAMAN",
        )

    def test_skips_header_lines_as_names(self) -> None:
        # Column-header words must never be emitted as a person's name.
        found_names = {entry.nama for entry in extract_personnel_from_text(_CIMAHI_FIXTURE)}
        assert not found_names & {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}

    def test_jabatan_collected_from_following_lines(self) -> None:
        jabatan = extract_personnel_from_text(_CIMAHI_FIXTURE)[0].jabatan_dinas
        assert jabatan is not None
        assert "INTELKAM" in jabatan

    def test_empty_text_returns_empty(self) -> None:
        assert extract_personnel_from_text("") == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        paragraph = "Just a paragraph with no rank or NRP at all.\nAnother line."
        assert extract_personnel_from_text(paragraph) == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        # An 8-digit number on its own, with no rank token, is not a row.
        assert extract_personnel_from_text("Some line\n12345678\nanother line") == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        # A rank-looking word outside the master list must not yield a row.
        assert extract_personnel_from_text("Some line\nFAKERANK / 12345678\nanother line") == []
class TestIsLowQuality:
    """Threshold behaviour of the ``is_low_quality`` heuristic."""

    @staticmethod
    def _complete_row(i: int) -> PersonnelEntry:
        """Build a row that carries name, rank and NRP."""
        return PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")

    def test_empty_list_is_low_quality(self) -> None:
        assert is_low_quality([]) is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        name_only = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)]
        assert is_low_quality(name_only) is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        complete = [self._complete_row(i) for i in range(10)]
        assert is_low_quality(complete) is False

    def test_borderline_30_percent_threshold(self) -> None:
        # Exactly 3 informative rows out of 10 sits on the 0.3 boundary and
        # must be treated as not-low-quality.
        mixed = [self._complete_row(i) for i in range(3)]
        mixed += [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
        assert is_low_quality(mixed) is False

View File

@@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import (
find_satuan,
find_signatory,
find_tanggal,
find_untuk_list,
)
@@ -60,6 +61,36 @@ class TestSatuan:
result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
assert result is not None
def test_prefers_resor_over_negara_when_both_present(self) -> None:
# The Polri letterhead lists units hierarchically; the issuing unit
# is the deepest level, not the topmost generic "NEGARA" line.
text = (
"KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
"DAERAH JAWA BARAT\n"
"RESOR CIMAHI\n"
"SURAT PERINTAH\n"
)
result = find_satuan(text)
assert result == "KEPOLISIAN RESOR CIMAHI"
def test_prefers_sektor_over_resor(self) -> None:
text = (
"KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
"DAERAH JAWA BARAT\n"
"RESOR CIMAHI\n"
"SEKTOR PADALARANG\n"
)
result = find_satuan(text)
assert result == "KEPOLISIAN SEKTOR PADALARANG"
def test_handles_daerah_only(self) -> None:
text = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n"
result = find_satuan(text)
assert result == "KEPOLISIAN DAERAH JAWA BARAT"
def test_returns_none_when_no_letterhead(self) -> None:
assert find_satuan("no police letterhead here") is None
class TestPerihal:
def test_extracts_perihal_line(self) -> None:
@@ -69,6 +100,25 @@ class TestPerihal:
def test_returns_none_when_absent(self) -> None:
assert find_perihal("no perihal field") is None
def test_falls_back_to_pertimbangan_block(self) -> None:
# Many Polres-level sprints use "Pertimbangan" instead of "Perihal".
# The fallback should pick up the first non-empty line under it.
text = (
"Pertimbangan\n"
"Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n"
"DASAR :\n"
"1. ...\n"
)
result = find_perihal(text)
assert result is not None
assert result.startswith("Bahwa dalam rangka mendukung")
def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None:
# If the document has both a Perihal label AND a Pertimbangan
# paragraph, the explicit Perihal wins.
text = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n"
assert find_perihal(text) == "The actual perihal."
class TestDasar:
def test_numbered_list(self) -> None:
@@ -88,6 +138,57 @@ class TestDasar:
def test_empty_when_section_missing(self) -> None:
assert find_dasar_list("no dasar section") == []
def test_handles_bare_number_lines_split_by_ocr(self) -> None:
# OCR sometimes places the number marker on its own line and the
# body on the next non-empty line. The collector must merge them
# rather than dropping the body or appending it to the previous
# item (which the old implementation did).
text = (
"Dasar\n"
":\n"
"1.\n"
" Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n"
"2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n"
"3.\n"
"Keterangan Catatan Kepolisian (SKCK);\n"
"4.\n"
"Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n"
"5. DIPA Petikan Satker Polres Cimahi.\n"
"DIPERINTAHKAN\n"
)
items = find_dasar_list(text)
assert len(items) == 5
assert items[0].startswith("Undang - Undang")
assert items[2].startswith("Keterangan Catatan")
assert items[3].startswith("Pelayanan dilingkungan")
assert items[4].startswith("DIPA")
class TestUntuk:
    """Extraction of the numbered task list under the 'Untuk' label."""

    def test_extracts_numbered_untuk_bullets(self) -> None:
        # Number markers sit on their own lines, as OCR often emits them.
        document = "\n".join(
            [
                "DIPERINTAHKAN",
                "Kepada",
                "Untuk",
                "1.",
                "melaksanakan tugas A;",
                "2.",
                "melaksanakan tugas B;",
                "Selesai.",
            ]
        ) + "\n"
        tasks = find_untuk_list(document)
        assert len(tasks) == 2
        assert tasks == ["melaksanakan tugas A;", "melaksanakan tugas B;"]

    def test_returns_empty_when_section_missing(self) -> None:
        assert find_untuk_list("no untuk section") == []

    def test_stops_at_dikeluarkan(self) -> None:
        # Anything after the signature-block marker must be ignored.
        document = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. should not be captured\n"
        assert find_untuk_list(document) == ["tugas A;"]
class TestSignatory:
def test_extracts_last_nrp(self) -> None:

View File

@@ -62,6 +62,20 @@ class TestPersonnelValidator:
entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)
def test_row_with_only_name_is_flagged_incomplete(self) -> None:
# A row that captured only `nama` (no pangkat AND no nrp) is the
# signature of mis-aligned table extraction. Must be flagged so
# the operator routes the document to needs_review.
entry = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
flags = validate_personnel_entry(entry)
assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in flags
def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
# Having pangkat without NRP is suboptimal but still identifies a
# rank, so we don't raise the structural-incompleteness flag.
entry = PersonnelEntry(pangkat="AKP", nama="Test")
assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in validate_personnel_entry(entry)
class TestHeaderValidator:
def test_complete_header_no_flags(self) -> None: