Files
OCR-SPRIN-SERVICE/src/ocr_sprint/pipeline/extract/personnel_text.py
Devin AI 737f4999dd Use word-boundary matching for personnel name blocklist
Devin Review correctly flagged that the bare "NO" and "KET" entries
in the blocklist would silently drop common Indonesian names (KETUT,
NOVA, NOOR, NORMAN, NOVIANTI, ...) because the check used startswith
rather than a word boundary.

Replaced the per-prefix loop with a single compiled regex anchored at
^ with a trailing \b, which still matches column headers like "NO"
or "KET" on their own line but no longer rejects "NOOR HIDAYAT" or
"KETUT WARDANA". Also fixes the same bug in _following_jabatan.

Added two regression tests covering both directions: names starting
with the offending tokens are kept, bare column headers still rejected.

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-26 05:46:21 +00:00

210 lines
7.2 KiB
Python

"""Text-based fallback personnel extractor.
PP-Structure (Phase 3) is the primary path for personnel rows because it
preserves the table grid. But PP-Structure can fail in two ways on real
sprint scans:
1. The table is not detected at all (low-quality scan, watermark, atypical
layout) — `extract_personnel` returns an empty list.
2. The table IS detected but the column mapping is too sparse, so each row
collapses to a single ``nama`` cell with all other fields ``None``. This
is what was observed on a real Polres Cimahi sprint where the OCR
produced 24 rows with only ``nama`` populated.
This module provides a regex/heuristic fallback that operates directly on
the flat OCR text. It is deliberately conservative: a row must have BOTH a
recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
generate the kind of "name-only" rows that motivated the fallback in the
first place.
"""
from __future__ import annotations
import re
from ocr_sprint.data.master_pangkat import (
PANGKAT_VARIANTS,
is_valid_pangkat,
normalize_pangkat,
)
from ocr_sprint.schemas.personnel import PersonnelEntry
# Every rank spelling found in PANGKAT_VARIANTS, de-duplicated and ordered
# longest first so that multi-word ranks (e.g. "KOMBES POL") win over a
# shorter token that is a prefix of them (e.g. "KOMBES") in the alternation.
# Ties between equal-length tokens are broken alphabetically: the tokens come
# out of a set, whose iteration order for strings is not stable across
# processes, and without the tie-break the generated pattern text would
# differ from run to run.
_RANK_TOKENS: tuple[str, ...] = tuple(
    sorted(
        {variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
        key=lambda v: (-len(v), v),
    )
)
# The tokens above, regex-escaped and joined with "|", ready to be embedded
# inside a larger pattern.
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
# Matches a rank token followed by an 8-digit NRP on the same line. Only
# separator characters may sit between the two: '/', '.', '-', ',', ':' or
# whitespace. The rank token must be word-bounded so "BRIPDA" doesn't match
# inside e.g. "ABRIPDA-style" text, and the trailing \b rejects a digit run
# that continues past the eighth digit. Matching is case-insensitive.
# The rank and the number are exposed as the named groups "rank" and "nrp".
_RE_RANK_NRP_LINE = re.compile(
    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
    re.IGNORECASE,
)
# A line consisting solely of a row-number marker such as "1." or "12)":
# one to three digits followed by '.' or ')', with optional whitespace around
# each part. OCR often puts the marker on its own line in tabular layouts.
_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
# Lines that should never be interpreted as a personnel name. These are
# section headers, OCR garbage anchors, and column header tokens. We match
# them with a *word-boundary* regex (built from this list) rather than a
# bare ``startswith`` check, because short tokens like ``"NO"`` and
# ``"KET"`` would otherwise reject perfectly valid Indonesian names
# (e.g. ``"NOVA SARI"``, ``"NOOR HIDAYAT"``, ``"KETUT WARDANA"`` — the
# latter being an extremely common Balinese birth-order name).
_NAME_BLOCKLIST_TOKENS: tuple[str, ...] = (
    "PADA TANGGAL",  # multi-word entries first so they win the alternation
    "SURAT PERINTAH",
    "DASAR",
    "PERIHAL",
    "PERTIMBANGAN",
    "DIPERINTAHKAN",
    "KEPADA",
    "UNTUK",
    "TEMBUSAN",
    "DIKELUARKAN",
    "SELESAI",
    "DAFTAR",
    "LAMPIRAN",
    "NOMOR",
    "TANGGAL",
    "KEPOLISIAN",
    "DAERAH",
    "RESOR",
    "SEKTOR",
    "MABES",
    "NRP",
    "NIP",
    "PANGKAT",
    "JABATAN",
    "NAMA",
    "KETERANGAN",
    "KET",
    "NO",
)
# Anchored at the start of the line, case-insensitive, and closed with \b so
# that a token only matches as a whole word. The words of a multi-word entry
# are joined with \s+ instead of a single literal space: OCR output frequently
# contains doubled spaces or tabs, and "SURAT  PERINTAH" must be blocked just
# like "SURAT PERINTAH".
_RE_NAME_BLOCKLIST = re.compile(
    r"^(?:"
    + "|".join(
        r"\s+".join(re.escape(word) for word in tok.split())
        for tok in _NAME_BLOCKLIST_TOKENS
    )
    + r")\b",
    re.IGNORECASE,
)
# Minimum requirement for a name line: it must contain at least one ASCII
# letter. Pure-numeric or pure-symbol lines are rejected by this check.
_RE_NAME_OK = re.compile(r"[A-Za-z]")
def _is_plausible_name(line: str) -> bool:
    """Return True iff ``line`` could plausibly be a personnel name.

    After stripping surrounding whitespace, a line qualifies when it

    * contains at least one ASCII letter,
    * does not start with a blocklisted header/column token, and
    * does not itself contain a rank + NRP pair.

    Empty lines, bare row-number markers ("1.", "12)") and any other line
    made only of digits, whitespace and punctuation need no dedicated test:
    none of them contains a letter, so the first check already rejects them.
    """
    stripped = line.strip()
    # No letter at all (this also covers the empty string).
    if not _RE_NAME_OK.search(stripped):
        return False
    # Section headers and column headers such as "NO", "NAMA", "DASAR".
    if _RE_NAME_BLOCKLIST.match(stripped):
        return False
    # A line carrying rank + NRP is a data row, not a name.
    return not _RE_RANK_NRP_LINE.search(stripped)
def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Gather the jabatan text from the lines right after a rank+NRP line.

    At most three lines following ``lines[idx]`` are inspected. Blank lines
    that come before any text are skipped (they still use up the window); a
    blank line after some text has been gathered ends the collection. The
    scan also ends at the next rank+NRP line, at a bare row-number line, or
    at a line starting with a blocklisted header token.

    Returns the gathered lines joined with single spaces and with inner
    whitespace runs collapsed, or ``None`` when nothing was gathered.
    """
    collected: list[str] = []
    for offset in (1, 2, 3):
        pos = idx + offset
        if pos >= len(lines):
            break
        text = lines[pos].strip()
        if not text:
            if collected:
                break
            continue
        # Any of these marks the start of something that is not jabatan text.
        hits_boundary = (
            _RE_RANK_NRP_LINE.search(text)
            or _RE_ROW_NUMBER.match(text)
            or _RE_NAME_BLOCKLIST.match(text)
        )
        if hits_boundary:
            break
        collected.append(text)
    if not collected:
        return None
    merged = " ".join(collected)
    return " ".join(merged.split()) or None
def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Extract personnel rows from flat OCR text on a best-effort basis.

    How it works:

    1. Every line is searched for a known rank followed by an 8-digit NRP;
       lines without that pair are ignored.
    2. The matched rank is normalized and validated; lines whose rank does
       not survive this step are ignored as well.
    3. For each remaining line, the nearest plausible name line among the
       five preceding lines becomes ``nama``, and up to three following
       lines become ``jabatan_dinas``.

    Only the first rank+NRP match on a line is used, and a given name line
    is handed out at most once, so stray ranked text inside a paragraph
    cannot attach the same name to several entries. ``no``,
    ``jabatan_sprint`` and ``keterangan`` are always ``None``; ``nama`` and
    ``jabatan_dinas`` are ``None`` when nothing suitable is found.
    """
    lines = raw_text.splitlines()
    used_name_lines: set[int] = set()
    entries: list[PersonnelEntry] = []

    def claim_name_before(row_idx: int) -> str | None:
        # Walk backwards from the line just above the row, at most five
        # lines, skipping lines already handed out as a name.
        for pos in reversed(range(max(row_idx - 5, 0), row_idx)):
            if pos in used_name_lines:
                continue
            text = lines[pos].strip()
            if _is_plausible_name(text):
                used_name_lines.add(pos)
                return text
        return None

    for row_idx, raw_line in enumerate(lines):
        hit = _RE_RANK_NRP_LINE.search(raw_line.strip())
        if hit is None:
            continue
        pangkat = normalize_pangkat(hit.group("rank"))
        if not (pangkat and is_valid_pangkat(pangkat)):
            continue
        entries.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=hit.group("nrp"),
                nama=claim_name_before(row_idx),
                jabatan_dinas=_following_jabatan(lines, row_idx),
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return entries
def is_low_quality(
    rows: list[PersonnelEntry], *, min_useful_ratio: float = 0.3
) -> bool:
    """Heuristic: did the table extraction produce useless rows?

    A row counts as useful when it carries a truthy ``pangkat`` or ``nrp``.
    When too few rows are useful, the column mapper is assumed to have
    degraded to "everything is nama" and the caller should retry with the
    text-based fallback.

    Args:
        rows: Rows produced by the primary extractor.
        min_useful_ratio: Smallest share of useful rows (0.0-1.0) that is
            still considered acceptable. Defaults to 0.3, i.e. at least 30%
            of the rows must carry rank/NRP signal.

    Returns:
        ``True`` when ``rows`` is empty or the share of useful rows is
        strictly below ``min_useful_ratio``; ``False`` otherwise.
    """
    if not rows:
        return True
    useful = sum(1 for row in rows if row.pangkat or row.nrp)
    # ``rows`` is non-empty here, so the division is safe.
    return useful / len(rows) < min_useful_ratio