Devin Review correctly flagged that the bare "NO" and "KET" entries in the blocklist would silently drop common Indonesian names (KETUT, NOVA, NOOR, NORMAN, NOVIANTI, ...) because the check used startswith rather than a word boundary. Replaced the per-prefix loop with a single compiled regex anchored at ^ with a trailing \b, which still matches column headers like "NO" or "KET" on their own line but no longer rejects "NOOR HIDAYAT" or "KETUT WARDANA". Also fixes the same bug in _following_jabatan. Added two regression tests covering both directions: names starting with the offending tokens are kept, bare column headers still rejected. Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
210 lines
7.2 KiB
Python
210 lines
7.2 KiB
Python
"""Text-based fallback personnel extractor.
|
|
|
|
PP-Structure (Phase 3) is the primary path for personnel rows because it
|
|
preserves the table grid. But PP-Structure can fail in two ways on real
|
|
sprint scans:
|
|
|
|
1. The table is not detected at all (low-quality scan, watermark, atypical
|
|
layout) — `extract_personnel` returns an empty list.
|
|
2. The table IS detected but the column mapping is too sparse, so each row
|
|
collapses to a single ``nama`` cell with all other fields ``None``. This
|
|
is what was observed on a real Polres Cimahi sprint where the OCR
|
|
produced 24 rows with only ``nama`` populated.
|
|
|
|
This module provides a regex/heuristic fallback that operates directly on
|
|
the flat OCR text. It is deliberately conservative: a row must have BOTH a
|
|
recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
|
|
generate the kind of "name-only" rows that motivated the fallback in the
|
|
first place.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
from ocr_sprint.data.master_pangkat import (
|
|
PANGKAT_VARIANTS,
|
|
is_valid_pangkat,
|
|
normalize_pangkat,
|
|
)
|
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
|
|
|
# Every known rank spelling, de-duplicated and ordered longest-first. The
# ordering matters for the regex alternation built from it: a multi-word rank
# such as "KOMBES POL" must be tried before its shorter prefix "KOMBES".
_RANK_TOKENS: tuple[str, ...] = tuple(
    sorted(
        {spelling for group in PANGKAT_VARIANTS.values() for spelling in group},
        key=len,
        reverse=True,
    )
)
# Regex-escaped alternation body ("tok1|tok2|...") of all rank spellings.
_RANK_ALT = "|".join(map(re.escape, _RANK_TOKENS))
|
|
# Matches a rank token that is followed directly by an 8-digit NRP, with
# nothing between the two except separator characters: '/', '.', '-', ',',
# ':' or whitespace. Any other text between the rank and the NRP prevents a
# match. The rank token must be word-bounded so "BRIPDA" doesn't match
# inside e.g. "ABRIPDA-style" text, and the trailing \b rejects digit runs
# longer than 8 instead of truncating them. Matching is case-insensitive.
_RE_RANK_NRP_LINE = re.compile(
    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
    re.IGNORECASE,
)
|
|
# A line consisting solely of a row-number marker like "1." or "12)": one to
# three digits followed by '.' or ')', with optional whitespace around each
# part. OCR often puts it on its own line in tabular layouts.
_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
|
|
# Lines that should never be interpreted as a personnel name. These are
# section headers, OCR garbage anchors, and column header tokens. We match
# them with a *word-boundary* regex (built from this list) rather than a
# bare ``startswith`` check, because short tokens like ``"NO"`` and
# ``"KET"`` would otherwise reject perfectly valid Indonesian names
# (e.g. ``"NOVA SARI"``, ``"NOOR HIDAYAT"``, ``"KETUT WARDANA"`` — the
# latter being an extremely common Balinese birth-order name).
_NAME_BLOCKLIST_TOKENS: tuple[str, ...] = (
    "PADA TANGGAL",  # multi-word entries first so they win the alternation
    "SURAT PERINTAH",
    "DASAR",
    "PERIHAL",
    "PERTIMBANGAN",
    "DIPERINTAHKAN",
    "KEPADA",
    "UNTUK",
    "TEMBUSAN",
    "DIKELUARKAN",
    "SELESAI",
    "DAFTAR",
    "LAMPIRAN",
    "NOMOR",
    "TANGGAL",
    "KEPOLISIAN",
    "DAERAH",
    "RESOR",
    "SEKTOR",
    "MABES",
    "NRP",
    "NIP",
    "PANGKAT",
    "JABATAN",
    "NAMA",
    "KETERANGAN",
    "KET",
    "NO",
)
# Anchored at the start of the line and closed with \b so only whole leading
# tokens are rejected. The words of a multi-word token are joined with \s+
# instead of a literal single space: OCR output frequently contains doubled
# spaces or tabs, and "SURAT  PERINTAH" must still be recognised as a header
# (neither "SURAT" nor "PADA" is blocklisted on its own).
_RE_NAME_BLOCKLIST = re.compile(
    r"^(?:"
    + "|".join(
        r"\s+".join(re.escape(word) for word in tok.split())
        for tok in _NAME_BLOCKLIST_TOKENS
    )
    + r")\b",
    re.IGNORECASE,
)
|
|
# Minimum requirement for a name line: it must contain at least one ASCII
# letter. This is what rejects pure-numeric or pure-symbol lines; the
# pattern does not otherwise restrict which characters a name may contain.
_RE_NAME_OK = re.compile(r"[A-Za-z]")
|
|
|
|
|
|
def _is_plausible_name(line: str) -> bool:
    """Return True iff ``line`` could plausibly be a personnel name.

    After stripping surrounding whitespace, a line qualifies when it:

    * contains at least one ASCII letter (``_RE_NAME_OK``);
    * does not start with a section/column-header token
      (``_RE_NAME_BLOCKLIST``); and
    * is not itself a rank+NRP line (``_RE_RANK_NRP_LINE``).

    Args:
        line: One raw line of OCR text.

    Returns:
        ``True`` when the line passes all three checks, ``False`` otherwise.
    """
    stripped = line.strip()
    # Requiring a letter already rejects empty lines, bare row-number markers
    # ("1.", "12)") and every other digits/punctuation-only line, so no
    # separate row-number check is needed: such lines contain no letter.
    if not _RE_NAME_OK.search(stripped):
        return False
    if _RE_NAME_BLOCKLIST.match(stripped):
        return False
    return _RE_RANK_NRP_LINE.search(stripped) is None
|
|
|
|
|
|
def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Gather the jabatan text from the lines right after a rank+NRP line.

    At most the three lines following ``lines[idx]`` are inspected. Leading
    blank lines are skipped; once some text has been collected, a blank line
    ends the collection. Collection also ends at the next rank+NRP line, at a
    bare row-number line, or at a line starting with a blocklisted header
    token.

    Args:
        lines: All lines of the OCR text.
        idx: Index of the rank+NRP line whose jabatan is wanted.

    Returns:
        The collected lines joined by single spaces with internal whitespace
        collapsed, or ``None`` when nothing was collected.
    """
    collected: list[str] = []
    pos = idx + 1
    end = min(idx + 4, len(lines))
    while pos < end:
        text = lines[pos].strip()
        pos += 1
        if text == "":
            if collected:
                # A gap after real content closes the jabatan block.
                break
            continue
        # Any of these marks the start of another row or section.
        at_boundary = (
            _RE_RANK_NRP_LINE.search(text)
            or _RE_ROW_NUMBER.match(text)
            or _RE_NAME_BLOCKLIST.match(text)
        )
        if at_boundary:
            break
        collected.append(text)

    if len(collected) == 0:
        return None
    words = " ".join(collected).split()
    return " ".join(words) or None
|
|
|
|
|
|
def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Extract personnel rows from flat OCR text on a best-effort basis.

    How it works:

    1. Every line is tested against the rank+NRP pattern; lines without a
       match are ignored, as are matches whose rank does not normalise to a
       valid pangkat.
    2. For each accepted line, up to five preceding lines are scanned
       (nearest first) for a plausible name that no earlier row has already
       claimed, and up to three following lines are collected as the
       jabatan.
    3. One ``PersonnelEntry`` is emitted per accepted line, always carrying
       pangkat and nrp; ``nama`` and ``jabatan_dinas`` may be ``None``.

    Only the first rank+NRP match on a line is used, and a name line can be
    claimed by at most one row, so stray ranked text inside a paragraph does
    not fan out into several bogus entries.

    Args:
        raw_text: The full OCR text, lines separated by newlines.

    Returns:
        The extracted entries in the order their rank+NRP lines appear.
    """
    lines = raw_text.splitlines()
    claimed: set[int] = set()
    entries: list[PersonnelEntry] = []

    for idx, raw_line in enumerate(lines):
        hit = _RE_RANK_NRP_LINE.search(raw_line.strip())
        if hit is None:
            continue
        pangkat = normalize_pangkat(hit.group("rank"))
        if not (pangkat and is_valid_pangkat(pangkat)):
            continue

        # Nearest unclaimed plausible name within the five lines above.
        name_pos = next(
            (
                pos
                for pos in range(idx - 1, max(idx - 6, -1), -1)
                if pos not in claimed and _is_plausible_name(lines[pos].strip())
            ),
            None,
        )
        nama: str | None = None
        if name_pos is not None:
            claimed.add(name_pos)
            nama = lines[name_pos].strip()

        entry = PersonnelEntry(
            no=None,
            pangkat=pangkat,
            nrp=hit.group("nrp"),
            nama=nama,
            jabatan_dinas=_following_jabatan(lines, idx),
            jabatan_sprint=None,
            keterangan=None,
        )
        entries.append(entry)

    return entries
|
|
|
|
|
|
def is_low_quality(rows: list[PersonnelEntry]) -> bool:
    """Decide whether a set of extracted rows is too poor to keep.

    A row counts as informative when its ``pangkat`` or its ``nrp`` is
    truthy. The result is ``True`` when ``rows`` is empty or when fewer than
    30% of the rows are informative, signalling that the caller should retry
    with the text-based fallback.

    Args:
        rows: Rows produced by the table extractor.

    Returns:
        ``True`` if the rows look unusable, ``False`` otherwise.
    """
    if not rows:
        return True

    informative = 0
    for row in rows:
        if row.pangkat or row.nrp:
            informative += 1

    # Below a 30% share of rows with rank/NRP signal, the column mapping is
    # assumed to have collapsed everything into ``nama``.
    share = informative / max(1, len(rows))
    return share < 0.3
|