Fix personnel extraction + header bugs on real Polres Cimahi sprint
This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related hardening changes (items 5-6):
1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
INDONESIA' letterhead line instead of the most-specific issuing unit
(e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
scan for each level independently and return the deepest available.
2. find_dasar_list dropped numbered items when OCR put the marker on
its own line ("1.\n Undang-Undang ..."). Refactored into
_collect_numbered_section that buffers a bare-number line and uses
the next non-empty line as the body. Also reused for the new
find_untuk_list which extracts the previously-empty 'untuk' bullets.
3. find_perihal returned None for documents that use 'Pertimbangan'
(very common in Polres-level sprint), forcing the LLM to guess.
Added a regex fallback that picks up the first line under a
'Pertimbangan' label so we keep extraction deterministic.
4. Personnel rows were emitted with only nama populated when
PP-Structure detected a table but the column mapper degraded.
Added a text-based fallback (extract_personnel_from_text) that
scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
Documents that take this path are marked for review by raising the new PERSONNEL_TEXT_FALLBACK flag.
5. Validation now flags rows with neither pangkat nor nrp as
INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
even when individual nrp/pangkat checks pass on empty values.
6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).
Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
203
src/ocr_sprint/pipeline/extract/personnel_text.py
Normal file
203
src/ocr_sprint/pipeline/extract/personnel_text.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""Text-based fallback personnel extractor.
|
||||
|
||||
PP-Structure (Phase 3) is the primary path for personnel rows because it
|
||||
preserves the table grid. But PP-Structure can fail in two ways on real
|
||||
sprint scans:
|
||||
|
||||
1. The table is not detected at all (low-quality scan, watermark, atypical
|
||||
layout) — `extract_personnel` returns an empty list.
|
||||
2. The table IS detected but the column mapping is too sparse, so each row
|
||||
collapses to a single ``nama`` cell with all other fields ``None``. This
|
||||
is what was observed on a real Polres Cimahi sprint where the OCR
|
||||
produced 24 rows with only ``nama`` populated.
|
||||
|
||||
This module provides a regex/heuristic fallback that operates directly on
|
||||
the flat OCR text. It is deliberately conservative: a row must have BOTH a
|
||||
recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
|
||||
generate the kind of "name-only" rows that motivated the fallback in the
|
||||
first place.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from ocr_sprint.data.master_pangkat import (
|
||||
PANGKAT_VARIANTS,
|
||||
is_valid_pangkat,
|
||||
normalize_pangkat,
|
||||
)
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
# Every known rank spelling, longest first, so that a multi-word rank such as
# "KOMBES POL" wins over its single-word prefix "KOMBES" in the alternation.
# Equal-length tokens are ordered alphabetically so the generated pattern text
# is the same on every run (iteration order of a set of strings is not).
_RANK_TOKENS: tuple[str, ...] = tuple(
    sorted(
        {variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
        key=lambda v: (-len(v), v),
    )
)
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)

# A rank token followed by an 8-digit NRP. Between the two, only separator
# characters ('/', '-', '.', ',', ':' or whitespace) and an optional literal
# "NRP" label are allowed, so "BRIPKA/12345678", "BRIPKA 12345678" and
# "BRIPKA NRP 12345678" all match, while a rank and a number that are merely
# on the same line with other words in between do not. Matching is
# case-insensitive. The rank token must be word-bounded so "BRIPDA" doesn't
# match inside e.g. "ABRIPDA-style" text, and the trailing \b rejects digit
# runs longer than eight.
_RE_RANK_NRP_LINE = re.compile(
    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*(?:NRP[\s/.\-,:]*)?(?P<nrp>\d{{8}})\b",
    re.IGNORECASE,
)
|
||||
# Matches a line that holds nothing but a row marker such as "1." or "12)"
# (one to three digits, then '.' or ')'). OCR often emits the marker on a
# line of its own in tabular layouts.
_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")

# Upper-case prefixes of lines that must never be read as a personnel name:
# section headers, OCR garbage anchors, and column header tokens.
# fmt: off
_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = (
    # Section labels of the letter body.
    "DASAR", "PERIHAL", "PERTIMBANGAN", "DIPERINTAHKAN", "KEPADA", "UNTUK",
    "TEMBUSAN", "DIKELUARKAN", "PADA TANGGAL", "SELESAI", "DAFTAR", "LAMPIRAN",
    "NOMOR", "TANGGAL",
    # Letterhead / document-title words.
    "KEPOLISIAN", "DAERAH", "RESOR", "SEKTOR", "MABES", "SURAT PERINTAH",
    # Table column headers.
    "NRP", "NIP", "PANGKAT", "JABATAN", "NAMA", "KETERANGAN", "KET", "NO",
)
# fmt: on

# Minimum bar for a name candidate: the line contains at least one ASCII
# letter, which rules out pure-numeric and pure-symbol lines.
_RE_NAME_OK = re.compile(r"[A-Za-z]")
|
||||
|
||||
|
||||
def _is_plausible_name(line: str) -> bool:
    """Return True iff ``line`` could plausibly be a personnel name.

    A line is rejected when it

    * is blank or contains no ASCII letter (this already covers bare row
      markers such as ``"1."`` and other digit/punctuation-only lines),
    * starts with a blocklisted header word, or
    * itself contains a rank + NRP pair.

    Blocklist prefixes only count when they end on a word boundary, i.e. the
    character right after the prefix is not a letter. ``"NO"``, ``"NO."`` and
    ``"NAMA / PANGKAT"`` are still rejected, but a short prefix such as
    ``"KET"`` or ``"NO"`` no longer discards ``"KETUT ..."`` or
    ``"NOVAL ..."``.

    Args:
        line: One line of OCR text; surrounding whitespace is ignored.

    Returns:
        ``True`` when the line survives every rejection rule above.
    """
    stripped = line.strip()
    if not stripped or not _RE_NAME_OK.search(stripped):
        return False
    upper = stripped.upper()
    for prefix in _NAME_BLOCKLIST_PREFIXES:
        # The one-character slice is "" when the line *is* the prefix;
        # "".isalpha() is False, so an exact header word is rejected too.
        if upper.startswith(prefix) and not upper[len(prefix) : len(prefix) + 1].isalpha():
            return False
    return not _RE_RANK_NRP_LINE.search(stripped)
|
||||
|
||||
|
||||
def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Collect 1-3 follow-up lines after the rank+NRP line as the jabatan.

    Only the three lines after ``idx`` are inspected. Leading blank lines are
    skipped (they still count towards the three-line window). Collection
    stops at the first of:

    * a blank line once some text has been collected,
    * the next rank+NRP line,
    * a bare row-number line, or
    * a line starting with a blocklisted header word.

    A blocklisted prefix only counts when it ends on a word boundary (the
    character after it is not a letter), so a jabatan such as ``"KETUA TIM"``
    is not cut off by the short ``"KET"`` column-header prefix.

    Args:
        lines: All lines of the OCR text.
        idx: Index of the rank+NRP line whose jabatan is wanted.

    Returns:
        The collected lines joined with single spaces and with internal
        whitespace runs collapsed, or ``None`` when nothing was collected.
    """
    parts: list[str] = []
    for fwd in range(idx + 1, min(idx + 4, len(lines))):
        candidate = lines[fwd].strip()
        if not candidate:
            if parts:
                break
            continue
        if _RE_RANK_NRP_LINE.search(candidate) or _RE_ROW_NUMBER.match(candidate):
            break
        upper = candidate.upper()
        if any(
            upper.startswith(p) and not upper[len(p) : len(p) + 1].isalpha()
            for p in _NAME_BLOCKLIST_PREFIXES
        ):
            break
        parts.append(candidate)
    if not parts:
        return None
    # Every part is a non-empty stripped line, so the result is never empty.
    return " ".join(" ".join(parts).split())
|
||||
|
||||
|
||||
def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Best-effort personnel extraction from a flat OCR text stream.

    Strategy:

    1. Iterate every line. Skip lines that don't contain a known rank
       followed by an 8-digit NRP (that pair is the only signal we trust).
    2. For each rank+NRP line, look back up to five lines for the nearest
       plausible name line, and forward 1-3 lines for jabatan content.
    3. Emit a ``PersonnelEntry`` for every such line; ``nama`` and
       ``jabatan_dinas`` are ``None`` when nothing suitable was found.

    Only the first rank+NRP match on a line is used. The backwards search
    for a name never crosses the previous rank+NRP line: anything above that
    line belongs to an earlier row (or to text before the table), so a row
    without its own name line gets ``nama=None`` rather than borrowing
    unrelated text. As a consequence a name line can be used by at most one
    row.

    Args:
        raw_text: The full OCR text; it is split on line boundaries.

    Returns:
        One entry per accepted rank+NRP line, in document order. ``no``,
        ``jabatan_sprint`` and ``keterangan`` are always ``None``.
    """
    lines = raw_text.splitlines()
    rows: list[PersonnelEntry] = []

    for idx, raw_line in enumerate(lines):
        match = _RE_RANK_NRP_LINE.search(raw_line.strip())
        if not match:
            continue
        pangkat = normalize_pangkat(match.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue

        nama: str | None = None
        for back in range(idx - 1, max(idx - 6, -1), -1):
            candidate = lines[back].strip()
            if _RE_RANK_NRP_LINE.search(candidate):
                # Previous row's rank line: stop, nothing above it is ours.
                break
            if _is_plausible_name(candidate):
                nama = candidate
                break
        # NOTE(review): lines between the previous rank line and this one can
        # also include the previous row's jabatan text; that ambiguity is not
        # resolved here.

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=match.group("nrp"),
                nama=nama,
                jabatan_dinas=_following_jabatan(lines, idx),
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return rows
|
||||
|
||||
|
||||
def is_low_quality(rows: list[PersonnelEntry]) -> bool:
    """Tell whether a table-extraction result is too sparse to be trusted.

    A row counts as carrying signal when its ``pangkat`` or its ``nrp`` is
    truthy. An empty result is always low quality; otherwise the result is
    low quality when fewer than 30% of the rows carry signal, which is taken
    to mean the column mapper degraded to "everything is nama" and the
    caller should retry with the text-based fallback.

    Args:
        rows: Personnel rows produced by the primary extractor.

    Returns:
        ``True`` when the rows should be discarded in favour of a fresh
        attempt, ``False`` otherwise.
    """
    if not rows:
        return True
    with_signal = [entry for entry in rows if entry.pangkat or entry.nrp]
    total = max(1, len(rows))
    return len(with_signal) / total < 0.3
|
||||
Reference in New Issue
Block a user