Fix personnel extraction + header bugs on real Polres Cimahi sprint
This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related hardening changes (items 5-6):
1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
INDONESIA' letterhead line instead of the most-specific issuing unit
(e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
scan for each level independently and return the deepest available.
2. find_dasar_list dropped numbered items when OCR put the marker on
its own line ("1.\n Undang-Undang ..."). Refactored into
_collect_numbered_section that buffers a bare-number line and uses
the next non-empty line as the body. Also reused for the new
find_untuk_list which extracts the previously-empty 'untuk' bullets.
3. find_perihal returned None for documents that use 'Pertimbangan'
(very common in Polres-level sprint), forcing the LLM to guess.
Added a regex fallback that picks up the first line under a
'Pertimbangan' label so we keep extraction deterministic.
4. Personnel rows were emitted with only nama populated when
PP-Structure detected a table but the column mapper degraded.
Added a text-based fallback (extract_personnel_from_text) that
scans raw OCR for <rank> + <8-digit NRP> patterns. It is triggered when
the PP-Structure result is empty or fewer than 30% of its rows carry a
rank or NRP. Documents that take this path are marked with the new
PERSONNEL_TEXT_FALLBACK review flag.
5. Validation now flags rows with neither pangkat nor nrp as
INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
even when individual nrp/pangkat checks pass on empty values.
6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).
Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
@@ -22,7 +22,7 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
|
|||||||
# Bintara
|
# Bintara
|
||||||
"BRIPDA": ("BRIPDA",),
|
"BRIPDA": ("BRIPDA",),
|
||||||
"BRIPTU": ("BRIPTU",),
|
"BRIPTU": ("BRIPTU",),
|
||||||
"BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"),
|
"BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"),
|
||||||
"BRIPKA": ("BRIPKA",),
|
"BRIPKA": ("BRIPKA",),
|
||||||
"AIPDA": ("AIPDA",),
|
"AIPDA": ("AIPDA",),
|
||||||
"AIPTU": ("AIPTU",),
|
"AIPTU": ("AIPTU",),
|
||||||
|
|||||||
@@ -22,6 +22,14 @@ _FLAG_PENALTY: dict[ReviewFlag, float] = {
|
|||||||
ReviewFlag.UNKNOWN_PANGKAT: 0.05,
|
ReviewFlag.UNKNOWN_PANGKAT: 0.05,
|
||||||
ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
|
ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
|
||||||
ReviewFlag.DATE_PARSE_FAILED: 0.10,
|
ReviewFlag.DATE_PARSE_FAILED: 0.10,
|
||||||
|
# Text-based personnel fallback is a recoverable degradation: rank/NRP
|
||||||
|
# were extracted via regex from raw OCR rather than from a parsed table
|
||||||
|
# grid. Worth flagging for review but not catastrophic.
|
||||||
|
ReviewFlag.PERSONNEL_TEXT_FALLBACK: 0.05,
|
||||||
|
# An incomplete personnel row (no pangkat AND no nrp) is a strong
|
||||||
|
# signal something went wrong. Penalise heavily so the document
|
||||||
|
# routes to needs_review even if the rest of the extraction is fine.
|
||||||
|
ReviewFlag.INCOMPLETE_PERSONNEL_ROW: 0.15,
|
||||||
}
|
}
|
||||||
|
|
||||||
OCR_WEIGHT = 0.6
|
OCR_WEIGHT = 0.6
|
||||||
|
|||||||
203
src/ocr_sprint/pipeline/extract/personnel_text.py
Normal file
203
src/ocr_sprint/pipeline/extract/personnel_text.py
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
"""Text-based fallback personnel extractor.
|
||||||
|
|
||||||
|
PP-Structure (Phase 3) is the primary path for personnel rows because it
|
||||||
|
preserves the table grid. But PP-Structure can fail in two ways on real
|
||||||
|
sprint scans:
|
||||||
|
|
||||||
|
1. The table is not detected at all (low-quality scan, watermark, atypical
|
||||||
|
layout) — `extract_personnel` returns an empty list.
|
||||||
|
2. The table IS detected but the column mapping is too sparse, so each row
|
||||||
|
collapses to a single ``nama`` cell with all other fields ``None``. This
|
||||||
|
is what was observed on a real Polres Cimahi sprint where the OCR
|
||||||
|
produced 24 rows with only ``nama`` populated.
|
||||||
|
|
||||||
|
This module provides a regex/heuristic fallback that operates directly on
|
||||||
|
the flat OCR text. It is deliberately conservative: a row must have BOTH a
|
||||||
|
recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
|
||||||
|
generate the kind of "name-only" rows that motivated the fallback in the
|
||||||
|
first place.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ocr_sprint.data.master_pangkat import (
|
||||||
|
PANGKAT_VARIANTS,
|
||||||
|
is_valid_pangkat,
|
||||||
|
normalize_pangkat,
|
||||||
|
)
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
# Build a single alternation of all known rank tokens (longest first so multi-
|
||||||
|
# word ranks like "KOMBES POL" win over the single-word "KOMBES").
|
||||||
|
_RANK_TOKENS: tuple[str, ...] = tuple(
|
||||||
|
sorted(
|
||||||
|
{variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
|
||||||
|
key=lambda v: -len(v),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
|
||||||
|
# A rank token immediately followed by an 8-digit NRP. Only separator
|
||||||
|
# characters ('/', '-', '.', ',', ':' or whitespace) may sit between the
|
||||||
|
# two. Rank token must be word-bounded so "BRIPDA" doesn't match
|
||||||
|
# inside e.g. "ABRIPDA-style" text.
|
||||||
|
_RE_RANK_NRP_LINE = re.compile(
|
||||||
|
rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
# A bare row number marker like "1." or "12)". OCR often puts it on its own
|
||||||
|
# line in tabular layouts.
|
||||||
|
_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
|
||||||
|
# Lines that should never be interpreted as a personnel name. These are
|
||||||
|
# section headers, OCR garbage anchors, and column header tokens.
|
||||||
|
_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = (
|
||||||
|
"DASAR",
|
||||||
|
"PERIHAL",
|
||||||
|
"PERTIMBANGAN",
|
||||||
|
"DIPERINTAHKAN",
|
||||||
|
"KEPADA",
|
||||||
|
"UNTUK",
|
||||||
|
"TEMBUSAN",
|
||||||
|
"DIKELUARKAN",
|
||||||
|
"PADA TANGGAL",
|
||||||
|
"SELESAI",
|
||||||
|
"DAFTAR",
|
||||||
|
"LAMPIRAN",
|
||||||
|
"NOMOR",
|
||||||
|
"TANGGAL",
|
||||||
|
"KEPOLISIAN",
|
||||||
|
"DAERAH",
|
||||||
|
"RESOR",
|
||||||
|
"SEKTOR",
|
||||||
|
"MABES",
|
||||||
|
"SURAT PERINTAH",
|
||||||
|
"NRP",
|
||||||
|
"NIP",
|
||||||
|
"PANGKAT",
|
||||||
|
"JABATAN",
|
||||||
|
"NAMA",
|
||||||
|
"KETERANGAN",
|
||||||
|
"KET",
|
||||||
|
"NO",
|
||||||
|
)
|
||||||
|
# A name should look like a name: mostly letters, common punctuation, and
|
||||||
|
# at least one alphabetic character. Pure-numeric or pure-symbol lines are
|
||||||
|
# rejected.
|
||||||
|
_RE_NAME_OK = re.compile(r"[A-Za-z]")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_plausible_name(line: str) -> bool:
    """Return True iff ``line`` could plausibly be a personnel name.

    A candidate is rejected when it is empty, contains no ASCII letter,
    opens with one of ``_NAME_BLOCKLIST_PREFIXES`` (section headers and
    column headers), or itself contains a rank + NRP pair.

    Short blocklist tokens (four characters or fewer, e.g. ``NO``, ``KET``,
    ``NAMA``) only count when they end at a word boundary. Without that rule
    ordinary names such as "NOVI HANDAYANI" or "KETUT SUDANA" were rejected
    because they merely *start with* a column-header token. Longer keywords
    keep plain prefix semantics so OCR-fused variants ("RESORT" for "RESOR")
    are still caught.
    """
    stripped = line.strip()
    # Requiring a letter also rules out bare row numbers ("1.", "2)") and
    # pure punctuation, so no separate numeric check is needed afterwards.
    if not stripped or not _RE_NAME_OK.search(stripped):
        return False

    bounded_max_len = 4
    upper = stripped.upper()
    for prefix in _NAME_BLOCKLIST_PREFIXES:
        if not upper.startswith(prefix):
            continue
        if len(prefix) > bounded_max_len:
            return False
        # Short token: a header only if the next character is not a letter
        # ("NO", "NO.", "KET :") - otherwise it is the start of a word.
        next_char = upper[len(prefix) : len(prefix) + 1]
        if not next_char.isalpha():
            return False

    # A line that carries its own rank + NRP is a data row, not a name.
    return not _RE_RANK_NRP_LINE.search(stripped)
|
||||||
|
|
||||||
|
|
||||||
|
def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Collect up to three follow-up lines after a rank+NRP line as the jabatan.

    Args:
        lines: All OCR lines of the document.
        idx: Index of the rank+NRP line the jabatan belongs to.

    Returns:
        The collected lines joined with single spaces, or ``None`` when no
        usable line follows.

    Collection stops at the next rank+NRP line, the next bare row-number
    line, a blank line once something has been collected, or a line opening
    with a blocked header keyword. Short header tokens (four characters or
    fewer, e.g. ``NO``, ``KET``) must end at a word boundary to count as a
    header, so a jabatan such as "KETUA TIM" is no longer mistaken for the
    ``KET`` column header.
    """

    def _opens_with_header(upper: str) -> bool:
        """Return True when ``upper`` starts with a section/column header."""
        for prefix in _NAME_BLOCKLIST_PREFIXES:
            if not upper.startswith(prefix):
                continue
            if len(prefix) > 4:
                return True
            if not upper[len(prefix) : len(prefix) + 1].isalpha():
                return True
        return False

    parts: list[str] = []
    for raw in lines[idx + 1 : idx + 4]:
        candidate = raw.strip()
        if not candidate:
            # Leading blanks are skipped; a blank after content ends the cell.
            if parts:
                break
            continue
        if _RE_RANK_NRP_LINE.search(candidate) or _RE_ROW_NUMBER.match(candidate):
            break
        if _opens_with_header(candidate.upper()):
            break
        parts.append(candidate)

    if not parts:
        return None
    # Collapse internal whitespace runs left over from OCR.
    return " ".join(" ".join(parts).split()) or None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Best-effort personnel extraction from a flat OCR text stream.

    Strategy:

    1. Iterate every line. Skip lines that don't contain a known rank
       followed by an 8-digit NRP (the only signal we trust).
    2. For each rank+NRP line, look back at most five lines for the nearest
       plausible name line, and forward up to three lines for jabatan text.
    3. Emit a ``PersonnelEntry`` carrying pangkat + nrp; ``nama`` and
       ``jabatan_dinas`` are ``None`` when nothing suitable is found.

    The extractor is deliberately conservative: only the first rank+NRP
    match on a line is used, a name line can be consumed by one row only,
    and the name look-back never crosses an earlier rank+NRP line - anything
    beyond that line belongs to the previous person, so using it would attach
    another row's text to this one.

    Args:
        raw_text: The flat OCR text of the whole document.

    Returns:
        One entry per accepted rank+NRP line, in document order.
    """
    lines = raw_text.splitlines()
    consumed_names: set[int] = set()
    rows: list[PersonnelEntry] = []

    for idx, raw_line in enumerate(lines):
        match = _RE_RANK_NRP_LINE.search(raw_line.strip())
        if not match:
            continue
        pangkat = normalize_pangkat(match.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue

        nama: str | None = None
        for back in range(idx - 1, max(idx - 6, -1), -1):
            candidate = lines[back].strip()
            if _RE_RANK_NRP_LINE.search(candidate):
                # Previous person's data row: stop instead of borrowing text
                # that sits on the far side of it.
                break
            if back in consumed_names:
                continue
            if _is_plausible_name(candidate):
                nama = candidate
                consumed_names.add(back)
                break

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=match.group("nrp"),
                nama=nama,
                jabatan_dinas=_following_jabatan(lines, idx),
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
def is_low_quality(rows: list[PersonnelEntry], *, threshold: float = 0.3) -> bool:
    """Heuristic: did PP-Structure produce useless rows?

    A row is useful when it has at least pangkat OR nrp. If most rows have
    only ``nama`` (or worse, nothing) the table extraction failed and the
    caller should retry with the text-based fallback.

    Args:
        rows: Personnel rows produced by the structured (table) path.
        threshold: Minimum fraction of useful rows required for the result
            to count as usable. Defaults to 0.3 (30%).

    Returns:
        ``True`` when ``rows`` is empty or the useful fraction is strictly
        below ``threshold``.
    """
    if not rows:
        return True
    useful = sum(1 for row in rows if row.pangkat or row.nrp)
    # ``rows`` is non-empty here, so the division is safe.
    return useful / len(rows) < threshold
|
||||||
@@ -53,19 +53,52 @@ _RE_TANGGAL_ID = re.compile(
|
|||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Satuan penerbit usually appears in the document letterhead, prefixed by
|
# Polri letterhead pieces. The full letterhead spans multiple lines that are
|
||||||
# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
|
# often broken across separate OCR rows like:
|
||||||
_RE_SATUAN = re.compile(
|
#
|
||||||
r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
|
# KEPOLISIAN NEGARA REPUBLIK INDONESIA
|
||||||
r"[^\n]{0,80}",
|
# DAERAH JAWA BARAT
|
||||||
|
# RESOR CIMAHI
|
||||||
|
#
|
||||||
|
# We capture each individual level so we can reconstruct the most-specific
|
||||||
|
# unit (RESOR / SEKTOR > DAERAH > NEGARA) — a downstream consumer cares
|
||||||
|
# about *which* unit issued the sprint, not just that some Polri unit did.
|
||||||
|
_RE_LEVEL_NEGARA = re.compile(
|
||||||
|
r"KEPOLISIAN\s+NEGARA\s+REPUBLIK\s+INDONESIA",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
_RE_LEVEL_DAERAH = re.compile(
|
||||||
|
r"(?:KEPOLISIAN\s+)?DAERAH\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
|
||||||
|
re.IGNORECASE | re.MULTILINE,
|
||||||
|
)
|
||||||
|
_RE_LEVEL_RESOR = re.compile(
|
||||||
|
r"(?:KEPOLISIAN\s+)?RESORT?\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
|
||||||
|
re.IGNORECASE | re.MULTILINE,
|
||||||
|
)
|
||||||
|
_RE_LEVEL_SEKTOR = re.compile(
|
||||||
|
r"(?:KEPOLISIAN\s+)?SEKTOR\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
|
||||||
|
re.IGNORECASE | re.MULTILINE,
|
||||||
|
)
|
||||||
|
_RE_LEVEL_MABES = re.compile(r"MABES\s+POLRI\b", re.IGNORECASE)
|
||||||
|
|
||||||
# "Perihal : ...." up to end of line.
|
# "Perihal : ...." up to end of line.
|
||||||
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
|
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
|
||||||
|
# Many sprint docs (especially Polres-level) use 'Pertimbangan' as the
|
||||||
|
# single-paragraph rationale block instead of (or alongside) 'Perihal'.
|
||||||
|
# When `perihal` is missing we fall back to the first non-empty line under
|
||||||
|
# 'Pertimbangan :' so the LLM doesn't have to guess and so a downstream
|
||||||
|
# audit trail still has *something* in the perihal slot.
|
||||||
|
_RE_PERTIMBANGAN_LABEL = re.compile(r"^\s*PERTIMBANGAN\b", re.IGNORECASE)
|
||||||
|
|
||||||
# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
|
# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
|
||||||
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
|
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
|
||||||
|
# OCR sometimes splits the number from its content across two lines:
|
||||||
|
# 1.
|
||||||
|
# Undang-Undang Nomor 2 Tahun 2002 ...
|
||||||
|
# We detect a bare-number line and merge with the next non-empty line.
|
||||||
|
_RE_DASAR_BARE_NUMBER = re.compile(r"^\s*(\d+)\s*[.)]\s*$")
|
||||||
|
# Generic 'untuk' bullet — same shape as a dasar item.
|
||||||
|
_RE_UNTUK_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
|
||||||
|
|
||||||
# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
|
# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
|
||||||
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
|
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
|
||||||
@@ -99,54 +132,159 @@ def find_tanggal(text: str) -> date | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_unit_tail(tail: str) -> str:
|
||||||
|
"""Strip trailing punctuation/noise from the captured place name."""
|
||||||
|
return " ".join(tail.split()).strip(" .,;:'\"")
|
||||||
|
|
||||||
|
|
||||||
def find_satuan(text: str) -> str | None:
    """Return the issuing unit, preferring the most-specific letterhead level.

    Polri letterheads are hierarchical (Negara > Daerah > Resor/Sektor). The
    actual *issuing* unit is the deepest level present in the letterhead, not
    the topmost generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' line. Each
    level is searched independently and the most specific one wins; if only
    the generic Negara line is present, that is returned.

    Only the document head is scanned: at most the first 25 lines, cut short
    at the 'SURAT PERINTAH' title when one is found. Because deeper levels
    take precedence, a body sentence mentioning some Sektor or Resor would
    otherwise outrank the real letterhead.

    Args:
        text: Full OCR text of the document.

    Returns:
        The normalized unit name, or ``None`` when no letterhead level is
        recognised.

    Examples
    --------
    >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA\\n"
    ...             "DAERAH JAWA BARAT\\nRESOR CIMAHI")
    'KEPOLISIAN RESOR CIMAHI'
    >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
    'KEPOLISIAN NEGARA REPUBLIK INDONESIA'
    """
    window = text.splitlines()[:25]
    head_lines: list[str] = []
    for raw_line in window:
        if raw_line.strip().upper().startswith("SURAT PERINTAH"):
            break
        head_lines.append(raw_line)
    # If the title is the very first content, keep the whole window rather
    # than searching an empty head.
    if not any(candidate.strip() for candidate in head_lines):
        head_lines = window
    head = "\n".join(head_lines)

    # Most specific level first.
    levels = (
        ("SEKTOR", _RE_LEVEL_SEKTOR),
        ("RESOR", _RE_LEVEL_RESOR),
        ("DAERAH", _RE_LEVEL_DAERAH),
    )
    for label, pattern in levels:
        found = pattern.search(head)
        if found:
            return f"KEPOLISIAN {label} {_clean_unit_tail(found.group(1))}"
    if _RE_LEVEL_MABES.search(head):
        return "MABES POLRI"
    if _RE_LEVEL_NEGARA.search(head):
        return "KEPOLISIAN NEGARA REPUBLIK INDONESIA"
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def find_perihal(text: str) -> str | None:
    """Return the first 'Perihal: ...' line, trimmed to that line only.

    Falls back to the 'Pertimbangan' block (a common variant in Polres-level
    sprint documents that have no distinct 'Perihal' field). This stays in
    regex-land rather than deferring to the LLM because the LLM tends to
    hallucinate perihal content from arbitrary paragraphs.

    For the fallback, text on the label line itself
    (``"Pertimbangan : bahwa ..."``) is preferred; only when the label stands
    alone is the first non-empty line among the next four used. Previously
    the same-line text was ignored and the wrapped continuation line was
    returned instead.

    Args:
        text: Full OCR text of the document.

    Returns:
        The perihal text, or ``None`` when neither label yields content.
    """
    lines = text.splitlines()
    for line in lines:
        m = _RE_PERIHAL.search(line)
        if m:
            return m.group(1).strip()

    for idx, line in enumerate(lines):
        label = _RE_PERTIMBANGAN_LABEL.match(line)
        if not label:
            continue
        inline = line[label.end() :].strip(" :\t")
        if inline:
            return inline
        for follow in lines[idx + 1 : idx + 5]:
            stripped = follow.strip(" :\t")
            if stripped:
                return stripped
        # Only the first 'Pertimbangan' label is considered.
        break
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_numbered_section(
    lines: list[str],
    start_idx: int,
    terminators: tuple[str, ...],
) -> list[str]:
    """Gather the numbered list items found from ``start_idx`` onwards.

    Three line shapes are recognised:

    * ``"1. body"`` - starts a new item holding ``body``.
    * ``"1."`` alone - OCR split the marker from its text; the next
      non-empty line becomes the item body.
    * anything else - appended to the current item as a wrapped
      continuation (ignored when no item has started yet).

    Scanning ends at the first line whose upper-cased form starts with one
    of ``terminators``, or at the second consecutive blank line once at
    least one item exists and no marker is waiting for its body. A single
    blank line is tolerated because OCR sprinkles them.
    """
    stop_words = tuple(terminators)
    collected: list[str] = []
    awaiting_body = False
    blanks_seen = 0

    for text in map(str.strip, lines[start_idx:]):
        if text.upper().startswith(stop_words):
            break

        if text == "":
            blanks_seen += 1
            if blanks_seen >= 2 and collected and not awaiting_body:
                break
            continue
        blanks_seen = 0

        if _RE_DASAR_BARE_NUMBER.match(text):
            awaiting_body = True
            continue

        numbered = _RE_DASAR_ITEM.match(text)
        if numbered or awaiting_body:
            body = numbered.group(2).strip() if numbered else text
            collected.append(body)
            awaiting_body = False
        elif collected:
            collected[-1] = f"{collected[-1]} {text}".strip()

    return collected
|
||||||
|
|
||||||
|
|
||||||
def _find_labelled_list(text: str, label: str, terminators: tuple[str, ...]) -> list[str]:
    """Collect the numbered items that follow the first line opening with ``label``.

    Text that shares the header line (``"Dasar : 1. Undang-Undang ..."``) is
    fed to the collector as the first line, so the first item is not lost
    when the label and the item are printed on the same row.
    """
    header = re.compile(rf"^\s*{re.escape(label)}\b[\s:.\-]*", re.IGNORECASE)
    lines = text.splitlines()
    for idx, raw_line in enumerate(lines):
        found = header.match(raw_line)
        if not found:
            continue
        following = lines[idx + 1 :]
        inline = raw_line[found.end() :].strip()
        if inline:
            following = [inline, *following]
        return _collect_numbered_section(following, 0, terminators)
    return []


def find_dasar_list(text: str) -> list[str]:
    """Extract numbered 'Dasar' items from the text.

    Heuristic: locate the first line opening with 'DASAR' (Indonesian:
    "DASAR :") and delegate to ``_collect_numbered_section``, which handles
    three OCR artefacts:

    1. Inline numbered items: ``"1. Undang-Undang ..."``.
    2. Bare-number lines (the OCR engine puts the number alone on a line):
       ``"1.\\n Undang-Undang ..."``.
    3. Continuation lines (a line that is the wrapped tail of the previous
       item gets appended back onto it).

    An item printed on the header line itself (``"Dasar : 1. ..."``) is
    included as well.

    Returns:
        The item bodies in document order, or ``[]`` when no 'Dasar' header
        is found.
    """
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
    return _find_labelled_list(text, "DASAR", section_terminators)


def find_untuk_list(text: str) -> list[str]:
    """Extract numbered 'Untuk' bullets (the assigned tasks) from the text.

    The section has the same OCR shape as 'Dasar', so the same collector is
    reused with different stop conditions: 'Selesai' (boilerplate),
    'Dikeluarkan di' / 'Pada tanggal' (signature block) and 'Tembusan'
    (carbon-copy section). A task printed on the 'Untuk' header line itself
    is included.

    Returns:
        The task bodies in document order, or ``[]`` when no 'Untuk' header
        is found.
    """
    terminators = ("SELESAI", "DIKELUARKAN", "TEMBUSAN", "PADA TANGGAL")
    return _find_labelled_list(text, "UNTUK", terminators)
|
||||||
|
|
||||||
|
|
||||||
def find_signatory(text: str) -> Signatory:
|
def find_signatory(text: str) -> Signatory:
|
||||||
|
|||||||
@@ -30,6 +30,13 @@ def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
|
|||||||
flags.append(ReviewFlag.INVALID_NRP)
|
flags.append(ReviewFlag.INVALID_NRP)
|
||||||
if entry.pangkat and not is_valid_pangkat(entry.pangkat):
|
if entry.pangkat and not is_valid_pangkat(entry.pangkat):
|
||||||
flags.append(ReviewFlag.UNKNOWN_PANGKAT)
|
flags.append(ReviewFlag.UNKNOWN_PANGKAT)
|
||||||
|
# Identification of a personnel row requires at least pangkat OR nrp.
|
||||||
|
# A row carrying only a name is structurally incomplete - likely a
|
||||||
|
# mis-aligned table cell or a leaked tembusan/dasar fragment - and must
|
||||||
|
# be flagged for human review even though pangkat/nrp validation
|
||||||
|
# individually pass (because they're empty).
|
||||||
|
if not entry.pangkat and not entry.nrp:
|
||||||
|
flags.append(ReviewFlag.INCOMPLETE_PERSONNEL_ROW)
|
||||||
return flags
|
return flags
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,15 @@ from ocr_sprint.llm.extractor import llm_fill_header
|
|||||||
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||||
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
|
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
|
||||||
from ocr_sprint.pipeline.extract.personnel import extract_personnel
|
from ocr_sprint.pipeline.extract.personnel import extract_personnel
|
||||||
from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
|
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||||
|
extract_personnel_from_text,
|
||||||
|
is_low_quality,
|
||||||
|
)
|
||||||
|
from ocr_sprint.pipeline.extract.regex_rules import (
|
||||||
|
extract_header,
|
||||||
|
find_signatory,
|
||||||
|
find_untuk_list,
|
||||||
|
)
|
||||||
from ocr_sprint.pipeline.extract.validators import validate_extraction
|
from ocr_sprint.pipeline.extract.validators import validate_extraction
|
||||||
from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest
|
from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest
|
||||||
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
|
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
|
||||||
@@ -112,6 +120,7 @@ def run_pipeline(content: bytes) -> PipelineOutput:
|
|||||||
header = merged
|
header = merged
|
||||||
|
|
||||||
personel: list[PersonnelEntry] = []
|
personel: list[PersonnelEntry] = []
|
||||||
|
table_flags: list[ReviewFlag] = []
|
||||||
if s.tables_enabled and cleaned_pages:
|
if s.tables_enabled and cleaned_pages:
|
||||||
all_tables: list[DetectedTable] = []
|
all_tables: list[DetectedTable] = []
|
||||||
for img in cleaned_pages:
|
for img in cleaned_pages:
|
||||||
@@ -126,14 +135,33 @@ def run_pipeline(content: bytes) -> PipelineOutput:
|
|||||||
personel_rows=len(personel),
|
personel_rows=len(personel),
|
||||||
)
|
)
|
||||||
|
|
||||||
initial_flags: list[ReviewFlag] = list(llm_flags)
|
# Text-based fallback: PP-Structure can succeed structurally but emit
|
||||||
|
# rows with only ``nama`` populated (column mapper degraded), or fail to
|
||||||
|
# detect the table at all. In both cases the regex fallback that scans
|
||||||
|
# raw OCR for rank+NRP pairs produces a much more useful result. We
|
||||||
|
# always run it when the structured path is empty or low-quality, and
|
||||||
|
# raise a review flag so the operator knows the document didn't go
|
||||||
|
# through the preferred path.
|
||||||
|
if is_low_quality(personel):
|
||||||
|
fallback_rows = extract_personnel_from_text(full_text)
|
||||||
|
if fallback_rows:
|
||||||
|
personel = fallback_rows
|
||||||
|
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
|
||||||
|
_logger.info(
|
||||||
|
"pipeline.personnel_text_fallback",
|
||||||
|
fallback_rows=len(fallback_rows),
|
||||||
|
)
|
||||||
|
|
||||||
|
untuk_items = find_untuk_list(full_text)
|
||||||
|
|
||||||
|
initial_flags: list[ReviewFlag] = list(llm_flags) + list(table_flags)
|
||||||
if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
|
if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
|
||||||
initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
|
initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
|
||||||
|
|
||||||
result = ExtractionResult(
|
result = ExtractionResult(
|
||||||
header=header,
|
header=header,
|
||||||
personel=personel,
|
personel=personel,
|
||||||
untuk=[],
|
untuk=untuk_items,
|
||||||
ttd=ttd,
|
ttd=ttd,
|
||||||
raw_text=full_text,
|
raw_text=full_text,
|
||||||
confidence=mean_ocr_conf,
|
confidence=mean_ocr_conf,
|
||||||
|
|||||||
@@ -21,6 +21,8 @@ class ReviewFlag(str, Enum):
|
|||||||
DATE_PARSE_FAILED = "date_parse_failed"
|
DATE_PARSE_FAILED = "date_parse_failed"
|
||||||
LLM_FALLBACK = "llm_fallback"
|
LLM_FALLBACK = "llm_fallback"
|
||||||
LLM_UNAVAILABLE = "llm_unavailable"
|
LLM_UNAVAILABLE = "llm_unavailable"
|
||||||
|
PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
|
||||||
|
INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"
|
||||||
|
|
||||||
|
|
||||||
class Signatory(BaseModel):
|
class Signatory(BaseModel):
|
||||||
|
|||||||
@@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none(
|
|||||||
out = run_pipeline(b"%PDF-1.4\n%fake")
|
out = run_pipeline(b"%PDF-1.4\n%fake")
|
||||||
assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
|
assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
|
||||||
assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
|
assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
|
||||||
|
|
||||||
|
|
||||||
|
def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Name-only PP-Structure rows must be replaced by the text fallback.

    The structured path is stubbed to return rows with only ``nama`` set
    (the failure seen on the real Polres Cimahi document). The orchestrator
    is expected to re-extract from the raw OCR text and raise the
    ``personnel_text_fallback`` review flag.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    get_settings.cache_clear()

    from ocr_sprint.schemas.personnel import PersonnelEntry

    ocr_text = (
        "DAFTAR PERSONIL\n"
        "1.\n"
        "SRI WAHYUNI\n"
        "AIPTU / 75070328\n"
        "INTELKAM POLRES CIMAHI\n"
        "2.\n"
        "AGUNG LUKMAN\n"
        "BRIPTU / 99030245\n"
        "SAT INTELKAM\n"
    )
    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres Cimahi",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    _stub_pipeline_stages(monkeypatch, raw_text=ocr_text, regex_header=header)

    # Structured extraction "succeeds" but yields rows without rank or NRP.
    name_only_rows = [
        PersonnelEntry(nama="SRI WAHYUNI"),
        PersonnelEntry(nama="AGUNG LUKMAN"),
    ]

    def _broken_extract(_tables):
        return name_only_rows

    monkeypatch.setattr(orch_module, "extract_personnel", _broken_extract)

    result = run_pipeline(b"%PDF-1.4\n%fake").result

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in result.review_flags
    # Every fallback row must carry both pangkat and nrp.
    for row in result.personel:
        assert row.pangkat and row.nrp
    ranks = {row.pangkat for row in result.personel}
    assert ranks == {"AIPTU", "BRIPTU"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Healthy PP-Structure rows must pass through untouched.

    When the structured path returns rows that all carry rank + NRP, the
    text fallback must not run: no fallback flag, and the NRPs come back in
    the order the structured path produced them.
    """
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings

    get_settings.cache_clear()

    from ocr_sprint.schemas.personnel import PersonnelEntry

    good_rows = [
        PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
        PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
        PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
    ]
    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="Polres X",
        perihal="ok",
        dasar=["UU 2/2002"],
    )
    _stub_pipeline_stages(
        monkeypatch,
        raw_text="ignored — should not be parsed",
        regex_header=header,
    )

    def _healthy_extract(_tables):
        return good_rows

    monkeypatch.setattr(orch_module, "extract_personnel", _healthy_extract)

    result = run_pipeline(b"%PDF-1.4\n%fake").result

    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in result.review_flags
    returned_nrps = [row.nrp for row in result.personel]
    assert returned_nrps == ["11111111", "22222222", "33333333"]
|
||||||
|
|||||||
118
tests/unit/test_personnel_text_fallback.py
Normal file
118
tests/unit/test_personnel_text_fallback.py
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
"""Tests for the text-based personnel fallback extractor.
|
||||||
|
|
||||||
|
Driven by the real Polres Cimahi sprint document where PP-Structure
|
||||||
|
produced 24 rows with only ``nama`` populated. The fallback should
|
||||||
|
recover at least the rank + NRP for every row.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||||
|
extract_personnel_from_text,
|
||||||
|
is_low_quality,
|
||||||
|
)
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
_CIMAHI_FIXTURE = """\
|
||||||
|
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
|
||||||
|
NO
|
||||||
|
NAMA
|
||||||
|
PANGKAT / NRP
|
||||||
|
JABATAN
|
||||||
|
KET
|
||||||
|
BAUR SKCK SAT
|
||||||
|
1.
|
||||||
|
SRI WAHYUNI
|
||||||
|
AIPTU / 75070328
|
||||||
|
INTELKAM POLRES
|
||||||
|
CIMAHI
|
||||||
|
BA PELAKSANA SKCK
|
||||||
|
2.
|
||||||
|
CITRA DWI PUTRI R
|
||||||
|
BRIPTU / 95070659
|
||||||
|
SAT INTELKAM
|
||||||
|
POLRES CIMAHI
|
||||||
|
BA PELAKSANA SKCK
|
||||||
|
3.
|
||||||
|
AGUNG LUKMAN AL
|
||||||
|
BRIPTU / 99030245
|
||||||
|
SAT INTELKAM
|
||||||
|
POLRES CIMAHI
|
||||||
|
BA POLSEK
|
||||||
|
8.
|
||||||
|
ARIEF SYAHRUL ZAMAN
|
||||||
|
BRIGPOL /96030446
|
||||||
|
MARGAASIH
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractPersonnelFromText:
    """Checks ``extract_personnel_from_text`` against the Cimahi OCR fixture
    and against inputs that must not yield any row.
    """

    def test_extracts_rank_nrp_and_name(self) -> None:
        parsed = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(parsed) == 4
        head = parsed[0]
        assert (head.pangkat, head.nrp, head.nama) == (
            "AIPTU",
            "75070328",
            "SRI WAHYUNI",
        )

    def test_normalizes_brigpol_to_brigadir(self) -> None:
        parsed = extract_personnel_from_text(_CIMAHI_FIXTURE)
        tail = parsed[-1]
        # The fixture spells the rank 'BRIGPOL' (and omits the space after
        # the slash); the extractor must report the canonical 'BRIGADIR'.
        assert (tail.pangkat, tail.nrp, tail.nama) == (
            "BRIGADIR",
            "96030446",
            "ARIEF SYAHRUL ZAMAN",
        )

    def test_skips_header_lines_as_names(self) -> None:
        # Column-header words must never be emitted as a person's name.
        parsed = extract_personnel_from_text(_CIMAHI_FIXTURE)
        extracted_names = [entry.nama for entry in parsed]
        header_words = {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}
        assert all(word not in extracted_names for word in header_words)

    def test_jabatan_collected_from_following_lines(self) -> None:
        head = extract_personnel_from_text(_CIMAHI_FIXTURE)[0]
        assert head.jabatan_dinas is not None
        assert "INTELKAM" in head.jabatan_dinas

    def test_empty_text_returns_empty(self) -> None:
        assert extract_personnel_from_text("") == []

    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        prose_only = "Just a paragraph with no rank or NRP at all.\nAnother line."
        assert extract_personnel_from_text(prose_only) == []

    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        # An 8-digit number with no recognised rank beside it is not a row.
        lone_number = "Some line\n12345678\nanother line"
        assert extract_personnel_from_text(lone_number) == []

    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        # A rank-looking token that is not in the master list is not a row.
        bogus_rank = "Some line\nFAKERANK / 12345678\nanother line"
        assert extract_personnel_from_text(bogus_rank) == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsLowQuality:
    """Checks the ``is_low_quality`` verdict for empty, name-only, fully
    populated and exactly-30%-populated row lists.
    """

    def test_empty_list_is_low_quality(self) -> None:
        assert is_low_quality([]) is True

    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        name_only = []
        for i in range(10):
            name_only.append(PersonnelEntry(nama=f"NAMA {i}"))
        assert is_low_quality(name_only) is True

    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        complete = []
        for i in range(10):
            complete.append(
                PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
            )
        assert is_low_quality(complete) is False

    def test_borderline_30_percent_threshold(self) -> None:
        # 3 populated rows out of 10 is exactly 0.3; that ratio must count
        # as not-low-quality.
        mixed = [
            PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
            for i in range(3)
        ]
        mixed.extend(PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7))
        assert is_low_quality(mixed) is False
|
||||||
@@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import (
|
|||||||
find_satuan,
|
find_satuan,
|
||||||
find_signatory,
|
find_signatory,
|
||||||
find_tanggal,
|
find_tanggal,
|
||||||
|
find_untuk_list,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -60,6 +61,36 @@ class TestSatuan:
|
|||||||
result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
|
result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
|
||||||
assert result is not None
|
assert result is not None
|
||||||
|
|
||||||
|
def test_prefers_resor_over_negara_when_both_present(self) -> None:
    """With NEGARA, DAERAH and RESOR lines present, the RESOR unit is returned.

    The letterhead lists units from most generic to most specific; the
    expected result is the deepest one, not the leading "NEGARA" line.
    """
    letterhead = "".join(
        (
            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n",
            "DAERAH JAWA BARAT\n",
            "RESOR CIMAHI\n",
            "SURAT PERINTAH\n",
        )
    )
    assert find_satuan(letterhead) == "KEPOLISIAN RESOR CIMAHI"
|
||||||
|
|
||||||
|
def test_prefers_sektor_over_resor(self) -> None:
    """With both RESOR and SEKTOR lines present, the SEKTOR unit is returned."""
    letterhead = "".join(
        (
            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n",
            "DAERAH JAWA BARAT\n",
            "RESOR CIMAHI\n",
            "SEKTOR PADALARANG\n",
        )
    )
    assert find_satuan(letterhead) == "KEPOLISIAN SEKTOR PADALARANG"
|
||||||
|
|
||||||
|
def test_handles_daerah_only(self) -> None:
    """With only NEGARA and DAERAH lines present, the DAERAH unit is returned."""
    letterhead = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n"
    found = find_satuan(letterhead)
    assert found == "KEPOLISIAN DAERAH JAWA BARAT"
|
||||||
|
|
||||||
|
def test_returns_none_when_no_letterhead(self) -> None:
    """Text without any letterhead line yields ``None``."""
    found = find_satuan("no police letterhead here")
    assert found is None
|
||||||
|
|
||||||
|
|
||||||
class TestPerihal:
|
class TestPerihal:
|
||||||
def test_extracts_perihal_line(self) -> None:
|
def test_extracts_perihal_line(self) -> None:
|
||||||
@@ -69,6 +100,25 @@ class TestPerihal:
|
|||||||
def test_returns_none_when_absent(self) -> None:
|
def test_returns_none_when_absent(self) -> None:
|
||||||
assert find_perihal("no perihal field") is None
|
assert find_perihal("no perihal field") is None
|
||||||
|
|
||||||
|
def test_falls_back_to_pertimbangan_block(self) -> None:
    """A document labelled "Pertimbangan" instead of "Perihal" still yields
    a perihal: the first non-empty line under that label.
    """
    document = "".join(
        (
            "Pertimbangan\n",
            "Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n",
            "DASAR :\n",
            "1. ...\n",
        )
    )
    perihal = find_perihal(document)
    assert perihal is not None
    assert perihal.startswith("Bahwa dalam rangka mendukung")
|
||||||
|
|
||||||
|
def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None:
    """An explicit PERIHAL label takes precedence over a Pertimbangan paragraph."""
    document = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n"
    perihal = find_perihal(document)
    assert perihal == "The actual perihal."
|
||||||
|
|
||||||
|
|
||||||
class TestDasar:
|
class TestDasar:
|
||||||
def test_numbered_list(self) -> None:
|
def test_numbered_list(self) -> None:
|
||||||
@@ -88,6 +138,57 @@ class TestDasar:
|
|||||||
def test_empty_when_section_missing(self) -> None:
|
def test_empty_when_section_missing(self) -> None:
|
||||||
assert find_dasar_list("no dasar section") == []
|
assert find_dasar_list("no dasar section") == []
|
||||||
|
|
||||||
|
def test_handles_bare_number_lines_split_by_ocr(self) -> None:
    """A number marker on its own line is merged with the body line after it.

    OCR sometimes emits "1." alone and the item text on the next non-empty
    line. Each such pair must become one item of its own; the body must
    neither be dropped nor glued onto the previous item.
    """
    ocr_lines = (
        "Dasar\n",
        ":\n",
        "1.\n",
        " Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n",
        "2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n",
        "3.\n",
        "Keterangan Catatan Kepolisian (SKCK);\n",
        "4.\n",
        "Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n",
        "5. DIPA Petikan Satker Polres Cimahi.\n",
        "DIPERINTAHKAN\n",
    )
    items = find_dasar_list("".join(ocr_lines))
    assert len(items) == 5
    expected_prefixes = (
        (0, "Undang - Undang"),
        (2, "Keterangan Catatan"),
        (3, "Pelayanan dilingkungan"),
        (4, "DIPA"),
    )
    for position, prefix in expected_prefixes:
        assert items[position].startswith(prefix)
|
||||||
|
|
||||||
|
|
||||||
|
class TestUntuk:
    """Checks ``find_untuk_list``: bullet extraction, the missing-section
    case, and stopping at the "Dikeluarkan" line.
    """

    def test_extracts_numbered_untuk_bullets(self) -> None:
        ocr_lines = (
            "DIPERINTAHKAN\n",
            "Kepada\n",
            "Untuk\n",
            "1.\n",
            "melaksanakan tugas A;\n",
            "2.\n",
            "melaksanakan tugas B;\n",
            "Selesai.\n",
        )
        bullets = find_untuk_list("".join(ocr_lines))
        assert len(bullets) == 2
        first, second = bullets[0], bullets[1]
        assert first == "melaksanakan tugas A;"
        assert second == "melaksanakan tugas B;"

    def test_returns_empty_when_section_missing(self) -> None:
        bullets = find_untuk_list("no untuk section")
        assert bullets == []

    def test_stops_at_dikeluarkan(self) -> None:
        # The numbered line after "Dikeluarkan di ..." must not be captured.
        document = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. should not be captured\n"
        assert find_untuk_list(document) == ["tugas A;"]
|
||||||
|
|
||||||
|
|
||||||
class TestSignatory:
|
class TestSignatory:
|
||||||
def test_extracts_last_nrp(self) -> None:
|
def test_extracts_last_nrp(self) -> None:
|
||||||
|
|||||||
@@ -62,6 +62,20 @@ class TestPersonnelValidator:
|
|||||||
entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
|
entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
|
||||||
assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)
|
assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)
|
||||||
|
|
||||||
|
def test_row_with_only_name_is_flagged_incomplete(self) -> None:
    """A row with ``nama`` but neither pangkat nor nrp gets the incomplete flag.

    Such rows are the signature of mis-aligned table extraction, so the
    flag is what routes the document to needs_review.
    """
    name_only = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
    assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in validate_personnel_entry(name_only)
|
||||||
|
|
||||||
|
def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
    """A row with pangkat but no NRP is not structurally incomplete.

    It still identifies a rank, so the incomplete-row flag must stay off.
    """
    rank_only = PersonnelEntry(pangkat="AKP", nama="Test")
    raised = validate_personnel_entry(rank_only)
    assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in raised
|
||||||
|
|
||||||
|
|
||||||
class TestHeaderValidator:
|
class TestHeaderValidator:
|
||||||
def test_complete_header_no_flags(self) -> None:
|
def test_complete_header_no_flags(self) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user