feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing

This commit is contained in:
Adriankf59
2026-04-26 17:16:47 +07:00
parent dbcf480130
commit 002821ca07
20 changed files with 3326 additions and 20 deletions

View File

@@ -86,14 +86,18 @@ def _row_to_response(row: object) -> DocumentResponse:
assert isinstance(row, JobRow)
status_enum = DocumentStatus(row.status)
result_obj: ExtractionResult | None = None
personel_list = None
if row.result is not None:
result_obj = ExtractionResult.model_validate(row.result)
# Auto-number personnel entries sequentially (1, 2, 3, ...)
for idx, entry in enumerate(result_obj.personel, start=1):
entry.no = idx
personel_list = result_obj.personel
return DocumentResponse(
job_id=row.job_id,
status=status_enum,
confidence=row.confidence,
data=result_obj,
data=personel_list,
review_flags=list(row.review_flags or []),
error=row.error,
approved=bool(row.approved),

View File

@@ -33,12 +33,45 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
# Perwira Menengah
"KOMPOL": ("KOMPOL",),
"AKBP": ("AKBP",),
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP", "KOMBES"),
# Perwira Tinggi
"BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
"IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
"KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
"JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
# PNS Polri (Pegawai Negeri Sipil di lingkungan Polri). PNS appear
# routinely on sprint panitia / undangan templates alongside Polri
# personnel, so we treat them as valid ranks for extraction.
# Sources: PP 11/2017 jo PP 17/2020 (Manajemen PNS); golongan I-IV.
# Golongan I (Juru)
"JURU MUDA": ("JURU MUDA",),
"JURU MUDA TK I": ("JURU MUDA TK I", "JURU MUDA TK.I", "JURU MUDA TINGKAT I"),
"JURU": ("JURU",),
"JURU TK I": ("JURU TK I", "JURU TK.I", "JURU TINGKAT I"),
# Golongan II (Pengatur)
"PENGATUR MUDA": ("PENGATUR MUDA",),
"PENGATUR MUDA TK I": (
"PENGATUR MUDA TK I",
"PENGATUR MUDA TK.I",
"PENGATUR MUDA TINGKAT I",
),
"PENGATUR": ("PENGATUR",),
"PENGATUR TK I": ("PENGATUR TK I", "PENGATUR TK.I", "PENGATUR TINGKAT I"),
# Golongan III (Penata)
"PENATA MUDA": ("PENATA MUDA",),
"PENATA MUDA TK I": (
"PENATA MUDA TK I",
"PENATA MUDA TK.I",
"PENATA MUDA TINGKAT I",
),
"PENATA": ("PENATA",),
"PENATA TK I": ("PENATA TK I", "PENATA TK.I", "PENATA TINGKAT I"),
# Golongan IV (Pembina)
"PEMBINA": ("PEMBINA",),
"PEMBINA TK I": ("PEMBINA TK I", "PEMBINA TK.I", "PEMBINA TINGKAT I"),
"PEMBINA UTAMA MUDA": ("PEMBINA UTAMA MUDA",),
"PEMBINA UTAMA MADYA": ("PEMBINA UTAMA MADYA",),
"PEMBINA UTAMA": ("PEMBINA UTAMA",),
}
# Reverse lookup: any variant (uppercased) → canonical form.

View File

@@ -64,6 +64,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
"jabatan dinas": "jabatan_dinas",
"jabatan dalam dinas": "jabatan_dinas",
"jbt dinas": "jabatan_dinas",
"struktural": "jabatan_dinas",
"jabatan struktural": "jabatan_dinas",
# jabatan dalam sprint (role for this dispatch)
"jabatan dalam sprint": "jabatan_sprint",
"jabatan dalam sprin": "jabatan_sprint",
@@ -72,6 +74,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
"jabatan sprin": "jabatan_sprint",
"tugas": "jabatan_sprint",
"penugasan": "jabatan_sprint",
"dalam penugasan": "jabatan_sprint",
"jabatan dalam penugasan": "jabatan_sprint",
# remarks
"keterangan": "keterangan",
"ket": "keterangan",

View File

@@ -38,12 +38,18 @@ _RANK_TOKENS: tuple[str, ...] = tuple(
)
)
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
# A line that contains a rank token followed (anywhere on the same line) by
# an 8-digit NRP. We allow common separators: '/', '-', '.', ',', ':' or
# whitespace. Rank token must be word-bounded so "BRIPDA" doesn't match
# inside e.g. "ABRIPDA-style" text.
# A rank token followed (within a few characters) by an 8-digit NRP.
# We allow common separators: '/', '-', '.', ',', ':' or whitespace.
# The trailing ``\b`` plus proximity to the 8-digit NRP is the
# specificity signal — we deliberately do *not* require a leading
# ``\b`` because real Polri sprint OCR routinely mashes the rank into
# the trailing characters of the previous cell (observed on Polres
# Banjar: "...CPHR., CBA, CI" runs straight into "AKP" giving
# "CIAKP 84011113"). Requiring a leading boundary loses that row
# entirely. The longest-first alternation order ensures multi-token
# ranks ("KOMBES POL") still win over short overlaps ("KBP").
_RE_RANK_NRP_LINE = re.compile(
rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
rf"(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
re.IGNORECASE,
)
# A bare row number marker like "1." or "12)". OCR often puts it on its own
@@ -143,31 +149,248 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
Strategy:
**Pass 1** — same-line rank+NRP (original strategy):
1. Iterate every line. Skip lines that don't contain both a known rank
and an 8-digit NRP (those are the only signal we trust).
2. For each rank+NRP line, look back for the most recent plausible name
line, and forward 1-3 lines for jabatan content.
3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp.
**Pass 2** — separate-line rank and NRP (for tabular sprint formats):
If pass 1 produces no results, scan for lines containing a standalone
rank token, then look up to 2 lines forward for a standalone NRP.
This handles sprint formats where OCR renders each column on its own
line (e.g. Polres Banjar layout).
**Pass 3** — rank-only (for sprint formats *without* an NRP column):
Some sprint templates (panitia, undangan, etc.) list only nama +
pangkat + jabatan, no NRP. If pass 1 and pass 2 both yield nothing,
fall back to a rank-only scan: every standalone rank line (or
two-line rank like "KOMBES" + "POL" produced by narrow-column OCR)
becomes a row, with name assembled from preceding lines and jabatan
from following lines. ``nrp`` stays ``None``. False-positive risk
is higher (stray rank tokens in body text), so this only fires when
nothing else matched.
The fallback is intentionally rate-limited: the first matching rank
token on a line wins (no greedy multi-match per line), and a name line
can only be consumed once (so a stray ranked text inside a paragraph
doesn't turn into multiple bogus entries).
"""
lines = raw_text.splitlines()
# ── Pass 1: rank+NRP on the same line ────────────────────────────
rows = _extract_same_line(lines)
if rows:
return rows
# ── Pass 2: rank and NRP on separate lines ───────────────────────
rows = _extract_separate_lines(lines)
if rows:
return rows
# ── Pass 3: rank-only (no NRP column) ────────────────────────────
return _extract_rank_only(lines)
# Regex for a line that is *only* a rank token (possibly with punctuation).
# Anchored at both ends: optional surrounding whitespace, one token from
# ``_RANK_ALT`` captured as ``rank``, then any run of the separator
# characters ``/ . - , :``. Matching is case-insensitive.
_RE_RANK_ONLY = re.compile(
    rf"^\s*(?P<rank>{_RANK_ALT})\s*[/.\-,:]*\s*$",
    re.IGNORECASE,
)
# Regex for a line that contains a standalone 8-digit NRP.
# "Standalone" means exactly eight digits with no digit immediately before
# or after; the pattern is unanchored, so other text may share the line.
_RE_NRP_ONLY = re.compile(r"(?<!\d)(?P<nrp>\d{8})(?!\d)")
# Strip a leading row number marker like "1 ", "1.", "12)" from a name
# prefix taken from the same OCR line as a rank+NRP match. Unlike
# _RE_ROW_NUMBER (which matches a *whole* line), this is a prefix strip
# for embedded same-line cases like "1 CUCU JUHANA, A.K.S. KOMPOL ...".
# The marker is 1-3 digits, an optional ".", ")" or ":", and must be
# followed by at least one whitespace character.
_RE_LEADING_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s+")
def _extract_same_line(lines: list[str]) -> list[PersonnelEntry]:
    """Pass 1: rank+NRP pairs found anywhere in the joined text.

    Uses ``finditer`` over the full ``\\n``-joined OCR text rather than
    ``re.search`` per line so that multiple rank+NRP pairs on the same
    OCR line still produce separate rows (OCR sometimes merges several
    table rows into one line, and a per-line ``search`` only returns the
    first match).

    Because the separator class of ``_RE_RANK_NRP_LINE`` includes ``\\s``,
    a match may also start on one line (the rank) and end on the next
    (the NRP). Such a match is anchored on *two* lines: the name is
    resolved relative to the line where the match starts, the jabatan
    relative to the line where it ends. Anchoring the jabatan on the
    start line would re-read the NRP line as jabatan text.

    For each match:

    * ``nama`` is the same-line text before the rank (leading row number
      stripped) when that is a plausible name; otherwise the nearest
      plausible, not-yet-consumed line above, searching back no further
      than the line on which the previous match starts.
    * ``jabatan_dinas`` is the text after the NRP on the match's end
      line plus up to three following lines, stopping before the line
      on which the next match starts, at a blocklisted line, at a row
      number line, or at a blank line once something was collected.

    Args:
        lines: OCR text split into lines.

    Returns:
        One ``PersonnelEntry`` per match whose rank normalises to a valid
        pangkat, in text order. ``no``, ``jabatan_sprint`` and
        ``keterangan`` are always ``None``.
    """
    from bisect import bisect_right

    if not lines:
        return []

    full_text = "\n".join(lines)

    # Offset of each line's first character inside ``full_text``.
    line_starts: list[int] = []
    pos = 0
    for line in lines:
        line_starts.append(pos)
        pos += len(line) + 1  # +1 for the joining "\n"

    def offset_to_line(offset: int) -> int:
        """Return the index of the line containing ``offset``."""
        return max(0, bisect_right(line_starts, offset) - 1)

    matches = list(_RE_RANK_NRP_LINE.finditer(full_text))
    rows: list[PersonnelEntry] = []
    consumed_lines: set[int] = set()

    for i, m in enumerate(matches):
        pangkat = normalize_pangkat(m.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue
        nrp = m.group("nrp")
        has_next = i + 1 < len(matches)

        ml = offset_to_line(m.start())  # line holding the rank token
        el = offset_to_line(m.end() - 1)  # line holding the NRP's last digit
        prev_ml = offset_to_line(matches[i - 1].start()) if i > 0 else -1
        next_ml = (
            offset_to_line(matches[i + 1].start()) if has_next else len(lines)
        )

        start_text = lines[ml]
        start_off = line_starts[ml]

        # Same-line prefix: text on the rank's line *before* the rank.
        # If the previous match started on this same line, only consider
        # the text after that match's NRP (otherwise we'd reuse the
        # earlier row's head as this row's name).
        prefix_start_local = 0
        if prev_ml == ml and i > 0:
            prefix_start_local = max(0, matches[i - 1].end() - start_off)
        prefix = start_text[prefix_start_local : m.start() - start_off]

        # Same-line suffix: text *after* the NRP on the line where the
        # match ends, capped at the next match's start when that match
        # begins on the same line.
        end_text = lines[el]
        end_off = line_starts[el]
        suffix_end_local = len(end_text)
        if next_ml == el and has_next:
            suffix_end_local = matches[i + 1].start() - end_off
        suffix = end_text[m.end() - end_off : suffix_end_local]

        # ── Resolve nama ────────────────────────────────────────────
        nama: str | None = None
        prefix_clean = _RE_LEADING_ROW_NUMBER.sub("", prefix).strip()
        if prefix_clean and _is_plausible_name(prefix_clean):
            nama = prefix_clean
        elif prev_ml < ml:
            # ``prev_ml`` is at least -1, so ``back`` never goes negative.
            for back in range(ml - 1, prev_ml, -1):
                if back in consumed_lines:
                    continue
                candidate = lines[back].strip()
                if _is_plausible_name(candidate):
                    nama = candidate
                    consumed_lines.add(back)
                    break

        # ── Resolve jabatan ─────────────────────────────────────────
        jabatan_parts: list[str] = []
        suffix_clean = suffix.strip()
        if suffix_clean:
            jabatan_parts.append(suffix_clean)
        if next_ml > el:
            # Start after the line that holds the NRP so a rank/NRP pair
            # split across two lines does not leak the NRP into jabatan.
            max_fwd = min(el + 4, next_ml, len(lines))
            for fwd in range(el + 1, max_fwd):
                candidate = lines[fwd].strip()
                if not candidate:
                    if jabatan_parts:
                        break
                    continue
                if _RE_NAME_BLOCKLIST.match(candidate):
                    break
                if _RE_ROW_NUMBER.match(candidate):
                    break
                jabatan_parts.append(candidate)

        # Collapse internal whitespace runs to single spaces.
        jabatan = (
            " ".join(" ".join(jabatan_parts).split())
            if jabatan_parts
            else None
        )

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=nrp,
                nama=nama,
                jabatan_dinas=jabatan,
                jabatan_sprint=None,
                keterangan=None,
            )
        )

    return rows
def _extract_separate_lines(lines: list[str]) -> list[PersonnelEntry]:
"""Pass 2: rank and NRP on separate nearby lines.
Handles tabular sprint formats where OCR outputs each column as its
own line, e.g.:
1
CUCU JUHANA, A.K.S.
KOMPOL
70100418
KABAGOPS
"""
consumed_names: set[int] = set()
consumed_nrps: set[int] = set()
rows: list[PersonnelEntry] = []
for idx, raw_line in enumerate(lines):
line = raw_line.strip()
match = _RE_RANK_NRP_LINE.search(line)
if not match:
rank_match = _RE_RANK_ONLY.match(line)
if not rank_match:
# Also try: line starts with a rank token (may have trailing text)
for tok in _RANK_TOKENS:
if line.upper().startswith(tok) and len(line) - len(tok) < 5:
rank_match = re.match(
rf"^\s*(?P<rank>{re.escape(tok)})\s*[/.\-,:]*",
line,
re.IGNORECASE,
)
if rank_match:
break
if not rank_match:
continue
pangkat = normalize_pangkat(match.group("rank"))
pangkat = normalize_pangkat(rank_match.group("rank"))
if not pangkat or not is_valid_pangkat(pangkat):
continue
nrp = match.group("nrp")
# Look forward up to 2 lines for NRP
nrp: str | None = None
nrp_idx: int | None = None
for fwd in range(idx + 1, min(idx + 3, len(lines))):
if fwd in consumed_nrps:
continue
nrp_match = _RE_NRP_ONLY.search(lines[fwd].strip())
if nrp_match:
nrp = nrp_match.group("nrp")
nrp_idx = fwd
break
if not nrp:
continue
assert nrp_idx is not None
consumed_nrps.add(nrp_idx)
# Look back for name
nama: str | None = None
for back in range(idx - 1, max(idx - 6, -1), -1):
if back in consumed_names:
@@ -178,7 +401,8 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
consumed_names.add(back)
break
jabatan = _following_jabatan(lines, idx)
# Look forward after NRP for jabatan
jabatan = _following_jabatan(lines, nrp_idx)
rows.append(
PersonnelEntry(
no=None,
@@ -193,6 +417,370 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
return rows
# Bare row-number markers used by sprint formats without NRP (the dot
# is often missing in narrow-column OCR, e.g. just "1" on its own line).
# Matches a whole line consisting of 1-3 digits, optionally followed by
# one of ".", ")" or ":", with optional surrounding whitespace.
_RE_BARE_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s*$")
def _try_match_rank_at(lines: list[str], idx: int) -> tuple[str, int] | None:
    """Match a standalone rank beginning at ``lines[idx]``.

    Narrow-column OCR can split a multi-token rank over two lines
    (``"KOMBES"`` / ``"POL"``, ``"PENATA"`` / ``"TK I"``), so the
    current line joined with the following one is tested *before* the
    current line alone. Trying the longer form first lets the more
    specific rank win and keeps its tail (e.g. "TK I") from being read
    as jabatan text later.

    Args:
        lines: OCR text lines.
        idx: Index of the line where the rank would start.

    Returns:
        ``(rank_text, lines_consumed)`` with ``lines_consumed`` being 2
        for a two-line match and 1 for a single-line match, or ``None``
        when ``idx`` is past the end or nothing matches.
    """
    total = len(lines)
    if idx >= total:
        return None

    current = lines[idx].strip()

    # Candidate texts in priority order, paired with the number of lines
    # each one spans.
    candidates: list[tuple[str, int]] = []
    if idx + 1 < total:
        joined = (current + " " + lines[idx + 1].strip()).strip()
        candidates.append((joined, 2))
    candidates.append((current, 1))

    for text, span in candidates:
        hit = _RE_RANK_ONLY.match(text)
        if hit:
            return hit.group("rank"), span
    return None
def _extract_rank_only(lines: list[str]) -> list[PersonnelEntry]:
    """Pass 3: rank-only fallback for sprint formats without an NRP column.

    Every standalone rank (one line, or two lines joined) acts as the
    pivot of a row:

    * ``nama`` is assembled from the contiguous plausible-name lines
      directly above the rank, looking back at most five lines. The
      walk stops at an already-claimed line, a bare row number, a
      blocklisted line, another rank, or a line that is not a plausible
      name; blank lines are skipped until the first fragment is found.
    * ``jabatan_dinas`` is assembled from the lines after the rank,
      looking ahead at most eight lines and stopping at a bare row
      number, another rank, a blocklisted line, or a blank line once
      something was collected.

    Args:
        lines: OCR text split into lines.

    Returns:
        One ``PersonnelEntry`` per valid rank, in text order, always with
        ``nrp=None``.
    """
    total = len(lines)
    claimed: set[int] = set()  # name lines already attached to a row
    entries: list[PersonnelEntry] = []

    def squash(parts: list[str]) -> str | None:
        """Join fragments with single spaces; ``None`` if there are none."""
        if not parts:
            return None
        return " ".join(" ".join(parts).split())

    def gather_name(rank_at: int) -> str | None:
        """Collect the name fragments sitting directly above ``rank_at``."""
        found: list[str] = []
        for back in range(rank_at - 1, max(rank_at - 6, -1), -1):
            if back in claimed:
                break
            text = lines[back].strip()
            if not text:
                if found:
                    break
                continue
            if (
                _RE_BARE_ROW_NUMBER.match(text)
                or _RE_NAME_BLOCKLIST.match(text)
                or _try_match_rank_at(lines, back) is not None
                or not _is_plausible_name(text)
            ):
                break
            found.append(text)
            claimed.add(back)
        # Fragments were gathered bottom-up; restore reading order.
        found.reverse()
        return squash(found)

    def gather_jabatan(start: int) -> str | None:
        """Collect the jabatan fragments beginning at line ``start``."""
        parts: list[str] = []
        for fwd in range(start, min(start + 8, total)):
            text = lines[fwd].strip()
            if not text:
                if parts:
                    break
                continue
            if (
                _RE_BARE_ROW_NUMBER.match(text)
                or _try_match_rank_at(lines, fwd) is not None
                or _RE_NAME_BLOCKLIST.match(text)
            ):
                break
            parts.append(text)
        return squash(parts)

    cursor = 0
    while cursor < total:
        hit = _try_match_rank_at(lines, cursor)
        pangkat = normalize_pangkat(hit[0]) if hit else None
        if not pangkat or not is_valid_pangkat(pangkat):
            cursor += 1
            continue

        span = hit[1]
        nama = gather_name(cursor)
        jabatan = gather_jabatan(cursor + span)
        entries.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=None,
                nama=nama,
                jabatan_dinas=jabatan,
                jabatan_sprint=None,
                keterangan=None,
            )
        )
        # Skip every line the rank itself occupied.
        cursor += span

    return entries
# ── Column-aware Pass 3 (uses OCR bounding boxes) ───────────────────────
def _box_x_left(box: tuple[tuple[float, float], ...]) -> float:
return min(p[0] for p in box)
def _box_x_right(box: tuple[tuple[float, float], ...]) -> float:
return max(p[0] for p in box)
def _box_x_center(box: tuple[tuple[float, float], ...]) -> float:
return (_box_x_left(box) + _box_x_right(box)) / 2
def _box_y_top(box: tuple[tuple[float, float], ...]) -> float:
return min(p[1] for p in box)
def _box_y_bottom(box: tuple[tuple[float, float], ...]) -> float:
return max(p[1] for p in box)
def _box_y_center(box: tuple[tuple[float, float], ...]) -> float:
return (_box_y_top(box) + _box_y_bottom(box)) / 2
def _box_height(box: tuple[tuple[float, float], ...]) -> float:
return _box_y_bottom(box) - _box_y_top(box)
def extract_personnel_from_ocr_lines(ocr_lines: list) -> list[PersonnelEntry]:
    """Column-aware Pass 3 for sprint formats without an NRP column.

    Each ``ocr_line`` must expose ``text`` (str) and ``box`` (a tuple of
    4 ``(x, y)`` corner points). The geometry is used to:

    1. Detect rank lines (single-line or vertically-stacked two-line).
    2. Estimate the PANGKAT column X-center from those rank lines.
    3. For each rank, gather **only** lines in the NAMA column (X left
       of PANGKAT) within the row's Y band as name fragments, and
       **only** lines right of PANGKAT as jabatan fragments. This avoids
       the column bleed that the flat-text Pass 3 suffers on dense
       tables.

    Row bands are half-open (``y_lo <= y < y_hi``) so a line whose
    y-center falls exactly on the divider between two rows is assigned
    to one row only instead of being duplicated into both.

    Args:
        ocr_lines: OCR line objects with ``text`` and ``box`` attributes.
            NOTE(review): all boxes are compared in a single coordinate
            space; callers passing lines from several pages should
            confirm that is what they want.

    Returns:
        One ``PersonnelEntry`` per detected rank anchor, ordered top to
        bottom, always with ``nrp=None``. Returns ``[]`` when no rank
        line is detected (the caller can fall back to the text-only
        Pass 3).
    """
    if not ocr_lines:
        return []

    def _join(pieces: list[tuple[float, str]]) -> str | None:
        """Join the text of ``(y, text)`` pieces; ``None`` when empty."""
        text = " ".join(t for _, t in pieces if t).strip()
        text = " ".join(text.split())
        return text or None

    # Sort by (y_top, x_left) for vertical-stacking rank detection.
    indexed = sorted(
        range(len(ocr_lines)),
        key=lambda i: (_box_y_top(ocr_lines[i].box), _box_x_left(ocr_lines[i].box)),
    )

    # Pass 1: find rank anchors.
    # An anchor is one or two stacked OCR lines whose combined text matches
    # _RE_RANK_ONLY and normalises to a known pangkat. Two-line stacks must
    # X-overlap so we don't accidentally merge cells from different columns.
    used: set[int] = set()
    anchors: list[dict] = []
    for pos, idx in enumerate(indexed):
        if idx in used:
            continue
        ln = ocr_lines[idx]
        text = ln.text.strip()

        rank_text: str | None = None
        member_idxs: list[int] = [idx]

        # Try two-line stack first (so PENATA TK I beats PENATA). Only
        # the next four lines in sort order are considered.
        for j_pos in range(pos + 1, min(pos + 5, len(indexed))):
            j_idx = indexed[j_pos]
            if j_idx in used:
                continue
            other = ocr_lines[j_idx]
            x_overlap = (
                min(_box_x_right(ln.box), _box_x_right(other.box))
                - max(_box_x_left(ln.box), _box_x_left(other.box))
            )
            if x_overlap <= 0:
                continue
            y_gap = _box_y_top(other.box) - _box_y_bottom(ln.box)
            # Lines are sorted by y_top, so once one is too far below,
            # every later candidate is as well.
            if y_gap > _box_height(ln.box) * 1.5:
                break
            combined = (text + " " + other.text.strip()).strip()
            m2 = _RE_RANK_ONLY.match(combined)
            if m2:
                rank_text = m2.group("rank")
                member_idxs.append(j_idx)
                break

        if rank_text is None:
            m1 = _RE_RANK_ONLY.match(text)
            if m1:
                rank_text = m1.group("rank")

        if rank_text is None:
            continue

        pangkat = normalize_pangkat(rank_text)
        if not pangkat or not is_valid_pangkat(pangkat):
            continue

        anchors.append(
            {
                "member_idxs": member_idxs,
                "pangkat": pangkat,
                # X-center is taken from the first member line only.
                "x_center": _box_x_center(ln.box),
                "y_top": min(_box_y_top(ocr_lines[m].box) for m in member_idxs),
                "y_bottom": max(_box_y_bottom(ocr_lines[m].box) for m in member_idxs),
            }
        )
        used.update(member_idxs)

    if not anchors:
        return []

    # Sort anchors by Y so we can compute row spans.
    anchors.sort(key=lambda a: a["y_top"])

    # Estimate PANGKAT column X-center as the median of rank anchor X-centers.
    xs_sorted = sorted(a["x_center"] for a in anchors)
    pangkat_x = xs_sorted[len(xs_sorted) // 2]

    # X tolerance: half the median rank-line width (at least 5 units).
    # Lines with x_center within ±tolerance of pangkat_x are *in* the
    # PANGKAT column and excluded from both NAMA and JABATAN buckets.
    rank_widths = [
        _box_x_right(ocr_lines[a["member_idxs"][0]].box)
        - _box_x_left(ocr_lines[a["member_idxs"][0]].box)
        for a in anchors
    ]
    rank_widths.sort()
    median_rank_width = rank_widths[len(rank_widths) // 2] if rank_widths else 50.0
    column_margin = max(median_rank_width * 0.5, 5.0)

    # Try to split the JABATAN side into STRUKTURAL (jabatan_dinas) and
    # DALAM SPRIN (jabatan_sprint). This is a largest-gap split, not a
    # clustering: collect the X-centers of all non-empty lines right of
    # PANGKAT, sort them, and use the midpoint of the widest gap between
    # neighbours as the column boundary. Anything right of that boundary,
    # including a KET column if present, lands in jabatan_sprint.
    jabatan_xs: list[float] = []
    for ln in ocr_lines:
        x = _box_x_center(ln.box)
        if x > pangkat_x + column_margin and ln.text.strip():
            jabatan_xs.append(x)

    jabatan_split_x: float | None = None
    if len(jabatan_xs) >= 4:
        jabatan_xs.sort()
        max_gap = 0.0
        max_gap_x: float | None = None
        for k in range(1, len(jabatan_xs)):
            gap = jabatan_xs[k] - jabatan_xs[k - 1]
            if gap > max_gap:
                max_gap = gap
                max_gap_x = (jabatan_xs[k] + jabatan_xs[k - 1]) / 2
        # Only use the split if the gap is meaningfully larger than a
        # within-column gap (heuristic: > 1.5× median rank width).
        if max_gap_x is not None and max_gap > median_rank_width * 1.5:
            jabatan_split_x = max_gap_x

    # Pre-compute each anchor's y_center for midpoint row dividers.
    anchor_y_centers = [(a["y_top"] + a["y_bottom"]) / 2 for a in anchors]

    rows: list[PersonnelEntry] = []
    for i, anchor in enumerate(anchors):
        # Row Y band: from the midpoint with the previous anchor up to
        # (but excluding) the midpoint with the next anchor. Using the
        # midpoint rather than the previous anchor's y_bottom prevents
        # row N's tail content (e.g. last name fragment "M.H.") from
        # leaking into row N+1's nama bucket when rank lines don't
        # extend to the full visual row height.
        y_lo = (
            (anchor_y_centers[i - 1] + anchor_y_centers[i]) / 2
            if i > 0
            else float("-inf")
        )
        y_hi = (
            (anchor_y_centers[i] + anchor_y_centers[i + 1]) / 2
            if i + 1 < len(anchors)
            else float("inf")
        )

        nama_pieces: list[tuple[float, str]] = []
        struktural_pieces: list[tuple[float, str]] = []
        sprint_pieces: list[tuple[float, str]] = []

        for j, ln in enumerate(ocr_lines):
            if j in anchor["member_idxs"]:
                continue
            text = ln.text.strip()
            if not text:
                continue
            x = _box_x_center(ln.box)
            y = _box_y_center(ln.box)
            # Half-open band: a line sitting exactly on a divider goes
            # to the lower row only.
            if not (y_lo <= y < y_hi):
                continue

            if x < pangkat_x - column_margin:
                # NAMA side
                if _RE_NAME_BLOCKLIST.match(text):
                    continue
                if _RE_BARE_ROW_NUMBER.match(text):
                    continue
                if not _is_plausible_name(text):
                    continue
                nama_pieces.append((y, text))
            elif x > pangkat_x + column_margin:
                # JABATAN side — split into STRUKTURAL vs DALAM SPRIN
                # using the geometric column boundary detected above.
                if _RE_NAME_BLOCKLIST.match(text):
                    continue
                if jabatan_split_x is not None and x > jabatan_split_x:
                    sprint_pieces.append((y, text))
                else:
                    struktural_pieces.append((y, text))
            # else: in PANGKAT column or column margin — skip

        # Order each bucket top to bottom.
        nama_pieces.sort(key=lambda p: p[0])
        struktural_pieces.sort(key=lambda p: p[0])
        sprint_pieces.sort(key=lambda p: p[0])

        # Strip leading row number from the first nama piece (e.g. "1 F. GUNTUR"
        # collapses to "F. GUNTUR" if the row marker happens to share a box).
        if nama_pieces:
            head = _RE_LEADING_ROW_NUMBER.sub("", nama_pieces[0][1]).strip()
            nama_pieces[0] = (nama_pieces[0][0], head)

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=anchor["pangkat"],
                nrp=None,
                nama=_join(nama_pieces),
                jabatan_dinas=_join(struktural_pieces),
                jabatan_sprint=_join(sprint_pieces),
                keterangan=None,
            )
        )

    return rows
def is_low_quality(rows: list[PersonnelEntry]) -> bool:
"""Heuristic: did PP-Structure produce useless rows?

View File

@@ -36,6 +36,73 @@ class OCRLine:
box: tuple[tuple[float, float], ...] # 4 (x, y) corner points
def _line_y_center(line: OCRLine) -> float:
return sum(p[1] for p in line.box) / len(line.box)
def _line_x_left(line: OCRLine) -> float:
return min(p[0] for p in line.box)
def _line_height(line: OCRLine) -> float:
ys = [p[1] for p in line.box]
return max(ys) - min(ys)
def sort_lines_by_layout(lines: list[OCRLine]) -> list[OCRLine]:
    """Reorder lines into top-to-bottom, left-to-right reading order.

    OCR output order reflects detection order, not visual layout. On
    dense tables that interleaves rows and columns, which breaks every
    downstream extractor that assumes top-to-bottom row order.

    Reading order is rebuilt by:

    1. Sorting by ``y_center``.
    2. Grouping consecutive lines into row bands: a line joins the
       current band while its ``y_center`` is within half the median
       line height (but at least 1.0) of the band's running mean
       ``y_center``, so visually same-row cells stay together even when
       their boxes don't align perfectly.
    3. Sorting each band left-to-right by ``x_left``.

    The band mean is maintained as a running sum, so each line is
    processed in constant time instead of re-summing the whole band.

    Args:
        lines: OCR lines in arbitrary order.

    Returns:
        A new list with the same lines in reading order. ``[]`` for
        empty input. When no line has a positive height the input order
        is returned unchanged (as a copy), because no band threshold can
        be derived.
    """
    if not lines:
        return []

    heights = [h for h in map(_line_height, lines) if h > 0]
    if not heights:
        return list(lines)
    median_height = sorted(heights)[len(heights) // 2]
    band_threshold = max(1.0, median_height * 0.5)

    # Compute each y-center once and sort on it alone; the sort is stable,
    # so ties keep their input order.
    keyed = sorted(
        ((_line_y_center(ln), ln) for ln in lines),
        key=lambda pair: pair[0],
    )

    bands: list[list[OCRLine]] = []
    current_band: list[OCRLine] = []
    band_y_sum = 0.0
    for y, ln in keyed:
        if current_band:
            # Compare against the mean of the band's members so a
            # slowly-drifting set of cells doesn't split mid-row.
            band_mean = band_y_sum / len(current_band)
            if not (abs(y - band_mean) <= band_threshold):
                bands.append(current_band)
                current_band = []
                band_y_sum = 0.0
        current_band.append(ln)
        band_y_sum += y
    if current_band:
        bands.append(current_band)

    return [ln for band in bands for ln in sorted(band, key=_line_x_left)]
@dataclass(frozen=True)
class OCRPage:
"""OCR output for a single page."""
@@ -44,8 +111,8 @@ class OCRPage:
@property
def text(self) -> str:
"""Reconstruct page text by concatenating lines (order = paddle's output order)."""
return "\n".join(line.text for line in self.lines)
"""Reconstruct page text in visual reading order (top-to-bottom, left-to-right)."""
return "\n".join(line.text for line in sort_lines_by_layout(self.lines))
@property
def mean_confidence(self) -> float:

View File

@@ -20,6 +20,7 @@ from ocr_sprint.pipeline.confidence import compute_confidence, route
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
from ocr_sprint.pipeline.extract.personnel import extract_personnel
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_ocr_lines,
extract_personnel_from_text,
is_low_quality,
)
@@ -144,12 +145,37 @@ def run_pipeline(content: bytes) -> PipelineOutput:
# through the preferred path.
if is_low_quality(personel):
fallback_rows = extract_personnel_from_text(full_text)
# If text-based fallback produced rows but they all lack NRP
# (Pass 3 territory), retry with the column-aware extractor that
# uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar
# Akpol-panitia), text-only Pass 3 bleeds adjacent columns into
# nama/jabatan because lines are interleaved within each Y-band;
# the columnar variant restricts each field to its visual column.
text_only_no_nrp = bool(fallback_rows) and all(
r.nrp is None for r in fallback_rows
)
if (not fallback_rows) or text_only_no_nrp:
ocr_lines = [ln for page in ocr_pages for ln in page.lines]
columnar_rows = extract_personnel_from_ocr_lines(ocr_lines)
if columnar_rows and (
not fallback_rows or len(columnar_rows) >= len(fallback_rows)
):
fallback_rows = columnar_rows
if fallback_rows:
personel = fallback_rows
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
# Pass 3 / columnar emit rows with nrp=None for sprint
# templates without an NRP column. Surface that with a
# distinct flag so operators know to expect missing NRPs by
# design rather than by OCR failure.
no_nrp = all(r.nrp is None for r in fallback_rows)
if no_nrp:
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP)
else:
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
_logger.info(
"pipeline.personnel_text_fallback",
fallback_rows=len(fallback_rows),
no_nrp=no_nrp,
)
untuk_items = find_untuk_list(full_text)

View File

@@ -71,11 +71,16 @@ def _build_pp_structure() -> PPStructure:
from paddleocr import PPStructure
s = get_settings()
_logger.info("pp_structure.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu)
# PPStructure layout models only support 'en' and 'ch', not 'latin'.
# Use 'en' for layout/table detection — it's language-agnostic (detects
# table structure, not text language). OCR within cells still works for
# Indonesian text because the recognition model handles Latin scripts.
pp_lang = "en" if s.ocr_lang not in ("en", "ch") else s.ocr_lang
_logger.info("pp_structure.init", lang=pp_lang, use_gpu=s.ocr_use_gpu)
# layout=True so that PP-Structure also returns figure/text regions; we
# filter to tables only afterwards. show_log=False to keep stdout clean.
return PPStructure(
lang=s.ocr_lang,
lang=pp_lang,
use_gpu=s.ocr_use_gpu,
layout=True,
show_log=False,

View File

@@ -10,6 +10,7 @@ from uuid import UUID, uuid4
from pydantic import BaseModel, ConfigDict, Field
from ocr_sprint.schemas.extraction import ExtractionResult
from ocr_sprint.schemas.personnel import PersonnelEntry
class SourceKind(str, Enum):
@@ -52,7 +53,7 @@ class DocumentResponse(BaseModel):
job_id: UUID
status: DocumentStatus
confidence: float | None = None
data: ExtractionResult | None = None
data: list[PersonnelEntry] | None = None
review_flags: list[str] = Field(default_factory=list)
error: str | None = None
# Phase 6 — HITL review state.

View File

@@ -22,6 +22,7 @@ class ReviewFlag(str, Enum):
LLM_FALLBACK = "llm_fallback"
LLM_UNAVAILABLE = "llm_unavailable"
PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp"
INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"