feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing

This commit is contained in:
Adriankf59
2026-04-26 17:16:47 +07:00
parent dbcf480130
commit 002821ca07
20 changed files with 3326 additions and 20 deletions

View File

@@ -20,6 +20,7 @@ from ocr_sprint.pipeline.confidence import compute_confidence, route
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
from ocr_sprint.pipeline.extract.personnel import extract_personnel
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_ocr_lines,
extract_personnel_from_text,
is_low_quality,
)
@@ -144,12 +145,37 @@ def run_pipeline(content: bytes) -> PipelineOutput:
# through the preferred path.
if is_low_quality(personel):
fallback_rows = extract_personnel_from_text(full_text)
# If text-based fallback produced rows but they all lack NRP
# (Pass 3 territory), retry with the column-aware extractor that
# uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar
# Akpol-panitia), text-only Pass 3 bleeds adjacent columns into
# nama/jabatan because lines are interleaved within each Y-band;
# the columnar variant restricts each field to its visual column.
text_only_no_nrp = bool(fallback_rows) and all(
r.nrp is None for r in fallback_rows
)
if (not fallback_rows) or text_only_no_nrp:
ocr_lines = [ln for page in ocr_pages for ln in page.lines]
columnar_rows = extract_personnel_from_ocr_lines(ocr_lines)
if columnar_rows and (
not fallback_rows or len(columnar_rows) >= len(fallback_rows)
):
fallback_rows = columnar_rows
if fallback_rows:
personel = fallback_rows
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
# Pass 3 / columnar emit rows with nrp=None for sprint
# templates without an NRP column. Surface that with a
# distinct flag so operators know to expect missing NRPs by
# design rather than by OCR failure.
no_nrp = all(r.nrp is None for r in fallback_rows)
if no_nrp:
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP)
else:
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
_logger.info(
"pipeline.personnel_text_fallback",
fallback_rows=len(fallback_rows),
no_nrp=no_nrp,
)
untuk_items = find_untuk_list(full_text)