feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing
This commit is contained in:
@@ -20,6 +20,7 @@ from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
|
||||
from ocr_sprint.pipeline.extract.personnel import extract_personnel
|
||||
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||
extract_personnel_from_ocr_lines,
|
||||
extract_personnel_from_text,
|
||||
is_low_quality,
|
||||
)
|
||||
@@ -144,12 +145,37 @@ def run_pipeline(content: bytes) -> PipelineOutput:
|
||||
# through the preferred path.
|
||||
if is_low_quality(personel):
|
||||
fallback_rows = extract_personnel_from_text(full_text)
|
||||
# If text-based fallback produced no rows, or rows that all lack NRP
|
||||
# (Pass 3 territory), retry with the column-aware extractor that
|
||||
# uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar
|
||||
# Akpol-panitia), text-only Pass 3 bleeds adjacent columns into
|
||||
# nama/jabatan because lines are interleaved within each Y-band;
|
||||
# the columnar variant restricts each field to its visual column.
|
||||
text_only_no_nrp = bool(fallback_rows) and all(
|
||||
r.nrp is None for r in fallback_rows
|
||||
)
|
||||
if (not fallback_rows) or text_only_no_nrp:
|
||||
ocr_lines = [ln for page in ocr_pages for ln in page.lines]
|
||||
columnar_rows = extract_personnel_from_ocr_lines(ocr_lines)
|
||||
if columnar_rows and (
|
||||
not fallback_rows or len(columnar_rows) >= len(fallback_rows)
|
||||
):
|
||||
fallback_rows = columnar_rows
|
||||
if fallback_rows:
|
||||
personel = fallback_rows
|
||||
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
|
||||
# Pass 3 / columnar emit rows with nrp=None for sprint
|
||||
# templates without an NRP column. Surface that with a
|
||||
# distinct flag so operators know to expect missing NRPs by
|
||||
# design rather than by OCR failure.
|
||||
no_nrp = all(r.nrp is None for r in fallback_rows)
|
||||
if no_nrp:
|
||||
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP)
|
||||
else:
|
||||
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
|
||||
_logger.info(
|
||||
"pipeline.personnel_text_fallback",
|
||||
fallback_rows=len(fallback_rows),
|
||||
no_nrp=no_nrp,
|
||||
)
|
||||
|
||||
untuk_items = find_untuk_list(full_text)
|
||||
|
||||
Reference in New Issue
Block a user