Adds OpenCV-based phone-photo handling that runs before the standard preprocessing pipeline for IMAGE source kinds (PDF renders are flat by construction and skip this stage). Pipeline additions in src/ocr_sprint/pipeline/document_detect.py: - _find_document_quad: Canny + dilate + contour search, picks the largest convex 4-point polygon above a configurable area threshold; fails gracefully and returns None when no usable quad is found. - _four_point_warp: orders corners (TL/TR/BR/BL via sum/diff trick) and runs cv2.getPerspectiveTransform + warpPerspective. - _remove_shadow: per-channel background-division (dilate + median blur + 255 - absdiff + normalize) for uneven phone-shot lighting. - detect_and_correct: top-level entrypoint with graceful fallback to the original image when detection fails. Wired into the synchronous orchestrator: only enabled for IMAGE sources, skipped for PDF. New settings: - preprocess_detect_document (default: true) - preprocess_remove_shadow (default: true) - preprocess_min_quad_area_fraction (default: 0.20) Tests: 9 new unit tests covering corner ordering, quad detection on synthetic skewed documents, perspective warp output sanity, shadow removal shape preservation, full-pipeline behavior, and graceful fallback when detection fails. 70 tests total, all green. ML-based dewarping (DewarpNet) and DocTR detector are deferred to a future phase per the roadmap; the existing API is structured so they can be added as alternative backends behind DocumentDetectConfig. Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
113 lines
3.9 KiB
Python
113 lines
3.9 KiB
Python
"""Synchronous pipeline orchestrator (Phase 1).

Wires the individual stages together:

    bytes → ingest → preprocess → OCR → regex extract → validate → score

Phase 4 will replace this with a Celery task graph; Phase 3/5 will plug
in PP-Structure for tables and an LLM extractor for variant fields.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from ocr_sprint.config import get_settings
|
|
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
|
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
|
|
from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
|
|
from ocr_sprint.pipeline.extract.validators import validate_extraction
|
|
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest
|
|
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
|
|
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
|
|
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
|
from ocr_sprint.schemas.extraction import ExtractionResult, ReviewFlag
|
|
from ocr_sprint.utils.logging import get_logger
|
|
|
|
# Module-level logger shared by the orchestrator.
_logger = get_logger(__name__)

# Below this OCR confidence we automatically flag for review.
# The comparison in run_pipeline is strict (<), so a mean confidence of
# exactly 0.80 is NOT flagged.
_OCR_CONFIDENCE_FLAG_THRESHOLD = 0.80
|
|
|
|
|
|
@dataclass
class PipelineOutput:
    """Bundle returned by the orchestrator.

    Attributes:
        source_kind: Kind of upload reported by ``detect_source_kind``.
        status: Routing decision returned by ``route`` for the final
            confidence.
        confidence: Final confidence returned by ``compute_confidence``
            (the same value is also stored on ``result.confidence``).
        result: The extraction result, including raw OCR text and the
            merged review flags.
    """

    source_kind: SourceKind
    status: DocumentStatus
    confidence: float
    result: ExtractionResult
|
|
|
|
|
|
def run_pipeline(content: bytes) -> PipelineOutput:
    """Execute the synchronous OCR + extraction pipeline on raw upload bytes.

    Stages, in order: source-kind detection, ingest, per-page document
    detection/correction, preprocessing, OCR, regex extraction of the header
    and signatory, validation, confidence scoring and routing.

    Args:
        content: Raw bytes of the uploaded file.

    Returns:
        A ``PipelineOutput`` carrying the detected source kind, the routed
        status, the final confidence and the populated extraction result.

    Raises:
        ValueError: If the source kind is detected as ``SourceKind.UNKNOWN``.
    """
    s = get_settings()

    kind = detect_source_kind(content)
    if kind == SourceKind.UNKNOWN:
        raise ValueError("Unsupported file type — only PDF and common image formats are accepted.")

    pages = ingest(content, kind, target_dpi=s.preprocess_target_dpi)
    _logger.info("pipeline.ingested", source_kind=kind.value, pages=len(pages))

    pre_cfg = PreprocessConfig(
        max_side=s.ocr_max_image_side,
        denoise=s.preprocess_denoise,
        deskew=s.preprocess_deskew,
        adaptive_threshold=s.preprocess_adaptive_threshold,
    )
    # Document detection only makes sense on photographed images. PDF renders
    # are already flat by construction, so we skip the heavy quad search there.
    detect_cfg = DocumentDetectConfig(
        detect_document=s.preprocess_detect_document and kind == SourceKind.IMAGE,
        remove_shadow=s.preprocess_remove_shadow and kind == SourceKind.IMAGE,
        min_area_fraction=s.preprocess_min_quad_area_fraction,
    )

    # Per page: document detection/correction -> preprocessing -> OCR.
    ocr_pages: list[OCRPage] = []
    for page in pages:
        corrected = detect_and_correct(page.image, detect_cfg)
        cleaned = preprocess(corrected, pre_cfg)
        ocr_pages.append(run_ocr(cleaned))

    full_text = "\n".join(p.text for p in ocr_pages)
    # Guard against division by zero when no pages were produced.
    mean_ocr_conf = sum(p.mean_confidence for p in ocr_pages) / len(ocr_pages) if ocr_pages else 0.0

    header = extract_header(full_text)
    ttd = find_signatory(full_text)

    initial_flags: list[ReviewFlag] = []
    if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
        initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)

    result = ExtractionResult(
        header=header,
        personel=[],  # Phase 3 will populate from PP-Structure
        untuk=[],
        ttd=ttd,
        raw_text=full_text,
        confidence=mean_ocr_conf,
        # Pass a copy so later changes to the result do not alias initial_flags.
        review_flags=list(initial_flags),
    )

    flags = validate_extraction(result)
    # merge initial OCR-confidence flag with validation flags, preserving uniqueness
    seen = set(flags)
    for f in initial_flags:
        if f not in seen:
            flags.append(f)
            seen.add(f)
    result.review_flags = flags

    # The final confidence replaces the provisional mean OCR confidence
    # that was stored on the result at construction time.
    final_conf = compute_confidence(mean_ocr_conf, flags)
    result.confidence = final_conf

    status = route(final_conf)
    return PipelineOutput(
        source_kind=kind,
        status=status,
        confidence=final_conf,
        result=result,
    )
|