Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
103
src/ocr_sprint/pipeline/orchestrator.py
Normal file
103
src/ocr_sprint/pipeline/orchestrator.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""Synchronous pipeline orchestrator (Phase 1).
|
||||
|
||||
Wires the individual stages together:
|
||||
|
||||
bytes → ingest → preprocess → OCR → regex extract → validate → score
|
||||
|
||||
Phase 4 will replace this with a Celery task graph; Phase 3/5 will plug
|
||||
in PP-Structure for tables and an LLM extractor for variant fields.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ocr_sprint.config import get_settings
|
||||
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||
from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
|
||||
from ocr_sprint.pipeline.extract.validators import validate_extraction
|
||||
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest
|
||||
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
|
||||
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
|
||||
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
||||
from ocr_sprint.schemas.extraction import ExtractionResult, ReviewFlag
|
||||
from ocr_sprint.utils.logging import get_logger
|
||||
|
||||
# Module-scoped logger, named after this module.
_logger = get_logger(__name__)

# Pages whose mean OCR confidence falls below this value cause the pipeline
# to attach ReviewFlag.LOW_OCR_CONFIDENCE to the extraction result.
_OCR_CONFIDENCE_FLAG_THRESHOLD: float = 0.80
|
||||
|
||||
|
||||
@dataclass
class PipelineOutput:
    """Everything ``run_pipeline`` hands back to its caller.

    Attributes:
        source_kind: Kind of upload detected from the raw bytes.
        status: Routing decision derived from the final confidence.
        confidence: Final confidence score; ``run_pipeline`` also stores the
            same value on ``result.confidence``.
        result: Structured extraction (header, signatory, raw text, flags).
    """

    source_kind: SourceKind
    status: DocumentStatus
    confidence: float
    result: ExtractionResult
|
||||
|
||||
|
||||
def run_pipeline(content: bytes) -> PipelineOutput:
    """Run the full synchronous pipeline over the bytes of one upload.

    Stages, in order: detect the source kind, ingest into pages, preprocess
    and OCR each page, extract the header and signatory from the joined text,
    validate the extraction, then score and route the document.

    Args:
        content: Raw bytes of the uploaded file.

    Returns:
        A ``PipelineOutput`` carrying the detected source kind, the routed
        status, the final confidence and the populated ``ExtractionResult``.

    Raises:
        ValueError: If the source kind is detected as ``SourceKind.UNKNOWN``.
    """
    settings = get_settings()

    # Guard clause: refuse anything whose kind could not be detected.
    kind = detect_source_kind(content)
    if kind == SourceKind.UNKNOWN:
        raise ValueError("Unsupported file type — only PDF and common image formats are accepted.")

    pages = ingest(content, kind, target_dpi=settings.preprocess_target_dpi)
    _logger.info("pipeline.ingested", source_kind=kind.value, pages=len(pages))

    preprocess_config = PreprocessConfig(
        max_side=settings.ocr_max_image_side,
        denoise=settings.preprocess_denoise,
        deskew=settings.preprocess_deskew,
        adaptive_threshold=settings.preprocess_adaptive_threshold,
    )

    # Each page is cleaned up and then OCR'd, one after another.
    ocr_results: list[OCRPage] = [
        run_ocr(preprocess(page.image, preprocess_config)) for page in pages
    ]

    document_text = "\n".join(ocr_page.text for ocr_page in ocr_results)
    if ocr_results:
        page_confidences = [ocr_page.mean_confidence for ocr_page in ocr_results]
        ocr_confidence = sum(page_confidences) / len(ocr_results)
    else:
        # No pages means nothing to average; score as zero confidence.
        ocr_confidence = 0.0

    header_fields = extract_header(document_text)
    signatory = find_signatory(document_text)

    # Flags raised by the OCR stage itself, before validation has run.
    ocr_flags: list[ReviewFlag] = (
        [ReviewFlag.LOW_OCR_CONFIDENCE]
        if ocr_confidence < _OCR_CONFIDENCE_FLAG_THRESHOLD
        else []
    )

    extraction = ExtractionResult(
        header=header_fields,
        personel=[],  # Phase 3 will populate from PP-Structure
        untuk=[],
        ttd=signatory,
        raw_text=document_text,
        confidence=ocr_confidence,
        review_flags=list(ocr_flags),
    )

    # Validation flags come first; OCR-stage flags are appended only when the
    # validator did not already report them, so every flag appears once.
    all_flags = validate_extraction(extraction)
    already_reported = set(all_flags)
    all_flags.extend(flag for flag in ocr_flags if flag not in already_reported)
    extraction.review_flags = all_flags

    # The final score replaces the raw OCR mean stored at construction time.
    score = compute_confidence(ocr_confidence, all_flags)
    extraction.confidence = score

    return PipelineOutput(
        source_kind=kind,
        status=route(score),
        confidence=score,
        result=extraction,
    )
|
||||
Reference in New Issue
Block a user