Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
55
src/ocr_sprint/schemas/extraction.py
Normal file
55
src/ocr_sprint/schemas/extraction.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Top-level extraction result schemas."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
|
||||
class ReviewFlag(str, Enum):
    """Reasons a document was routed to human review."""

    # Every member mixes in ``str``, so a flag compares equal to its
    # lowercase snake_case value and can be looked up from that value with
    # ``ReviewFlag("<value>")``.
    #
    # Declaration order is the iteration order of the enum; keep it stable.
    #
    # NOTE(review): the exact condition that raises each flag is decided by
    # code outside this module -- confirm against the routing logic before
    # relying on the member names alone.

    LOW_OCR_CONFIDENCE = "low_ocr_confidence"
    MISSING_FIELD = "missing_field"
    INVALID_NRP = "invalid_nrp"
    UNKNOWN_PANGKAT = "unknown_pangkat"
    PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch"
    DATE_PARSE_FAILED = "date_parse_failed"
|
||||
|
||||
|
||||
class Signatory(BaseModel):
    """The official signing the sprint (Penandatangan)."""

    # All four attributes are optional strings defaulting to ``None``, so a
    # bare ``Signatory()`` is valid when nothing could be extracted.
    # NOTE(review): no format validation happens in this model; values are
    # stored as given -- confirm where NRP / pangkat checks are applied.

    # Signer's name.
    nama: str | None = Field(default=None)
    # Signer's rank ("pangkat").
    pangkat: str | None = Field(default=None)
    # Signer's NRP identifier, kept as text.
    nrp: str | None = Field(default=None)
    # Signer's position / office ("jabatan").
    jabatan: str | None = Field(default=None)
|
||||
|
||||
|
||||
class HeaderFields(BaseModel):
    """Header fields parsed from the top portion of a sprint."""

    # Every scalar field is optional (``None`` when absent); ``dasar`` falls
    # back to a fresh empty list per instance via ``default_factory``.

    # Sprint number string.
    nomor_sprint: str | None = Field(
        default=None,
        description="e.g. Sprin/123/IV/2025/Reskrim.",
    )
    # Issue date, already parsed into a ``date`` object.
    tanggal: date | None = Field(
        default=None,
        description="Date the sprint was issued.",
    )
    # Unit that issued the sprint.
    satuan_penerbit: str | None = Field(
        default=None,
        description="Issuing unit, e.g. 'Polres Bandung'.",
    )
    # Subject line ("perihal") of the sprint.
    perihal: str | None = Field(default=None)
    # Basis ("dasar") entries, one string per item.
    dasar: list[str] = Field(
        default_factory=list,
        description="List of legal/operational basis.",
    )
|
||||
|
||||
|
||||
class ExtractionResult(BaseModel):
    """Full structured payload extracted from a single sprint document."""

    # Every field has a default, so ``ExtractionResult()`` yields an empty
    # result: blank header and signatory, empty lists, empty raw text and a
    # confidence of 0.0.

    # Parsed header block; defaults to a ``HeaderFields`` with nothing set.
    header: HeaderFields = Field(default_factory=HeaderFields)
    # Personnel rows found in the document.
    personel: list[PersonnelEntry] = Field(default_factory=list)
    # Task descriptions, one string per bullet.
    untuk: list[str] = Field(
        description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.",
        default_factory=list,
    )
    # Signatory block; defaults to a ``Signatory`` with nothing set.
    ttd: Signatory = Field(default_factory=Signatory)
    # Raw OCR output retained alongside the structured fields.
    raw_text: str = Field("", description="Concatenated OCR text for debugging.")
    # Score constrained to the closed interval [0.0, 1.0] by ``ge`` / ``le``.
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Reasons (if any) this document needs human review.
    review_flags: list[ReviewFlag] = Field(default_factory=list)
|
||||
Reference in New Issue
Block a user