Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
57
src/ocr_sprint/schemas/document.py
Normal file
57
src/ocr_sprint/schemas/document.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Job-level schemas (request, response, status)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
from ocr_sprint.schemas.extraction import ExtractionResult
|
||||
|
||||
|
||||
class SourceKind(str, Enum):
    """Coarse classification of the file a client uploaded.

    The ``str`` mixin means every member is itself a string and compares
    equal to its plain value (for example ``SourceKind.PDF == "pdf"``).
    """

    PDF = "pdf"
    IMAGE = "image"
    # Fallback used as the default for ``DocumentJob.source_kind``.
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class DocumentStatus(str, Enum):
    """States an OCR job can be in over its lifetime.

    Members are strings (``str`` mixin), so each one compares equal to its
    lowercase value, e.g. ``DocumentStatus.FAILED == "failed"``.
    """

    # Initial state: the default for ``DocumentJob.status``.
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    NEEDS_REVIEW = "needs_review"
    FAILED = "failed"
|
||||
|
||||
|
||||
def _utcnow_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    ``datetime.utcnow()`` is deprecated since Python 3.12. This produces the
    same kind of value (UTC wall-clock time with ``tzinfo=None``) through the
    supported API, so timestamps stay comparable with naive datetimes that
    were created the old way.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


class DocumentJob(BaseModel):
    """Internal representation of a job (Phase 1 holds it in-memory).

    Attributes:
        job_id: Unique identifier; a fresh ``uuid4`` is generated when omitted.
        source_kind: Kind of uploaded document; defaults to ``UNKNOWN``.
        filename: Name of the uploaded file (required).
        status: Lifecycle status; defaults to ``PENDING``.
        created_at: Naive UTC timestamp taken when the model is instantiated.
        updated_at: Naive UTC timestamp, initialised the same way as
            ``created_at`` but by a separate clock read, so the two values
            may differ by a few microseconds.
        error: Error message, or ``None`` when no error is recorded.
        result: Extraction result, or ``None`` when none is attached.
        debug: Free-form debug data; defaults to a new empty dict per instance.
    """

    # Keep enum members (not their raw values) on the model. This matches the
    # pydantic default and is spelled out to make the intent explicit.
    model_config = ConfigDict(use_enum_values=False)

    job_id: UUID = Field(default_factory=uuid4)
    source_kind: SourceKind = SourceKind.UNKNOWN
    filename: str
    status: DocumentStatus = DocumentStatus.PENDING
    created_at: datetime = Field(default_factory=_utcnow_naive)
    updated_at: datetime = Field(default_factory=_utcnow_naive)
    error: str | None = None
    result: ExtractionResult | None = None
    debug: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
    """Payload the documents API hands back to callers.

    Only ``job_id`` and ``status`` are required; every other field has a
    default and may be left out.

    Attributes:
        job_id: Identifier of the job this response describes.
        status: Current lifecycle status of that job.
        confidence: Confidence score, or ``None`` when not provided
            (the scale is not defined in this module).
        data: Extraction result, or ``None`` when not provided.
        review_flags: List of flag strings; a new empty list by default.
        error: Error message, or ``None`` when no error is reported.
    """

    # Required fields.
    job_id: UUID
    status: DocumentStatus

    # Optional fields.
    confidence: float | None = None
    data: ExtractionResult | None = None
    review_flags: list[str] = Field(default_factory=list)
    error: str | None = None
|
||||
Reference in New Issue
Block a user