Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service:

- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions
--- a/src/ocr_sprint/api/__init__.py
+++ b/src/ocr_sprint/api/__init__.py
--- a/src/ocr_sprint/api/errors.py
+++ b/src/ocr_sprint/api/errors.py
@@ -0,0 +1,43 @@
+"""HTTP error handlers."""
+
+from __future__ import annotations
+
+from fastapi import FastAPI, Request, status
+from fastapi.responses import JSONResponse
+
+from ocr_sprint.utils.logging import get_logger
+
+# Module-level logger; used by the fallback exception handler below.
+_logger = get_logger(__name__)
+
+
+class OCRServiceError(Exception):
+    """Base class for application errors that should map to a 4xx response.
+
+    The handler installed by ``register_error_handlers`` answers with
+    ``http_status`` and a JSON body carrying the exception's class name and
+    ``str(exc)``. Subclasses override ``http_status`` to pick another code.
+    """
+
+    # Status code sent for this error type; defaults to 400 Bad Request.
+    http_status: int = status.HTTP_400_BAD_REQUEST
+
+
+class UnsupportedDocumentError(OCRServiceError):
+    """Uploaded file is neither a PDF nor a recognized image format.
+
+    Keeps the base class's ``http_status`` (400). The documents route also
+    raises it for empty uploads, uploads over the size limit, and any
+    ``ValueError`` raised by the pipeline.
+    """
+
+
+class JobNotFoundError(OCRServiceError):
+    """Application error reported as 404 Not Found instead of the default 400."""
+
+    # NOTE(review): no raiser is visible in this module — presumably meant
+    # for lookups of unknown job ids; confirm against callers.
+    http_status = status.HTTP_404_NOT_FOUND
+
+
+def register_error_handlers(app: FastAPI) -> None:
+    """Wire OCRServiceError + a final fallback for unexpected exceptions."""
+
+    @app.exception_handler(OCRServiceError)
+    async def _ocr_error_handler(_: Request, exc: OCRServiceError) -> JSONResponse:
+        return JSONResponse(
+            status_code=exc.http_status,
+            content={"error": exc.__class__.__name__, "message": str(exc)},
+        )
+
+    @app.exception_handler(Exception)
+    async def _unexpected_handler(_: Request, exc: Exception) -> JSONResponse:
+        _logger.exception("api.unhandled_exception", error=str(exc))
+        return JSONResponse(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            content={"error": "InternalServerError", "message": "Unexpected error"},
+        )
--- a/src/ocr_sprint/api/routes/__init__.py
+++ b/src/ocr_sprint/api/routes/__init__.py
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -0,0 +1,58 @@
+"""Documents API — Phase 1 synchronous endpoint.
+
+POST /documents accepts a single PDF or image upload, runs the synchronous
+pipeline inline, and returns the structured result. This is suitable for
+development and low-traffic production; Phase 4 will introduce an async
+queue and a polling-style API at the same path.
+"""
+
+from __future__ import annotations
+
+from uuid import uuid4
+
+from fastapi import APIRouter, File, UploadFile, status
+
+from ocr_sprint.api.errors import UnsupportedDocumentError
+from ocr_sprint.pipeline.orchestrator import run_pipeline
+from ocr_sprint.schemas.document import DocumentResponse
+from ocr_sprint.utils.logging import get_logger
+
+# Every route declared in this module lives under the "/documents" prefix.
+router = APIRouter(prefix="/documents", tags=["documents"])
+_logger = get_logger(__name__)
+
+# Upper bound on accepted upload size, enforced in create_document.
+_MAX_UPLOAD_BYTES = 25 * 1024 * 1024  # 25 MB
+
+
+@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse)
+async def create_document(file: UploadFile = File(...)) -> DocumentResponse:
+    """Run OCR + extraction synchronously on a single upload."""
+    job_id = uuid4()
+    log = _logger.bind(job_id=str(job_id), filename=file.filename or "")
+
+    content = await file.read()
+    if not content:
+        raise UnsupportedDocumentError("Uploaded file is empty.")
+    if len(content) > _MAX_UPLOAD_BYTES:
+        raise UnsupportedDocumentError(
+            f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit."
+        )
+
+    log.info("documents.received", size=len(content))
+    try:
+        output = run_pipeline(content)
+    except ValueError as exc:
+        raise UnsupportedDocumentError(str(exc)) from exc
+
+    log.info(
+        "documents.completed",
+        status=output.status.value,
+        confidence=round(output.confidence, 3),
+        flags=[f.value for f in output.result.review_flags],
+    )
+    return DocumentResponse(
+        job_id=job_id,
+        status=output.status,
+        confidence=output.confidence,
+        data=output.result,
+        review_flags=[f.value for f in output.result.review_flags],
+    )
--- a/src/ocr_sprint/api/routes/health.py
+++ b/src/ocr_sprint/api/routes/health.py
@@ -0,0 +1,15 @@
+"""Liveness / readiness endpoints."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter
+
+from ocr_sprint import __version__
+
+# Router for the health endpoints; no path prefix is set here.
+router = APIRouter(tags=["health"])
+
+
+@router.get("/health")
+async def health() -> dict[str, str]:
+    """Lightweight liveness check — does NOT touch the OCR engine."""
+    return {"status": "ok", "version": __version__}