Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -0,0 +1,58 @@
+"""Documents API — Phase 1 synchronous endpoint.
+
+POST /documents accepts a single PDF or image upload, runs the synchronous
+pipeline inline, and returns the structured result. This is suitable for
+development and low-traffic production; Phase 4 will introduce an async
+queue and a polling-style API at the same path.
+"""
+
+from __future__ import annotations
+
+from uuid import uuid4
+
+from fastapi import APIRouter, File, UploadFile, status
+
+from ocr_sprint.api.errors import UnsupportedDocumentError
+from ocr_sprint.pipeline.orchestrator import run_pipeline
+from ocr_sprint.schemas.document import DocumentResponse
+from ocr_sprint.utils.logging import get_logger
+
+# Upload size cap checked by create_document: 25 * 1024 * 1024 bytes.
+_MAX_UPLOAD_BYTES = 25 * 1024**2
+
+# Module-level structured logger; create_document binds per-request fields.
+_logger = get_logger(__name__)
+# Router for this module; routes registered on it are prefixed with /documents.
+router = APIRouter(prefix="/documents", tags=["documents"])
+
+
+@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse)
+async def create_document(file: UploadFile = File(...)) -> DocumentResponse:
+    """Run OCR + extraction synchronously on a single upload.
+
+    Args:
+        file: Multipart upload holding the document bytes (PDF or image).
+
+    Returns:
+        A ``DocumentResponse`` carrying a freshly generated job id together
+        with the pipeline's status, confidence, extraction result and review
+        flags.
+
+    Raises:
+        UnsupportedDocumentError: If the upload is empty, is larger than
+            ``_MAX_UPLOAD_BYTES``, or the pipeline rejects it with a
+            ``ValueError``.
+    """
+    job_id = uuid4()
+    log = _logger.bind(job_id=str(job_id), filename=file.filename or "")
+
+    # Read at most one byte past the limit: that is enough to detect an
+    # oversized upload without first loading an arbitrarily large body into
+    # memory just to reject it.
+    content = await file.read(_MAX_UPLOAD_BYTES + 1)
+    if not content:
+        raise UnsupportedDocumentError("Uploaded file is empty.")
+    if len(content) > _MAX_UPLOAD_BYTES:
+        raise UnsupportedDocumentError(
+            f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit."
+        )
+
+    log.info("documents.received", size=len(content))
+    # NOTE(review): run_pipeline is called inline (not awaited), so this
+    # coroutine does not yield while it runs. The module docstring describes
+    # this as intentional for Phase 1.
+    try:
+        output = run_pipeline(content)
+    except ValueError as exc:
+        # Surface pipeline input errors through the API's own error type.
+        raise UnsupportedDocumentError(str(exc)) from exc
+
+    # Serialise the review flags once; the log entry and the response share them.
+    flags = [f.value for f in output.result.review_flags]
+    log.info(
+        "documents.completed",
+        status=output.status.value,
+        confidence=round(output.confidence, 3),
+        flags=flags,
+    )
+    return DocumentResponse(
+        job_id=job_id,
+        status=output.status,
+        confidence=output.confidence,
+        data=output.result,
+        review_flags=flags,
+    )