Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
0
src/ocr_sprint/api/__init__.py
Normal file
0
src/ocr_sprint/api/__init__.py
Normal file
43
src/ocr_sprint/api/errors.py
Normal file
43
src/ocr_sprint/api/errors.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""HTTP error handlers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import FastAPI, Request, status
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from ocr_sprint.utils.logging import get_logger
|
||||
|
||||
# Module-level logger; the fallback handler uses it to record unexpected exceptions.
_logger = get_logger(__name__)
|
||||
|
||||
|
||||
class OCRServiceError(Exception):
    """Root of the service's client-facing exception hierarchy.

    The registered error handler answers these exceptions with the status
    code stored in ``http_status``; subclasses override that attribute to
    pick a different code.
    """

    # Default response status for any subclass that does not set its own.
    http_status: int = status.HTTP_400_BAD_REQUEST
|
||||
|
||||
|
||||
class UnsupportedDocumentError(OCRServiceError):
    """The uploaded file cannot be processed as a document.

    The documents route raises this for an empty upload, an upload over the
    size limit, and a ``ValueError`` coming out of the pipeline (e.g. a file
    that is neither a PDF nor a recognized image format).
    """
|
||||
|
||||
|
||||
class JobNotFoundError(OCRServiceError):
    """Application error reported to the client as HTTP 404 Not Found."""

    # NOTE(review): nothing in the visible routes raises this yet; presumably
    # reserved for looking up a job by id — confirm against the callers.
    http_status = status.HTTP_404_NOT_FOUND
|
||||
|
||||
|
||||
def register_error_handlers(app: FastAPI) -> None:
    """Attach the service's exception handlers to ``app``.

    Two handlers are installed:

    * ``OCRServiceError`` (and subclasses): answered with the exception's own
      ``http_status`` and a JSON body holding the exception class name and
      its message.
    * any other ``Exception``: logged, then answered with a generic 500 body
      that does not include the exception text.
    """

    @app.exception_handler(OCRServiceError)
    async def _handle_service_error(_: Request, exc: OCRServiceError) -> JSONResponse:
        payload = {"error": type(exc).__name__, "message": str(exc)}
        return JSONResponse(status_code=exc.http_status, content=payload)

    @app.exception_handler(Exception)
    async def _handle_unexpected(_: Request, exc: Exception) -> JSONResponse:
        # Record the failure first; the client only ever sees the fixed body below.
        _logger.exception("api.unhandled_exception", error=str(exc))
        payload = {"error": "InternalServerError", "message": "Unexpected error"}
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content=payload,
        )
|
||||
0
src/ocr_sprint/api/routes/__init__.py
Normal file
0
src/ocr_sprint/api/routes/__init__.py
Normal file
58
src/ocr_sprint/api/routes/documents.py
Normal file
58
src/ocr_sprint/api/routes/documents.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Documents API — Phase 1 synchronous endpoint.
|
||||
|
||||
POST /documents accepts a single PDF or image upload, runs the synchronous
|
||||
pipeline inline, and returns the structured result. This is suitable for
|
||||
development and low-traffic production; Phase 4 will introduce an async
|
||||
queue and a polling-style API at the same path.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from uuid import uuid4
|
||||
|
||||
from fastapi import APIRouter, File, UploadFile, status
|
||||
|
||||
from ocr_sprint.api.errors import UnsupportedDocumentError
|
||||
from ocr_sprint.pipeline.orchestrator import run_pipeline
|
||||
from ocr_sprint.schemas.document import DocumentResponse
|
||||
from ocr_sprint.utils.logging import get_logger
|
||||
|
||||
# Router for the document endpoints; its routes are registered under the /documents prefix.
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||
# Module-level logger; create_document binds job_id and filename onto it per request.
_logger = get_logger(__name__)
|
||||
|
||||
# Largest upload create_document accepts; anything bigger is rejected before the pipeline runs.
_MAX_UPLOAD_BYTES = 25 * 1024 * 1024 # 25 MB
|
||||
|
||||
|
||||
@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse)
async def create_document(file: UploadFile = File(...)) -> DocumentResponse:
    """Run OCR + extraction synchronously on a single upload.

    Args:
        file: The uploaded file.

    Returns:
        A ``DocumentResponse`` carrying a freshly generated job id together
        with the pipeline's status, confidence, extracted data and review
        flags.

    Raises:
        UnsupportedDocumentError: If the upload is empty, is larger than
            ``_MAX_UPLOAD_BYTES``, or the pipeline rejects it with
            ``ValueError``.
    """
    job_id = uuid4()
    log = _logger.bind(job_id=str(job_id), filename=file.filename or "")

    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without pulling an arbitrarily large file into memory.
    content = await file.read(_MAX_UPLOAD_BYTES + 1)
    if not content:
        raise UnsupportedDocumentError("Uploaded file is empty.")
    if len(content) > _MAX_UPLOAD_BYTES:
        raise UnsupportedDocumentError(
            f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit."
        )

    log.info("documents.received", size=len(content))
    try:
        # NOTE(review): run_pipeline is a plain synchronous call, so the event
        # loop is blocked while it runs. Moving it to a worker thread would
        # need confirmation that the OCR engine tolerates concurrent calls.
        output = run_pipeline(content)
    except ValueError as exc:
        # The pipeline signals an unusable input with ValueError; surface it
        # to the client as a 4xx instead of the generic 500 fallback.
        raise UnsupportedDocumentError(str(exc)) from exc

    # Built once and shared by the log entry and the response body.
    review_flags = [flag.value for flag in output.result.review_flags]

    log.info(
        "documents.completed",
        status=output.status.value,
        confidence=round(output.confidence, 3),
        flags=review_flags,
    )
    return DocumentResponse(
        job_id=job_id,
        status=output.status,
        confidence=output.confidence,
        data=output.result,
        review_flags=review_flags,
    )
|
||||
15
src/ocr_sprint/api/routes/health.py
Normal file
15
src/ocr_sprint/api/routes/health.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""Liveness / readiness endpoints."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from ocr_sprint import __version__
|
||||
|
||||
# Router carrying the health endpoint; no path prefix is set here.
router = APIRouter(tags=["health"])
|
||||
|
||||
|
||||
@router.get("/health")
async def health() -> dict[str, str]:
    """Report that the process is up, together with the package version.

    Only a static payload is returned; the OCR engine is never invoked here.
    """
    payload: dict[str, str] = {"status": "ok", "version": __version__}
    return payload
|
||||
Reference in New Issue
Block a user