Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
52 lines
1.6 KiB
Docker
52 lines
1.6 KiB
Docker
# syntax=docker/dockerfile:1.6
|
|
# CPU-only image for the OCR Sprint API.
|
|
# PaddleOCR + PyMuPDF + OpenCV-headless work on plain Debian without poppler.
|
|
# ----- base layer (shared by the builder and runtime stages) -----
FROM python:3.11-slim AS base

# Python / pip behaviour that should hold both at build time and at runtime.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

# Build-time only: keeps apt-get from prompting during the install below.
# Declared as ARG rather than ENV so it is not baked into the environment of
# the running container.
ARG DEBIAN_FRONTEND=noninteractive

# System deps for OpenCV, libmagic, PaddlePaddle, and image format support.
# curl is required by the HEALTHCHECK defined in the runtime stage.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        libmagic1 \
        libsm6 \
        libxext6 \
        libxrender1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
|
|
|
|
# ----- builder layer (installs the project and its Python dependencies) -----
# The runtime stage copies site-packages and /usr/local/bin out of this stage.
FROM base AS builder

# What pip installs. The default keeps the existing behaviour (project plus
# the "dev" extra). Build with `--build-arg INSTALL_SPEC=.` to leave the dev
# extra out of the image.
ARG INSTALL_SPEC=".[dev]"

# NOTE: src/ is copied before the install, so any source change re-runs the
# pip install below; only pyproject.toml/README.md/src/ affect this layer.
COPY pyproject.toml README.md ./
COPY src/ ./src/

RUN pip install --upgrade pip && pip install "${INSTALL_SPEC}"
|
|
|
|
# ----- runtime layer -----
FROM base AS runtime

# Create the unprivileged user first, together with the directories it must be
# able to write to (PaddleOCR cache dir for models written on first run, and
# /app/storage). Doing this before the source is copied keeps the layer
# cacheable across source changes and means the chown only touches empty dirs.
RUN useradd --create-home --uid 1000 app \
    && mkdir -p /home/app/.paddleocr /app/storage \
    && chown -R app:app /home/app /app

# Installed packages and console scripts (e.g. uvicorn) from the builder stage.
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Project files, owned by the app user at copy time instead of via a later
# recursive chown (which would duplicate the files in an extra layer).
COPY --chown=app:app pyproject.toml README.md ./
COPY --chown=app:app src/ ./src/

USER app

# Documentation only; matches the port uvicorn binds to below.
EXPOSE 8000

# Probe the API health endpoint; curl is installed in the base stage.
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
    CMD curl -fsS http://localhost:8000/api/v1/health || exit 1

# Exec form so uvicorn runs as PID 1 and receives stop signals directly.
CMD ["uvicorn", "ocr_sprint.main:app", "--host", "0.0.0.0", "--port", "8000"]
|