Files
OCR-SPRIN-SERVICE/pyproject.toml
Devin AI ca0c0a0428 Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
  denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
  header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
  signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess,
  ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
  Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and
  6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00

137 lines
3.1 KiB
TOML

# PEP 517 build configuration: setuptools produces both the sdist and the wheel.
# `wheel` is deliberately not listed in `requires`; the setuptools backend
# requests it by itself whenever a wheel build actually needs it.
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"
# Core package metadata (PEP 621) and runtime dependencies.
[project]
name = "ocr-sprint-service"
version = "0.1.0"
description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)"
readme = "README.md"
requires-python = ">=3.10,<3.13"
license = { text = "Proprietary" }
authors = [{ name = "Adrian Kuman Firmansah" }]
dependencies = [
    # Web framework
    "fastapi>=0.115,<0.116",
    "uvicorn[standard]>=0.30,<0.34",
    # Floor raised from 0.0.9: releases before 0.0.18 have a known
    # denial-of-service issue in multipart body parsing, and this service
    # accepts file uploads.
    "python-multipart>=0.0.18",
    "pydantic>=2.7,<3",
    "pydantic-settings>=2.4,<3",
    # Image / PDF
    "pillow>=10.4,<12",
    "opencv-python-headless>=4.10,<5",
    # Capped below 2.0: the pinned paddlepaddle 2.6.1 was released before
    # NumPy 2.0, so the OCR stack is kept on the NumPy 1.x line.
    "numpy>=1.26,<2",
    "PyMuPDF>=1.24,<2",
    "python-magic>=0.4.27",
    # OCR (CPU build of paddle; GPU users override via extra index)
    "paddlepaddle==2.6.1",
    "paddleocr>=2.7.5,<3",
    # Logging / observability
    "structlog>=24.1",
    "prometheus-client>=0.20",
    # Misc
    "httpx>=0.27",
    "tenacity>=8.5",
]
# Optional extras, installed with e.g. `pip install ".[dev]"`.
[project.optional-dependencies]
# Test, lint and type-check tooling.
# `types-Pillow` is intentionally absent: the Pillow versions required by this
# project ship their own inline type hints, and a separate stub package would
# take precedence over them during type checking.
dev = [
    "pytest>=8.2",
    "pytest-asyncio>=0.23",
    "pytest-cov>=5.0",
    "ruff>=0.6.9",
    "mypy>=1.11",
    "pre-commit>=3.7",
]
# Extraction layer (Phase 5) — kept optional so MVP install stays light
llm = [
    "ollama>=0.3",
    "instructor>=1.4",
]
# Async pipeline (Phase 4)
async-pipeline = [
    "celery[redis]>=5.4",
    "redis>=5.0",
    "minio>=7.2",
    "sqlalchemy>=2.0",
    "psycopg[binary]>=3.2",
    "alembic>=1.13",
]
# Console entry points created at install time: the `ocr-sprint-api` command
# calls the `run` object exposed by the `ocr_sprint.main` module.
[project.scripts]
ocr-sprint-api = "ocr_sprint.main:run"
# src layout: packages are discovered under the `src/` directory.
[tool.setuptools.packages.find]
where = ["src"]

# Include the PEP 561 `py.typed` marker file in the built distribution.
[tool.setuptools.package-data]
ocr_sprint = ["py.typed"]
# ---------- Tooling ----------
# Ruff: linter and formatter settings shared by source and test code.
[tool.ruff]
src = ["src", "tests"]
target-version = "py310"
line-length = 100

[tool.ruff.lint]
# Enabled rule families, one per line.
select = [
    "B",   # bugbear
    "C4",  # comprehensions
    "E",   # pycodestyle errors
    "F",   # pyflakes
    "I",   # isort
    "PIE",
    "PT",  # pytest style
    "RUF", # ruff-specific
    "SIM", # simplify
    "TID", # tidy imports
    "UP",  # pyupgrade
    "W",   # pycodestyle warnings
]
ignore = [
    "B008", # FastAPI Depends() pattern
    "E501", # line length handled by formatter
]

[tool.ruff.format]
quote-style = "double"
# mypy: strict type checking of the code under `src/`.
[tool.mypy]
# Must remain a string: as a TOML float, 3.10 would be read as 3.1.
python_version = "3.10"
plugins = ["pydantic.mypy"]

# Import resolution for the src layout.
mypy_path = "src"
explicit_package_bases = true
namespace_packages = true

# Strictness switches, spelled out explicitly alongside `strict`.
strict = true
disallow_untyped_defs = true
warn_redundant_casts = true
warn_unused_ignores = true

# Do not report missing type information for these third-party modules.
[[tool.mypy.overrides]]
module = [
    "paddleocr.*",
    "paddle.*",
    "cv2.*",
    "fitz.*",
    "magic.*",
]
ignore_missing_imports = true
# pytest: test discovery, default command-line options and warning filters.
[tool.pytest.ini_options]
minversion = "8.0"
testpaths = ["tests"]
# Async tests are collected without needing an explicit asyncio marker.
asyncio_mode = "auto"
# -ra prints a short summary for every non-passing test; the strict flags turn
# unregistered markers and invalid configuration into errors.
addopts = ["-ra", "--strict-markers", "--strict-config"]
# Hide deprecation warnings raised from inside the Paddle packages.
filterwarnings = [
    "ignore::DeprecationWarning:paddle.*",
    "ignore::DeprecationWarning:paddleocr.*",
]
# coverage.py: measure branch coverage of the package sources only.
[tool.coverage.run]
branch = true
source = ["src/ocr_sprint"]

[tool.coverage.report]
# Regular expressions; any line matching one is left out of the report.
exclude_lines = [
    'pragma: no cover',
    'raise NotImplementedError',
    'if TYPE_CHECKING:',
]