# Implements the foundation of the OCR Sprint service:
# - FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
# - Pydantic v2 schemas for documents, extraction result, personnel
# - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
#   denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
#   header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
#   signatory NRP, master-pangkat validation, confidence scoring + routing.
# - Tests: 61 unit tests covering regex rules, validators, preprocess,
#   ingest, confidence, and API contract (PaddleOCR mocked).
# - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
#   Dockerfile, docker-compose, Makefile.
# - Docs: README + docs/architecture.md (full hybrid stack rationale and
#   6-phase roadmap).
#
# Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
# File view metadata: 137 lines, 3.1 KiB, TOML
# PEP 517/518 build configuration: the package is built with setuptools.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

# Core project metadata (PEP 621) and runtime dependencies.
[project]
name = "ocr-sprint-service"
version = "0.1.0"
description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)"
readme = "README.md"
requires-python = ">=3.10,<3.13"
license = { text = "Proprietary" }
authors = [{ name = "Adrian Kuman Firmansah" }]

dependencies = [
    # Web framework
    "fastapi>=0.115,<0.116",
    "uvicorn[standard]>=0.30,<0.34",
    "python-multipart>=0.0.9",
    "pydantic>=2.7,<3",
    "pydantic-settings>=2.4,<3",
    # Image / PDF
    "pillow>=10.4,<12",
    "opencv-python-headless>=4.10,<5",
    # paddlepaddle is pinned to 2.6.1 below, a release that predates NumPy 2.0,
    # so NumPy is held on the 1.x line.
    # NOTE(review): re-verify against the paddle/paddleocr pins before widening.
    "numpy>=1.26,<2",
    "PyMuPDF>=1.24,<2",
    "python-magic>=0.4.27",
    # OCR (CPU build of paddle; GPU users override via extra index)
    "paddlepaddle==2.6.1",
    "paddleocr>=2.7.5,<3",
    # Logging / observability
    "structlog>=24.1",
    "prometheus-client>=0.20",
    # Misc
    "httpx>=0.27",
    "tenacity>=8.5",
]

# Optional extras, installed with e.g. `pip install ".[dev]"`.
[project.optional-dependencies]
# Test, lint and type-check tooling.
dev = [
    "pytest>=8.2",
    "pytest-asyncio>=0.23",
    "pytest-cov>=5.0",
    "ruff>=0.6.9",
    "mypy>=1.11",
    # NOTE(review): recent Pillow releases ship their own type hints; confirm
    # these stubs are still wanted alongside the pillow>=10.4 runtime pin.
    "types-Pillow",
    "pre-commit>=3.7",
]

# Extraction layer (Phase 5) — kept optional so MVP install stays light
llm = [
    "ollama>=0.3",
    "instructor>=1.4",
]

# Async pipeline (Phase 4)
async-pipeline = [
    "celery[redis]>=5.4",
    "redis>=5.0",
    "minio>=7.2",
    "sqlalchemy>=2.0",
    "psycopg[binary]>=3.2",
    "alembic>=1.13",
]

# Console entry point: installs an `ocr-sprint-api` command that invokes
# the `run` callable in the `ocr_sprint.main` module.
[project.scripts]
ocr-sprint-api = "ocr_sprint.main:run"

# src layout: setuptools discovers packages under ./src only.
[tool.setuptools.packages.find]
where = ["src"]

# Ship the PEP 561 `py.typed` marker inside the `ocr_sprint` package.
[tool.setuptools.package-data]
ocr_sprint = ["py.typed"]

# Tooling configuration

# Ruff base settings shared by the linter and the formatter.
[tool.ruff]
line-length = 100
# Matches the lower bound of `requires-python`.
target-version = "py310"
src = ["src", "tests"]

# Enabled lint rule families and the project-wide exceptions.
[tool.ruff.lint]
select = [
    "E",   # pycodestyle errors
    "F",   # pyflakes
    "W",   # pycodestyle warnings
    "I",   # isort
    "B",   # bugbear
    "UP",  # pyupgrade
    "SIM", # simplify
    "RUF", # ruff-specific
    "C4",  # comprehensions
    "PIE", # flake8-pie
    "PT",  # pytest style
    "TID", # tidy imports
]
ignore = [
    "E501", # line length handled by formatter
    "B008", # FastAPI Depends() pattern
]

# Formatter: normalize string quotes to double quotes.
[tool.ruff.format]
quote-style = "double"

# Static type checking in strict mode over the src layout.
[tool.mypy]
python_version = "3.10"
strict = true
# The next three flags are part of mypy's strict set already; they are kept
# explicit so they stay on if `strict` is ever relaxed.
warn_unused_ignores = true
warn_redundant_casts = true
disallow_untyped_defs = true
plugins = ["pydantic.mypy"]
# Resolve modules from ./src, treating directories as explicit package bases.
mypy_path = "src"
namespace_packages = true
explicit_package_bases = true

# Do not report missing stubs/modules for these third-party imports.
[[tool.mypy.overrides]]
module = ["paddleocr.*", "paddle.*", "cv2.*", "fitz.*", "magic.*"]
ignore_missing_imports = true

# Pytest configuration.
[tool.pytest.ini_options]
minversion = "8.0"
# -ra: summarize every non-passing outcome; unregistered markers and config
# warnings (e.g. unknown ini keys) are treated as errors.
addopts = "-ra --strict-markers --strict-config"
testpaths = ["tests"]
# Provided by pytest-asyncio (dev extra); because of --strict-config the
# plugin must be installed for this key to be accepted.
asyncio_mode = "auto"
# Silence DeprecationWarnings raised from the paddle / paddleocr packages.
filterwarnings = [
    "ignore::DeprecationWarning:paddle.*",
    "ignore::DeprecationWarning:paddleocr.*",
]

# Coverage measurement: limited to the package sources, with branch coverage.
[tool.coverage.run]
source = ["src/ocr_sprint"]
branch = true

# Coverage reporting: lines matching these patterns are excluded from the report.
[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
]