Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:

- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
136
pyproject.toml
Normal file
136
pyproject.toml
Normal file
@@ -0,0 +1,136 @@
|
||||
# Build requirements and PEP 517 backend: the package is built with setuptools.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
|
||||
|
||||
# Package metadata and runtime dependencies.
[project]
name = "ocr-sprint-service"
version = "0.1.0"
description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)"
readme = "README.md"
requires-python = ">=3.10,<3.13"
license = { text = "Proprietary" }
authors = [{ name = "Adrian Kuman Firmansah" }]

# Runtime dependencies, grouped by role. Optional stacks (LLM extraction,
# async pipeline) live under [project.optional-dependencies].
dependencies = [
    # Web framework
    "fastapi>=0.115,<0.116",
    "uvicorn[standard]>=0.30,<0.34",
    "python-multipart>=0.0.9",
    "pydantic>=2.7,<3",
    "pydantic-settings>=2.4,<3",
    # Image / PDF
    "pillow>=10.4,<12",
    "opencv-python-headless>=4.10,<5",
    # NOTE(review): this range admits NumPy 2.x; confirm the pinned
    # paddlepaddle / paddleocr builds below support it, otherwise cap at <2.
    "numpy>=1.26,<2.2",
    "PyMuPDF>=1.24,<2",
    "python-magic>=0.4.27",
    # OCR (CPU build of paddle; GPU users override via extra index)
    "paddlepaddle==2.6.1",
    "paddleocr>=2.7.5,<3",
    # Logging / observability
    "structlog>=24.1",
    "prometheus-client>=0.20",
    # Misc
    "httpx>=0.27",
    "tenacity>=8.5",
]
|
||||
|
||||
# Optional dependency groups ("extras"), installed on demand.
[project.optional-dependencies]
# Test, lint and type-check tooling.
# types-Pillow is intentionally not listed: the runtime requirement is
# pillow>=10.4, which ships its own inline type hints, and an installed stub
# package would take precedence over them during type checking (PEP 561).
dev = [
    "pytest>=8.2",
    "pytest-asyncio>=0.23",
    "pytest-cov>=5.0",
    "ruff>=0.6.9",
    "mypy>=1.11",
    "pre-commit>=3.7",
]

# Extraction layer (Phase 5) — kept optional so MVP install stays light
llm = [
    "ollama>=0.3",
    "instructor>=1.4",
]

# Async pipeline (Phase 4)
async-pipeline = [
    "celery[redis]>=5.4",
    "redis>=5.0",
    "minio>=7.2",
    "sqlalchemy>=2.0",
    "psycopg[binary]>=3.2",
    "alembic>=1.13",
]
|
||||
|
||||
# Console entry point: the `ocr-sprint-api` command calls `run` in `ocr_sprint.main`.
[project.scripts]
ocr-sprint-api = "ocr_sprint.main:run"
|
||||
|
||||
# src layout: setuptools discovers packages under ./src.
[tool.setuptools.packages.find]
where = ["src"]
|
||||
|
||||
# Include the `py.typed` marker file in the built `ocr_sprint` package.
[tool.setuptools.package-data]
ocr_sprint = ["py.typed"]
|
||||
|
||||
# ---------- Tooling ----------
|
||||
|
||||
# Ruff base settings shared by the linter and the formatter.
[tool.ruff]
line-length = 100
target-version = "py310"
# Source roots ruff uses when resolving imports.
src = ["src", "tests"]
|
||||
|
||||
# Enabled lint rule families and the rules deliberately switched off.
[tool.ruff.lint]
select = [
    "E", "F", "W",  # pycodestyle / pyflakes
    "I",            # isort
    "B",            # bugbear
    "UP",           # pyupgrade
    "SIM",          # simplify
    "RUF",          # ruff-specific
    "C4",           # comprehensions
    "PIE",          # flake8-pie
    "PT",           # pytest style
    "TID",          # tidy imports
]
ignore = [
    "E501",  # line length handled by formatter
    "B008",  # FastAPI Depends() pattern
]
|
||||
|
||||
# Formatter settings: string literals are normalised to double quotes.
[tool.ruff.format]
quote-style = "double"
|
||||
|
||||
# Static type checking: strict mode over the src layout, with the pydantic plugin.
[tool.mypy]
python_version = "3.10"
strict = true
# The next three flags are already enabled by `strict`; they are kept explicit
# so they stay on if `strict` is ever relaxed.
warn_unused_ignores = true
warn_redundant_casts = true
disallow_untyped_defs = true
plugins = ["pydantic.mypy"]
# Resolve modules relative to ./src rather than the repository root.
mypy_path = "src"
namespace_packages = true
explicit_package_bases = true
|
||||
|
||||
# Third-party modules for which missing type information is tolerated.
[[tool.mypy.overrides]]
module = ["paddleocr.*", "paddle.*", "cv2.*", "fitz.*", "magic.*"]
ignore_missing_imports = true
|
||||
|
||||
# Pytest configuration: strict marker/config checking, tests under ./tests.
[tool.pytest.ini_options]
minversion = "8.0"
addopts = "-ra --strict-markers --strict-config"
testpaths = ["tests"]
# pytest-asyncio option (the plugin is listed in the `dev` extra).
asyncio_mode = "auto"
# Silence DeprecationWarnings raised from the paddle / paddleocr modules.
filterwarnings = [
    "ignore::DeprecationWarning:paddle.*",
    "ignore::DeprecationWarning:paddleocr.*",
]
|
||||
|
||||
# Coverage measurement: the ocr_sprint package only, with branch coverage.
[tool.coverage.run]
source = ["src/ocr_sprint"]
branch = true
|
||||
|
||||
# Coverage reporting: lines matching these patterns are excluded from the report.
[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
]
|
||||
Reference in New Issue
Block a user