Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
73 lines
2.1 KiB
Python
73 lines
2.1 KiB
Python
"""Application settings loaded from environment / .env file."""
|
|
|
|
from __future__ import annotations

from functools import lru_cache
from pathlib import Path

from pydantic import Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
class Settings(BaseSettings):
    """Runtime configuration. Override via environment variables or a .env file.

    Values are read from the process environment and from ``.env`` (UTF-8).
    Variable names are matched case-insensitively and unknown variables are
    ignored, as configured in ``model_config`` below.

    Raises:
        pydantic.ValidationError: if a value cannot be coerced to its declared
            type, a confidence threshold lies outside ``[0.0, 1.0]``, or
            ``confidence_needs_review`` is greater than
            ``confidence_auto_approve``.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # App
    app_env: str = "local"
    app_host: str = "0.0.0.0"
    app_port: int = 8000
    app_log_level: str = "INFO"

    # Storage (Phase 1: local fs)
    storage_local_dir: Path = Path("./storage")

    # OCR
    ocr_lang: str = "latin"
    ocr_use_gpu: bool = False
    # Optional model directory overrides; None leaves the choice to the OCR backend.
    ocr_det_model_dir: str | None = None
    ocr_rec_model_dir: str | None = None
    ocr_cls_model_dir: str | None = None
    ocr_max_image_side: int = 2200

    # Preprocessing
    preprocess_target_dpi: int = 300
    preprocess_denoise: bool = True
    preprocess_deskew: bool = True
    preprocess_adaptive_threshold: bool = False

    # Confidence thresholds (Phase 5 routing)
    # Both are fractions in [0, 1]; their relative order is enforced by
    # _check_confidence_thresholds below.
    confidence_auto_approve: float = Field(0.95, ge=0.0, le=1.0)
    confidence_needs_review: float = Field(0.85, ge=0.0, le=1.0)

    # LLM (Phase 5)
    llm_enabled: bool = False
    llm_provider: str = "ollama"
    llm_model: str = "qwen2.5:1.5b"
    llm_base_url: str = "http://localhost:11434"
    llm_timeout_s: int = 60

    # Async pipeline (Phase 4)
    queue_enabled: bool = False
    redis_url: str = "redis://localhost:6379/0"
    database_url: str = "postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint"
    minio_endpoint: str = "localhost:9000"
    # NOTE(review): these look like local-development credentials; presumably
    # they are overridden via the environment in any shared deployment.
    minio_access_key: str = "minioadmin"
    minio_secret_key: str = "minioadmin"
    minio_bucket: str = "ocr-sprint"
    minio_secure: bool = False

    @model_validator(mode="after")
    def _check_confidence_thresholds(self) -> Settings:
        """Reject configurations whose review threshold exceeds the approve threshold.

        Each threshold is individually range-checked by its ``Field``; this
        cross-field check catches the remaining inconsistent case, which would
        otherwise leave the two routing bands inverted.
        """
        if self.confidence_needs_review > self.confidence_auto_approve:
            raise ValueError(
                "confidence_needs_review "
                f"({self.confidence_needs_review}) must not exceed "
                f"confidence_auto_approve ({self.confidence_auto_approve})"
            )
        return self
|
|
|
|
|
|
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance, building it on first call.

    The result is memoised by ``lru_cache``, so later calls return the same
    object without re-reading the configuration. On the first call, the
    configured local storage directory is created (including any missing
    parents) if it does not already exist.
    """
    config = Settings()
    storage_root = config.storage_local_dir
    # exist_ok=True: an already-present directory is not an error.
    storage_root.mkdir(parents=True, exist_ok=True)
    return config
|