"""Blob storage abstraction. The MVP only ships a local-filesystem backend. The `BlobStorage` Protocol is deliberately small (put / get / exists / delete) so that an S3- or MinIO- backed implementation can be dropped in later without touching API code. Layout on disk: {blob_storage_dir}/ 2026/04/25/ . The date hierarchy keeps the directory listing manageable when the service processes thousands of documents per day, and makes manual rsync-based backup straightforward. """ from __future__ import annotations from datetime import datetime, timezone from pathlib import Path from typing import BinaryIO, Protocol from uuid import uuid4 from ocr_sprint.config import get_settings from ocr_sprint.utils.logging import get_logger _logger = get_logger(__name__) # Map of upload extensions we'll honor when persisting blobs. Anything else # falls back to `.bin` and the OCR pipeline's magic-byte sniffing handles # the actual content kind. _KNOWN_EXTS = {".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff", ".webp"} class BlobStorage(Protocol): """Minimal interface a blob backend must satisfy.""" def put(self, content: bytes, original_filename: str | None = None) -> str: """Persist `content` and return an opaque key the caller can use later.""" def get(self, key: str) -> bytes: """Return the raw bytes for `key`. Raises FileNotFoundError on miss.""" def open(self, key: str) -> BinaryIO: """Return a binary file-like object for streaming reads.""" def exists(self, key: str) -> bool: """True if `key` is currently stored.""" def delete(self, key: str) -> None: """Remove a blob. No-op if it doesn't exist.""" class LocalFsBlobStorage: """Filesystem-backed implementation rooted at `base_dir`.""" def __init__(self, base_dir: Path) -> None: # Resolve once so every subsequent path comparison (escape check, # empty-dir cleanup) is apples-to-apples — ``Path.parents`` of a # resolved key would otherwise never equal a relative ``base_dir``. base_dir.mkdir(parents=True, exist_ok=True) self.base_dir = base_dir.resolve() # ---------- helpers ---------- @staticmethod def _safe_ext(original_filename: str | None) -> str: if not original_filename: return ".bin" suffix = Path(original_filename).suffix.lower() return suffix if suffix in _KNOWN_EXTS else ".bin" def _resolve(self, key: str) -> Path: # Defensive: keys come from the DB but we still reject paths that try # to escape the blob root. ``Path.is_relative_to`` does proper path # containment — string ``startswith`` would let ``/app/blobs_evil`` # slip past when the root is ``/app/blobs``. candidate = (self.base_dir / key).resolve() if not candidate.is_relative_to(self.base_dir): raise ValueError(f"Blob key escapes storage root: {key!r}") return candidate # ---------- BlobStorage protocol ---------- def put(self, content: bytes, original_filename: str | None = None) -> str: now = datetime.now(timezone.utc) date_dir = Path(f"{now:%Y/%m/%d}") ext = self._safe_ext(original_filename) key = str(date_dir / f"{uuid4().hex}{ext}") target = self._resolve(key) target.parent.mkdir(parents=True, exist_ok=True) # Write to a temp file in the same directory then rename. This avoids # a half-written blob being read by a concurrent worker. tmp = target.with_suffix(target.suffix + ".tmp") tmp.write_bytes(content) tmp.rename(target) _logger.info("blob.put", key=key, size=len(content)) return key def get(self, key: str) -> bytes: path = self._resolve(key) if not path.exists(): raise FileNotFoundError(f"Blob not found: {key}") return path.read_bytes() def open(self, key: str) -> BinaryIO: path = self._resolve(key) if not path.exists(): raise FileNotFoundError(f"Blob not found: {key}") return path.open("rb") def exists(self, key: str) -> bool: try: return self._resolve(key).exists() except ValueError: return False def delete(self, key: str) -> None: try: path = self._resolve(key) except ValueError: return if path.exists(): path.unlink() _logger.info("blob.delete", key=key) # Best-effort cleanup of empty date dirs so we don't accumulate # 365 directories per year forever. ``self.base_dir`` is already # resolved (see __init__), so it can be compared against # ``path.parents`` directly. for parent in path.parents: if parent == self.base_dir or self.base_dir not in parent.parents: break try: parent.rmdir() except OSError: break def get_blob_storage() -> BlobStorage: """Build the configured blob backend. Single-process cache lives in `Settings`.""" s = get_settings() return LocalFsBlobStorage(s.blob_storage_dir) __all__ = ["BlobStorage", "LocalFsBlobStorage", "get_blob_storage"]