feat: implement PP-Structure table extraction pipeline with GPU runtime configuration support

2026-04-27 00:51:23 +07:00
parent 9d969e61fd
commit 6d793758ff
12 changed files with 896 additions and 31 deletions
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -28,13 +28,6 @@ from functools import partial
 from typing import Annotated
 from uuid import UUID, uuid4

-# Thread pool dedicated to blocking OCR work. Using a *separate* pool
-# (rather than the default loop executor) lets us cap the number of
-# concurrent heavy OCR jobs independently of other thread-pool users.
-# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
-# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
-_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
-
 from fastapi import (
    APIRouter,
    Depends,
@@ -73,6 +66,13 @@ from ocr_sprint.schemas.review import (
 from ocr_sprint.storage.blob import get_blob_storage
 from ocr_sprint.utils.logging import get_logger

+# Thread pool dedicated to blocking OCR work. Using a *separate* pool
+# (rather than the default loop executor) lets us cap the number of
+# concurrent heavy OCR jobs independently of other thread-pool users.
+# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
+# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
+_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
+
 router = APIRouter(
    prefix="/documents",
    tags=["documents"],
@@ -99,18 +99,17 @@ def _row_to_response(row: object) -> DocumentResponse:

    assert isinstance(row, JobRow)
    status_enum = DocumentStatus(row.status)
-    personel_list = None
+    result_obj = None
    if row.result is not None:
        result_obj = ExtractionResult.model_validate(row.result)
        # Auto-number personnel entries sequentially (1, 2, 3, ...)
        for idx, entry in enumerate(result_obj.personel, start=1):
            entry.no = idx
-        personel_list = result_obj.personel
    return DocumentResponse(
        job_id=row.job_id,
        status=status_enum,
        confidence=row.confidence,
-        data=personel_list,
+        data=result_obj,
        review_flags=list(row.review_flags or []),
        error=row.error,
        approved=bool(row.approved),
--- a/src/ocr_sprint/api/routes/health.py
+++ b/src/ocr_sprint/api/routes/health.py
@@ -6,6 +6,7 @@ from fastapi import APIRouter
 from fastapi.responses import JSONResponse

 from ocr_sprint import __version__
+from ocr_sprint.config import get_settings
 from ocr_sprint.pipeline import ocr as _ocr
 from ocr_sprint.pipeline import table as _table

@@ -21,15 +22,18 @@ async def health() -> dict[str, str]:
@router.get("/health/ready")
 async def readiness() -> JSONResponse:
    """Readiness check — returns 200 when OCR models are loaded, 503 if still warming up."""
+    settings = get_settings()
    ocr_ready = _ocr._instance is not None
-    table_ready = _table._instance is not None
+    # PP-Structure only gates readiness when table extraction is enabled;
+    # with tables disabled the table model counts as ready unconditionally.
+    table_ready = (not settings.tables_enabled) or _table._instance is not None
    ready = ocr_ready and table_ready
    payload = {
        "status": "ready" if ready else "warming_up",
        "version": __version__,
        "models": {
            "paddleocr": "ready" if ocr_ready else "loading",
-            "pp_structure": "ready" if table_ready else "loading",
+            # Three states: "disabled" takes precedence over "ready"/"loading".
+            "pp_structure": (
+                "disabled" if not settings.tables_enabled else "ready" if table_ready else "loading"
+            ),
        },
    }
    return JSONResponse(content=payload, status_code=200 if ready else 503)
--- a/src/ocr_sprint/main.py
+++ b/src/ocr_sprint/main.py
@@ -88,6 +88,17 @@ def create_app() -> FastAPI:
    )

    register_error_handlers(app)
+
+    # CORS — allow frontend dev servers and production origins.
+    #
+    # Credentials stay disabled while origins are wildcarded: the CORS
+    # middleware documentation states that "*" cannot be combined with
+    # allow_credentials=True, and reflecting every origin together with
+    # credentials would let any website issue authenticated requests.
+    # To support cookies or HTTP auth from a browser, replace the wildcard
+    # with an explicit list of trusted origins and then re-enable credentials.
+    from fastapi.middleware.cors import CORSMiddleware
+
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=False,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
    app.add_middleware(MetricsMiddleware)
    app.include_router(health.router, prefix="/api/v1")
    app.include_router(documents.router, prefix="/api/v1")
--- a/src/ocr_sprint/pipeline/ocr.py
+++ b/src/ocr_sprint/pipeline/ocr.py
@@ -122,9 +122,14 @@ class OCRPage:


 def _build_paddleocr() -> PaddleOCR:
+    s = get_settings()
+    if s.ocr_use_gpu:
+        from ocr_sprint.utils.gpu import configure_nvidia_dll_path
+
+        configure_nvidia_dll_path()
+
    from paddleocr import PaddleOCR

-    s = get_settings()
    kwargs: dict[str, object] = {
        "lang": s.ocr_lang,
        "use_angle_cls": True,
--- a/src/ocr_sprint/pipeline/table.py
+++ b/src/ocr_sprint/pipeline/table.py
@@ -67,24 +67,41 @@ class DetectedTable:
 # ---------- PP-Structure singleton ----------


+def _create_pp_structure(
+    pp_structure_cls: type[PPStructure], pp_lang: str, use_gpu: bool
+) -> PPStructure:
+    """Instantiate ``pp_structure_cls`` with this module's fixed options.
+
+    The class is passed in rather than imported here, so the caller decides
+    when ``paddleocr`` gets imported. ``pp_lang`` and ``use_gpu`` are
+    forwarded unchanged as the ``lang`` and ``use_gpu`` keyword arguments.
+    Any exception raised by the constructor propagates to the caller.
+    """
+    # layout=True so that PP-Structure also returns figure/text regions; we
+    # filter to tables only afterwards. show_log=False to keep stdout clean.
+    return pp_structure_cls(
+        lang=pp_lang,
+        use_gpu=use_gpu,
+        layout=True,
+        show_log=False,
+    )
+
+
 def _build_pp_structure() -> PPStructure:
+    """Build a PP-Structure engine from settings, retrying on CPU if GPU init fails.
+
+    When ``ocr_use_gpu`` is set, the NVIDIA DLL directories are configured
+    before ``paddleocr`` is imported. If construction then raises, the error
+    is logged as a warning and the engine is rebuilt with ``use_gpu=False``.
+    When GPU was not requested, or the CPU retry also fails, the exception
+    propagates to the caller.
+    """
+    s = get_settings()
+    if s.ocr_use_gpu:
+        from ocr_sprint.utils.gpu import configure_nvidia_dll_path
+
+        configure_nvidia_dll_path()
+
    from paddleocr import PPStructure

-    s = get_settings()
    # PPStructure layout models only support 'en' and 'ch', not 'latin'.
    # Use 'en' for layout/table detection — it's language-agnostic (detects
    # table structure, not text language). OCR within cells still works for
    # Indonesian text because the recognition model handles Latin scripts.
    pp_lang = "en" if s.ocr_lang not in ("en", "ch") else s.ocr_lang
    _logger.info("pp_structure.init", lang=pp_lang, use_gpu=s.ocr_use_gpu)
-    # layout=True so that PP-Structure also returns figure/text regions; we
-    # filter to tables only afterwards. show_log=False to keep stdout clean.
-    return PPStructure(
-        lang=pp_lang,
-        use_gpu=s.ocr_use_gpu,
-        layout=True,
-        show_log=False,
-    )
+    # Best-effort GPU: a failure is only tolerated when GPU was requested,
+    # in which case the same engine is constructed again on CPU.
+    try:
+        return _create_pp_structure(PPStructure, pp_lang, s.ocr_use_gpu)
+    except Exception as exc:
+        if not s.ocr_use_gpu:
+            raise
+        _logger.warning("pp_structure.gpu_init_failed_falling_back_cpu", error=str(exc))
+        return _create_pp_structure(PPStructure, pp_lang, False)


 def get_pp_structure() -> PPStructure:
--- a/src/ocr_sprint/schemas/document.py
+++ b/src/ocr_sprint/schemas/document.py
@@ -10,7 +10,6 @@ from uuid import UUID, uuid4
 from pydantic import BaseModel, ConfigDict, Field

 from ocr_sprint.schemas.extraction import ExtractionResult
-from ocr_sprint.schemas.personnel import PersonnelEntry


 class SourceKind(str, Enum):
@@ -53,7 +52,7 @@ class DocumentResponse(BaseModel):
    job_id: UUID
    status: DocumentStatus
    confidence: float | None = None
-    data: list[PersonnelEntry] | None = None
+    data: ExtractionResult | None = None
    review_flags: list[str] = Field(default_factory=list)
    error: str | None = None
    # Phase 6 — HITL review state.
--- a/src/ocr_sprint/utils/gpu.py
+++ b/src/ocr_sprint/utils/gpu.py
@@ -0,0 +1,57 @@
+"""GPU runtime helpers."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+_DLL_HANDLES: list[object] = []
+_CONFIGURED = False
+
+
+def configure_nvidia_dll_path() -> None:
+    """Expose NVIDIA wheel DLL directories to the Windows dynamic loader.
+
+    Paddle's Windows GPU wheels dynamically load CUDA/cuDNN DLLs by name. When
+    those DLLs come from Python packages such as ``nvidia-cudnn-cu11`` instead
+    of a system-wide CUDA Toolkit install, their ``bin`` folders are not on
+    ``PATH`` by default.
+
+    Every ``bin`` directory found under the known NVIDIA packages is
+    registered via ``os.add_dll_directory`` (when that API exists) and
+    prepended to ``PATH`` unless an equivalent entry is already present.
+    Packages that are missing or fail to import are skipped. The function
+    does nothing off Windows, and nothing after its first completed run.
+    """
+    global _CONFIGURED
+    if _CONFIGURED or os.name != "nt":
+        return
+
+    # Imported locally: only needed on this one-time, Windows-only path.
+    import importlib
+
+    def _normalise(entry: str) -> str:
+        # Comparison key that ignores letter case, separator style and a
+        # trailing slash, so equivalent PATH entries are not added twice.
+        return os.path.normcase(os.path.normpath(entry)).casefold()
+
+    package_names = ("nvidia.cudnn", "nvidia.cublas", "nvidia.cuda_nvrtc")
+    dll_dirs: list[Path] = []
+    for package_name in package_names:
+        try:
+            module = importlib.import_module(package_name)
+        except Exception:
+            # Best effort: an absent or broken optional wheel must not
+            # prevent the remaining packages from being configured.
+            continue
+        # Regular packages expose __file__; namespace packages (no
+        # __init__.py) only expose __path__, so consult both.
+        roots: list[Path] = []
+        module_file = getattr(module, "__file__", None)
+        if module_file:
+            roots.append(Path(module_file).resolve().parent)
+        roots.extend(Path(entry).resolve() for entry in getattr(module, "__path__", ()))
+        for root in roots:
+            dll_dir = root / "bin"
+            if dll_dir.is_dir() and dll_dir not in dll_dirs:
+                dll_dirs.append(dll_dir)
+
+    if dll_dirs:
+        current_path = os.environ.get("PATH", "")
+        known = {_normalise(part) for part in current_path.split(os.pathsep) if part}
+        # Looked up defensively because the attribute may be absent.
+        add_dll_directory = getattr(os, "add_dll_directory", None)
+
+        prepend: list[str] = []
+        for dll_dir in dll_dirs:
+            dll_dir_str = str(dll_dir)
+            if _normalise(dll_dir_str) not in known:
+                prepend.append(dll_dir_str)
+            if add_dll_directory is not None:
+                # Retain the handle returned by the loader API.
+                _DLL_HANDLES.append(add_dll_directory(dll_dir_str))
+
+        if prepend:
+            # Append the previous value only when it is non-empty, so an
+            # unset PATH does not gain a dangling empty entry.
+            tail = [current_path] if current_path else []
+            os.environ["PATH"] = os.pathsep.join([*prepend, *tail])
+
+    _CONFIGURED = True