feat: implement PP-Structure table extraction pipeline with GPU runtime configuration support
This commit is contained in:
@@ -28,13 +28,6 @@ from functools import partial
|
||||
from typing import Annotated
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
# Thread pool dedicated to blocking OCR work. Using a *separate* pool
|
||||
# (rather than the default loop executor) lets us cap the number of
|
||||
# concurrent heavy OCR jobs independently of other thread-pool users.
|
||||
# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
|
||||
# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
|
||||
_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
|
||||
|
||||
from fastapi import (
|
||||
APIRouter,
|
||||
Depends,
|
||||
@@ -73,6 +66,13 @@ from ocr_sprint.schemas.review import (
|
||||
from ocr_sprint.storage.blob import get_blob_storage
|
||||
from ocr_sprint.utils.logging import get_logger
|
||||
|
||||
# Thread pool dedicated to blocking OCR work. Using a *separate* pool
|
||||
# (rather than the default loop executor) lets us cap the number of
|
||||
# concurrent heavy OCR jobs independently of other thread-pool users.
|
||||
# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
|
||||
# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
|
||||
_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
|
||||
|
||||
router = APIRouter(
|
||||
prefix="/documents",
|
||||
tags=["documents"],
|
||||
@@ -99,18 +99,17 @@ def _row_to_response(row: object) -> DocumentResponse:
|
||||
|
||||
assert isinstance(row, JobRow)
|
||||
status_enum = DocumentStatus(row.status)
|
||||
personel_list = None
|
||||
result_obj = None
|
||||
if row.result is not None:
|
||||
result_obj = ExtractionResult.model_validate(row.result)
|
||||
# Auto-number personnel entries sequentially (1, 2, 3, ...)
|
||||
for idx, entry in enumerate(result_obj.personel, start=1):
|
||||
entry.no = idx
|
||||
personel_list = result_obj.personel
|
||||
return DocumentResponse(
|
||||
job_id=row.job_id,
|
||||
status=status_enum,
|
||||
confidence=row.confidence,
|
||||
data=personel_list,
|
||||
data=result_obj,
|
||||
review_flags=list(row.review_flags or []),
|
||||
error=row.error,
|
||||
approved=bool(row.approved),
|
||||
|
||||
@@ -6,6 +6,7 @@ from fastapi import APIRouter
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from ocr_sprint import __version__
|
||||
from ocr_sprint.config import get_settings
|
||||
from ocr_sprint.pipeline import ocr as _ocr
|
||||
from ocr_sprint.pipeline import table as _table
|
||||
|
||||
@@ -21,15 +22,18 @@ async def health() -> dict[str, str]:
|
||||
@router.get("/health/ready")
async def readiness() -> JSONResponse:
    """Readiness check — 200 once every required model is loaded, 503 while warming up.

    PaddleOCR is always required. PP-Structure only counts towards readiness
    when ``settings.tables_enabled`` is true; otherwise it is reported as
    ``"disabled"`` and does not hold the service back.

    Returns:
        A ``JSONResponse`` whose body carries the overall ``status``, the
        package ``version`` and a per-model state under ``models``.
    """
    settings = get_settings()
    ocr_ready = _ocr._instance is not None
    # With tables disabled the PP-Structure singleton is not required, so it
    # must not keep the service in "warming_up".
    table_ready = (not settings.tables_enabled) or _table._instance is not None
    ready = ocr_ready and table_ready

    if not settings.tables_enabled:
        pp_structure_state = "disabled"
    elif table_ready:
        pp_structure_state = "ready"
    else:
        pp_structure_state = "loading"

    payload = {
        "status": "ready" if ready else "warming_up",
        "version": __version__,
        "models": {
            "paddleocr": "ready" if ocr_ready else "loading",
            "pp_structure": pp_structure_state,
        },
    }
    return JSONResponse(content=payload, status_code=200 if ready else 503)
|
||||
|
||||
@@ -88,6 +88,17 @@ def create_app() -> FastAPI:
|
||||
)
|
||||
|
||||
register_error_handlers(app)
|
||||
|
||||
# CORS — allow frontend dev servers and production origins
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
app.add_middleware(MetricsMiddleware)
|
||||
app.include_router(health.router, prefix="/api/v1")
|
||||
app.include_router(documents.router, prefix="/api/v1")
|
||||
|
||||
@@ -122,9 +122,14 @@ class OCRPage:
|
||||
|
||||
|
||||
def _build_paddleocr() -> PaddleOCR:
|
||||
s = get_settings()
|
||||
if s.ocr_use_gpu:
|
||||
from ocr_sprint.utils.gpu import configure_nvidia_dll_path
|
||||
|
||||
configure_nvidia_dll_path()
|
||||
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
s = get_settings()
|
||||
kwargs: dict[str, object] = {
|
||||
"lang": s.ocr_lang,
|
||||
"use_angle_cls": True,
|
||||
|
||||
@@ -67,24 +67,41 @@ class DetectedTable:
|
||||
# ---------- PP-Structure singleton ----------
|
||||
|
||||
|
||||
def _create_pp_structure(
    pp_structure_cls: type[PPStructure], pp_lang: str, use_gpu: bool
) -> PPStructure:
    """Instantiate ``pp_structure_cls`` with the options this pipeline relies on.

    Args:
        pp_structure_cls: The PP-Structure class (or a compatible callable) to
            instantiate.
        pp_lang: Language code forwarded as ``lang``.
        use_gpu: Forwarded as ``use_gpu``.

    Returns:
        The freshly constructed engine instance.
    """
    engine_options = {
        "lang": pp_lang,
        "use_gpu": use_gpu,
        # Layout analysis stays on so figure/text regions are returned as
        # well; callers keep only the table regions afterwards.
        "layout": True,
        # Silence the engine's own logging to keep stdout clean.
        "show_log": False,
    }
    return pp_structure_cls(**engine_options)
|
||||
|
||||
|
||||
def _build_pp_structure() -> PPStructure:
    """Build the PP-Structure engine, retrying on CPU when GPU start-up fails.

    Returns:
        A configured ``PPStructure`` instance.

    Raises:
        Exception: Whatever the engine constructor raises when GPU use is
            disabled, or when the CPU retry fails as well.
    """
    s = get_settings()
    if s.ocr_use_gpu:
        # Register the NVIDIA wheel DLL directories before paddleocr is
        # imported below (no-op outside Windows).
        from ocr_sprint.utils.gpu import configure_nvidia_dll_path

        configure_nvidia_dll_path()

    from paddleocr import PPStructure

    # PPStructure layout models only support 'en' and 'ch', not 'latin'.
    # Use 'en' for layout/table detection — it's language-agnostic (detects
    # table structure, not text language). OCR within cells still works for
    # Indonesian text because the recognition model handles Latin scripts.
    pp_lang = "en" if s.ocr_lang not in ("en", "ch") else s.ocr_lang
    _logger.info("pp_structure.init", lang=pp_lang, use_gpu=s.ocr_use_gpu)

    try:
        return _create_pp_structure(PPStructure, pp_lang, s.ocr_use_gpu)
    except Exception as exc:
        # Only a failed GPU start-up is recoverable; a CPU failure has no
        # further fallback and is propagated unchanged.
        if not s.ocr_use_gpu:
            raise
        _logger.warning("pp_structure.gpu_init_failed_falling_back_cpu", error=str(exc))
        return _create_pp_structure(PPStructure, pp_lang, False)
|
||||
|
||||
|
||||
def get_pp_structure() -> PPStructure:
|
||||
|
||||
@@ -10,7 +10,6 @@ from uuid import UUID, uuid4
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
from ocr_sprint.schemas.extraction import ExtractionResult
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
|
||||
class SourceKind(str, Enum):
|
||||
@@ -53,7 +52,7 @@ class DocumentResponse(BaseModel):
|
||||
job_id: UUID
|
||||
status: DocumentStatus
|
||||
confidence: float | None = None
|
||||
data: list[PersonnelEntry] | None = None
|
||||
data: ExtractionResult | None = None
|
||||
review_flags: list[str] = Field(default_factory=list)
|
||||
error: str | None = None
|
||||
# Phase 6 — HITL review state.
|
||||
|
||||
57
src/ocr_sprint/utils/gpu.py
Normal file
57
src/ocr_sprint/utils/gpu.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""GPU runtime helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Handles returned by os.add_dll_directory(); retained for the process lifetime.
_DLL_HANDLES: list[object] = []
# Flipped to True once the Windows-only setup has completed.
_CONFIGURED = False


def _find_nvidia_dll_dirs(package_names: tuple[str, ...]) -> list[Path]:
    """Return the existing ``bin`` directories of the importable packages."""
    import importlib

    dll_dirs: list[Path] = []
    for package_name in package_names:
        try:
            module = importlib.import_module(package_name)
        except Exception:
            # Best effort: a package that is missing or fails to import
            # simply contributes no directory.
            continue
        module_file = getattr(module, "__file__", None)
        if module_file:
            roots = [Path(module_file).resolve().parent]
        else:
            # Packages without an __init__.py expose no __file__; fall back
            # to the directories listed in __path__.
            roots = [Path(entry).resolve() for entry in getattr(module, "__path__", ())]
        for root in roots:
            dll_dir = root / "bin"
            if dll_dir.is_dir() and dll_dir not in dll_dirs:
                dll_dirs.append(dll_dir)
    return dll_dirs


def configure_nvidia_dll_path() -> None:
    """Expose NVIDIA wheel DLL directories to the Windows dynamic loader.

    Paddle's Windows GPU wheels dynamically load CUDA/cuDNN DLLs by name. When
    those DLLs come from Python packages such as ``nvidia-cudnn-cu11`` instead
    of a system-wide CUDA Toolkit install, their ``bin`` folders are not on
    ``PATH`` by default.

    Each discovered ``bin`` directory is registered through
    ``os.add_dll_directory`` (when available) and prepended to ``PATH`` unless
    it is already listed there. The function does nothing on non-Windows
    systems and after a completed run.
    """
    global _CONFIGURED
    if _CONFIGURED or os.name != "nt":
        return

    dll_dirs = _find_nvidia_dll_dirs(("nvidia.cudnn", "nvidia.cublas", "nvidia.cuda_nvrtc"))
    if not dll_dirs:
        _CONFIGURED = True
        return

    current_path = os.environ.get("PATH", "")
    # Case-insensitive comparison of PATH entries.
    known_entries = {part.casefold() for part in current_path.split(os.pathsep) if part}
    add_dll_directory = getattr(os, "add_dll_directory", None)

    prepend: list[str] = []
    for dll_dir in dll_dirs:
        dll_dir_str = str(dll_dir)
        if dll_dir_str.casefold() not in known_entries:
            prepend.append(dll_dir_str)
        if add_dll_directory is not None:
            _DLL_HANDLES.append(add_dll_directory(dll_dir_str))

    if prepend:
        # Skip an empty PATH so the result carries no trailing separator
        # (which would add an empty search entry).
        parts = [*prepend, current_path] if current_path else prepend
        os.environ["PATH"] = os.pathsep.join(parts)

    _CONFIGURED = True
|
||||
Reference in New Issue
Block a user