feat: implement PP-Structure table extraction pipeline with GPU runtime configuration support

2026-04-27 00:51:23 +07:00
parent 9d969e61fd
commit 6d793758ff
12 changed files with 896 additions and 31 deletions
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -28,13 +28,6 @@ from functools import partial
 from typing import Annotated
 from uuid import UUID, uuid4

-# Thread pool dedicated to blocking OCR work. Using a *separate* pool
-# (rather than the default loop executor) lets us cap the number of
-# concurrent heavy OCR jobs independently of other thread-pool users.
-# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
-# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
-_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
-
 from fastapi import (
    APIRouter,
    Depends,
@@ -73,6 +66,13 @@ from ocr_sprint.schemas.review import (
 from ocr_sprint.storage.blob import get_blob_storage
 from ocr_sprint.utils.logging import get_logger

+# Thread pool dedicated to blocking OCR work. Using a *separate* pool
+# (rather than the default loop executor) lets us cap the number of
+# concurrent heavy OCR jobs independently of other thread-pool users.
+# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
+# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
+_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
+
 router = APIRouter(
    prefix="/documents",
    tags=["documents"],
@@ -99,18 +99,17 @@ def _row_to_response(row: object) -> DocumentResponse:

    assert isinstance(row, JobRow)
    status_enum = DocumentStatus(row.status)
-    personel_list = None
+    result_obj = None
    if row.result is not None:
        result_obj = ExtractionResult.model_validate(row.result)
        # Auto-number personnel entries sequentially (1, 2, 3, ...)
        for idx, entry in enumerate(result_obj.personel, start=1):
            entry.no = idx
-        personel_list = result_obj.personel
    return DocumentResponse(
        job_id=row.job_id,
        status=status_enum,
        confidence=row.confidence,
-        data=personel_list,
+        data=result_obj,
        review_flags=list(row.review_flags or []),
        error=row.error,
        approved=bool(row.approved),
--- a/src/ocr_sprint/api/routes/health.py
+++ b/src/ocr_sprint/api/routes/health.py
@@ -6,6 +6,7 @@ from fastapi import APIRouter
 from fastapi.responses import JSONResponse

 from ocr_sprint import __version__
+from ocr_sprint.config import get_settings
 from ocr_sprint.pipeline import ocr as _ocr
 from ocr_sprint.pipeline import table as _table

@@ -21,15 +22,18 @@ async def health() -> dict[str, str]:
 @router.get("/health/ready")
 async def readiness() -> JSONResponse:
    """Readiness check — returns 200 when OCR models are loaded, 503 if still warming up."""
+    settings = get_settings()
    ocr_ready = _ocr._instance is not None
-    table_ready = _table._instance is not None
+    table_ready = (not settings.tables_enabled) or _table._instance is not None
    ready = ocr_ready and table_ready
    payload = {
        "status": "ready" if ready else "warming_up",
        "version": __version__,
        "models": {
            "paddleocr": "ready" if ocr_ready else "loading",
-            "pp_structure": "ready" if table_ready else "loading",
+            "pp_structure": (
+                "disabled" if not settings.tables_enabled else "ready" if table_ready else "loading"
+            ),
        },
    }
    return JSONResponse(content=payload, status_code=200 if ready else 503)