feat: implement PP-Structure table extraction pipeline with GPU runtime configuration support

This commit is contained in:
Adriankf59
2026-04-27 00:51:23 +07:00
parent 9d969e61fd
commit 6d793758ff
12 changed files with 896 additions and 31 deletions

View File

@@ -28,13 +28,6 @@ from functools import partial
from typing import Annotated
from uuid import UUID, uuid4
# Thread pool dedicated to blocking OCR work. Using a *separate* pool
# (rather than the default loop executor) lets us cap the number of
# concurrent heavy OCR jobs independently of other thread-pool users.
# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
from fastapi import (
APIRouter,
Depends,
@@ -73,6 +66,13 @@ from ocr_sprint.schemas.review import (
from ocr_sprint.storage.blob import get_blob_storage
from ocr_sprint.utils.logging import get_logger
# Dedicated thread pool for blocking OCR work. Using a *separate* pool
# (rather than the default loop executor) lets us cap the number of
# concurrent heavy OCR jobs independently of other thread-pool users.
# max_workers=1 means work submitted to this pool runs one job at a time.
# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
router = APIRouter(
prefix="/documents",
tags=["documents"],
@@ -99,18 +99,17 @@ def _row_to_response(row: object) -> DocumentResponse:
assert isinstance(row, JobRow)
status_enum = DocumentStatus(row.status)
personel_list = None
result_obj = None
if row.result is not None:
result_obj = ExtractionResult.model_validate(row.result)
# Auto-number personnel entries sequentially (1, 2, 3, ...)
for idx, entry in enumerate(result_obj.personel, start=1):
entry.no = idx
personel_list = result_obj.personel
return DocumentResponse(
job_id=row.job_id,
status=status_enum,
confidence=row.confidence,
data=personel_list,
data=result_obj,
review_flags=list(row.review_flags or []),
error=row.error,
approved=bool(row.approved),

View File

@@ -6,6 +6,7 @@ from fastapi import APIRouter
from fastapi.responses import JSONResponse
from ocr_sprint import __version__
from ocr_sprint.config import get_settings
from ocr_sprint.pipeline import ocr as _ocr
from ocr_sprint.pipeline import table as _table
@@ -21,15 +22,18 @@ async def health() -> dict[str, str]:
@router.get("/health/ready")
async def readiness() -> JSONResponse:
    """Report whether the models this deployment needs are loaded.

    PaddleOCR is always required. The PP-Structure table model is required
    only when ``settings.tables_enabled`` is true; when tables are disabled
    it is reported as ``"disabled"`` and does not hold back readiness.

    Returns:
        A JSON response with status 200 once every required model is loaded,
        or 503 while a required model is still loading.
    """
    settings = get_settings()
    tables_enabled = settings.tables_enabled

    # A model counts as loaded once its module-level ``_instance`` is set.
    ocr_ready = _ocr._instance is not None
    table_loaded = _table._instance is not None

    # A disabled table pipeline must never block readiness.
    table_ready = (not tables_enabled) or table_loaded
    ready = ocr_ready and table_ready

    if not tables_enabled:
        table_state = "disabled"
    elif table_ready:
        table_state = "ready"
    else:
        table_state = "loading"

    payload = {
        "status": "ready" if ready else "warming_up",
        "version": __version__,
        "models": {
            "paddleocr": "ready" if ocr_ready else "loading",
            "pp_structure": table_state,
        },
    }
    return JSONResponse(content=payload, status_code=200 if ready else 503)