Phase 6: HITL review endpoints + audit trail

- New job_corrections table (append-only audit log) + migration
- Add approved / reviewed_by / reviewed_at columns to jobs
- PATCH /documents/{id}: apply field-level corrections
- GET /documents/{id}/history: return chronological audit trail
- POST /documents/{id}/approve: lock final version (idempotent)
- Dotted field-path applier with root allow-list + list-index support
- Auto-clear `missing_field` review flag when required header keys are filled
- Atomic batch apply: a malformed path in the batch rolls back all changes
- 22 new tests (11 repository-level, 11 API-level); 184 total passing

Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 20:12:04 +00:00
parent 45fbfdabb7
commit 66247e39a5
9 changed files with 1058 additions and 22 deletions
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -22,7 +22,17 @@ from __future__ import annotations
 from typing import Annotated
 from uuid import UUID, uuid4

-from fastapi import APIRouter, Depends, File, HTTPException, Query, Response, UploadFile, status
+from fastapi import (
+    APIRouter,
+    Depends,
+    File,
+    Header,
+    HTTPException,
+    Query,
+    Response,
+    UploadFile,
+    status,
+)
 from sqlalchemy.orm import Session

 from ocr_sprint.api.deps.auth import require_api_key
@@ -31,11 +41,22 @@ from ocr_sprint.api.errors import UnsupportedDocumentError
 from ocr_sprint.api.metrics import JOB_PROCESSING_SECONDS
 from ocr_sprint.config import get_settings
 from ocr_sprint.db.base import session_scope
-from ocr_sprint.db.repositories import JobNotFoundError, JobRepository
+from ocr_sprint.db.repositories import (
+    InvalidFieldPathError,
+    JobAlreadyApprovedError,
+    JobNotCompletedError,
+    JobNotFoundError,
+    JobRepository,
+)
 from ocr_sprint.pipeline.ingest import detect_source_kind
 from ocr_sprint.pipeline.orchestrator import run_pipeline
 from ocr_sprint.schemas.document import DocumentResponse, DocumentStatus
 from ocr_sprint.schemas.extraction import ExtractionResult
+from ocr_sprint.schemas.review import (
+    ApprovalResponse,
+    CorrectionEventResponse,
+    CorrectionRequest,
+)
 from ocr_sprint.storage.blob import get_blob_storage
 from ocr_sprint.utils.logging import get_logger

@@ -75,6 +96,9 @@ def _row_to_response(row: object) -> DocumentResponse:
        data=result_obj,
        review_flags=list(row.review_flags or []),
        error=row.error,
+        approved=bool(row.approved),
+        reviewed_by=row.reviewed_by,
+        reviewed_at=row.reviewed_at,
    )


@@ -192,3 +216,116 @@ async def get_document(
    except JobNotFoundError as exc:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
    return _row_to_response(row)
+
+
+# ---------- Phase 6 — HITL ----------
+
+
+def _correction_row_to_response(row: object) -> CorrectionEventResponse:
+    """Map one ``JobCorrectionRow`` onto the ``CorrectionEventResponse`` schema.
+
+    Each response field is copied from the row attribute of the same name.
+    """
+    # Deferred import: pulling in db.models at module load time would be cyclic.
+    from ocr_sprint.db.models import JobCorrectionRow
+
+    assert isinstance(row, JobCorrectionRow)
+    # Field names shared by the ORM row and the response model.
+    copied = (
+        "id", "job_id", "field_path", "old_value",
+        "new_value", "corrected_by", "reason", "corrected_at",
+    )
+    return CorrectionEventResponse(**{name: getattr(row, name) for name in copied})
+
+
+@router.patch("/{job_id}", response_model=DocumentResponse)
+async def patch_document(
+    job_id: UUID,
+    body: CorrectionRequest,
+    session: Annotated[Session, Depends(get_session)],
+    x_user_id: Annotated[
+        str | None,
+        Header(description="Free-form reviewer identifier recorded on the audit row."),
+    ] = None,
+) -> DocumentResponse:
+    """Apply a batch of field-level corrections to a job and return the result.
+
+    Each correction's ``(path, value, reason)`` is passed to
+    ``JobRepository.apply_corrections`` together with the ``x_user_id`` header
+    value as ``corrected_by``. The batch is all-or-nothing: one invalid path
+    fails the request and nothing is written. The refreshed document is
+    returned so the caller can skip a follow-up GET.
+
+    Raises:
+        HTTPException: 404 for ``JobNotFoundError``, 400 for
+            ``InvalidFieldPathError``, 409 for ``JobAlreadyApprovedError`` or
+            ``JobNotCompletedError``.
+    """
+    jobs = JobRepository(session)
+    try:
+        edits = [(item.path, item.value, item.reason) for item in body.corrections]
+        jobs.apply_corrections(job_id, corrections=edits, corrected_by=x_user_id)
+        updated = jobs.get_or_raise(job_id)
+    except JobNotFoundError as exc:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
+    except InvalidFieldPathError as exc:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc
+    except (JobAlreadyApprovedError, JobNotCompletedError) as exc:
+        # Both conditions are reported to the client as 409 Conflict.
+        raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc)) from exc
+
+    # The header is optional; log an empty string rather than None.
+    _logger.info(
+        "documents.patched",
+        job_id=str(job_id),
+        count=len(body.corrections),
+        corrected_by=x_user_id or "",
+    )
+    return _row_to_response(updated)
+
+
+@router.get("/{job_id}/history", response_model=list[CorrectionEventResponse])
+async def get_history(
+    job_id: UUID,
+    session: Annotated[Session, Depends(get_session)],
+) -> list[CorrectionEventResponse]:
+    """Return the correction events recorded for a job."""
+    jobs = JobRepository(session)
+    try:
+        events = jobs.list_corrections(job_id)
+    except JobNotFoundError as exc:
+        # Surface the repository's lookup failure as a 404.
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
+    # Convert each audit row into its response schema, preserving order.
+    return list(map(_correction_row_to_response, events))
+
+
+@router.post("/{job_id}/approve", response_model=ApprovalResponse)
+async def approve_document(
+    job_id: UUID,
+    session: Annotated[Session, Depends(get_session)],
+    x_user_id: Annotated[
+        str | None,
+        Header(description="Free-form reviewer identifier recorded on the job."),
+    ] = None,
+) -> ApprovalResponse:
+    """Mark a job as approved, locking its final version.
+
+    The operation is idempotent: approving again returns the existing row
+    and leaves ``reviewed_at`` untouched. 404 / 409 mirror the repo errors.
+    """
+    jobs = JobRepository(session)
+    try:
+        approved_row = jobs.approve(job_id, reviewed_by=x_user_id)
+    except JobNotFoundError as exc:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
+    except JobNotCompletedError as exc:
+        raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc)) from exc
+
+    reviewer = approved_row.reviewed_by
+    _logger.info("documents.approved", job_id=str(job_id), reviewed_by=reviewer or "")
+    return ApprovalResponse(
+        job_id=approved_row.job_id,
+        approved=bool(approved_row.approved),
+        reviewed_by=reviewer,
+        reviewed_at=approved_row.reviewed_at,
+    )