Phase 6: HITL review endpoints + audit trail

- New job_corrections table (append-only audit log) + migration - Add approved / reviewed_by / reviewed_at columns to jobs - PATCH /documents/{id} apply field-level corrections - GET /documents/{id}/history return chronological audit trail - POST /documents/{id}/approve lock final version (idempotent) - Dotted field-path applier with root allow-list + list-index support - Auto-clear `missing_field` review flag when required header keys filled - Atomic batch apply: malformed path in batch rolls back all changes - 22 new tests (11 repository-level, 11 API-level); 184 total passing Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 20:12:04 +00:00
parent 45fbfdabb7
commit 66247e39a5
9 changed files with 1058 additions and 22 deletions
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -22,7 +22,17 @@ from __future__ import annotations
 from typing import Annotated
 from uuid import UUID, uuid4

-from fastapi import APIRouter, Depends, File, HTTPException, Query, Response, UploadFile, status
+from fastapi import (
+    APIRouter,
+    Depends,
+    File,
+    Header,
+    HTTPException,
+    Query,
+    Response,
+    UploadFile,
+    status,
+)
 from sqlalchemy.orm import Session

 from ocr_sprint.api.deps.auth import require_api_key
@@ -31,11 +41,22 @@ from ocr_sprint.api.errors import UnsupportedDocumentError
 from ocr_sprint.api.metrics import JOB_PROCESSING_SECONDS
 from ocr_sprint.config import get_settings
 from ocr_sprint.db.base import session_scope
-from ocr_sprint.db.repositories import JobNotFoundError, JobRepository
+from ocr_sprint.db.repositories import (
+    InvalidFieldPathError,
+    JobAlreadyApprovedError,
+    JobNotCompletedError,
+    JobNotFoundError,
+    JobRepository,
+)
 from ocr_sprint.pipeline.ingest import detect_source_kind
 from ocr_sprint.pipeline.orchestrator import run_pipeline
 from ocr_sprint.schemas.document import DocumentResponse, DocumentStatus
 from ocr_sprint.schemas.extraction import ExtractionResult
+from ocr_sprint.schemas.review import (
+    ApprovalResponse,
+    CorrectionEventResponse,
+    CorrectionRequest,
+)
 from ocr_sprint.storage.blob import get_blob_storage
 from ocr_sprint.utils.logging import get_logger

@@ -75,6 +96,9 @@ def _row_to_response(row: object) -> DocumentResponse:
        data=result_obj,
        review_flags=list(row.review_flags or []),
        error=row.error,
+        approved=bool(row.approved),
+        reviewed_by=row.reviewed_by,
+        reviewed_at=row.reviewed_at,
    )


@@ -192,3 +216,116 @@ async def get_document(
    except JobNotFoundError as exc:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
    return _row_to_response(row)
+
+
+# ---------- Phase 6 — HITL ----------
+
+
+def _correction_row_to_response(row: object) -> CorrectionEventResponse:
+    # Local import to avoid a cyclic import at module load time.
+    from ocr_sprint.db.models import JobCorrectionRow
+
+    assert isinstance(row, JobCorrectionRow)
+    return CorrectionEventResponse(
+        id=row.id,
+        job_id=row.job_id,
+        field_path=row.field_path,
+        old_value=row.old_value,
+        new_value=row.new_value,
+        corrected_by=row.corrected_by,
+        reason=row.reason,
+        corrected_at=row.corrected_at,
+    )
+
+
+@router.patch(
+    "/{job_id}",
+    response_model=DocumentResponse,
+)
+async def patch_document(
+    job_id: UUID,
+    body: CorrectionRequest,
+    session: Annotated[Session, Depends(get_session)],
+    x_user_id: Annotated[
+        str | None,
+        Header(description="Free-form reviewer identifier recorded on the audit row."),
+    ] = None,
+) -> DocumentResponse:
+    """Apply one or more field-level corrections and record an audit trail.
+
+    The whole batch is applied atomically — if any path is invalid the
+    request fails with 400 and no side effects are written. Returns the
+    updated document so the client doesn't need a follow-up GET.
+    """
+    repo = JobRepository(session)
+    try:
+        repo.apply_corrections(
+            job_id,
+            corrections=[(c.path, c.value, c.reason) for c in body.corrections],
+            corrected_by=x_user_id,
+        )
+        row = repo.get_or_raise(job_id)
+    except JobNotFoundError as exc:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
+    except InvalidFieldPathError as exc:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc
+    except JobAlreadyApprovedError as exc:
+        raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc)) from exc
+    except JobNotCompletedError as exc:
+        raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc)) from exc
+
+    _logger.info(
+        "documents.patched",
+        job_id=str(job_id),
+        count=len(body.corrections),
+        corrected_by=x_user_id or "",
+    )
+    return _row_to_response(row)
+
+
+@router.get(
+    "/{job_id}/history",
+    response_model=list[CorrectionEventResponse],
+)
+async def get_history(
+    job_id: UUID,
+    session: Annotated[Session, Depends(get_session)],
+) -> list[CorrectionEventResponse]:
+    repo = JobRepository(session)
+    try:
+        rows = repo.list_corrections(job_id)
+    except JobNotFoundError as exc:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
+    return [_correction_row_to_response(r) for r in rows]
+
+
+@router.post(
+    "/{job_id}/approve",
+    response_model=ApprovalResponse,
+)
+async def approve_document(
+    job_id: UUID,
+    session: Annotated[Session, Depends(get_session)],
+    x_user_id: Annotated[
+        str | None,
+        Header(description="Free-form reviewer identifier recorded on the job."),
+    ] = None,
+) -> ApprovalResponse:
+    """Lock a job's final version. Idempotent: re-approving returns the
+    existing row without overwriting ``reviewed_at``.
+    """
+    repo = JobRepository(session)
+    try:
+        row = repo.approve(job_id, reviewed_by=x_user_id)
+    except JobNotFoundError as exc:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(exc)) from exc
+    except JobNotCompletedError as exc:
+        raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc)) from exc
+
+    _logger.info("documents.approved", job_id=str(job_id), reviewed_by=row.reviewed_by or "")
+    return ApprovalResponse(
+        job_id=row.job_id,
+        approved=bool(row.approved),
+        reviewed_by=row.reviewed_by,
+        reviewed_at=row.reviewed_at,
+    )
--- a/src/ocr_sprint/db/models.py
+++ b/src/ocr_sprint/db/models.py
@@ -16,7 +16,7 @@ from datetime import datetime, timezone
 from typing import Any
 from uuid import UUID, uuid4

-from sqlalchemy import JSON, DateTime, Float, String, Uuid
+from sqlalchemy import JSON, Boolean, DateTime, Float, ForeignKey, Integer, String, Uuid
 from sqlalchemy.orm import Mapped, mapped_column

 from ocr_sprint.db.base import Base
@@ -42,6 +42,15 @@ class JobRow(Base):
    result: Mapped[dict[str, Any] | None] = mapped_column(JSON, nullable=True)
    error: Mapped[str | None] = mapped_column(String(2048), nullable=True)

+    # Phase 6 — HITL review state.
+    # Once ``approved=True`` the row is immutable except to admin users;
+    # corrections after that point are rejected by the route. ``reviewed_by``
+    # stores the free-form user identifier the reviewer sent via the
+    # ``X-User-Id`` header (best-effort attribution — no full RBAC yet).
+    approved: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
+    reviewed_by: Mapped[str | None] = mapped_column(String(128), nullable=True)
+    reviewed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
+
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=_utcnow
    )
@@ -51,3 +60,37 @@ class JobRow(Base):

    def __repr__(self) -> str:
        return f"JobRow(job_id={self.job_id!s}, status={self.status!r})"
+
+
+class JobCorrectionRow(Base):
+    """One correction event on a job's ``result``.
+
+    Each PATCH call writes one row per changed field path so we have a
+    full audit trail. Rows are append-only — never updated, never
+    deleted — so the history is reproducible and usable as ground-truth
+    data for future fine-tuning.
+    """
+
+    __tablename__ = "job_corrections"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+    job_id: Mapped[UUID] = mapped_column(
+        Uuid, ForeignKey("jobs.job_id", ondelete="CASCADE"), nullable=False, index=True
+    )
+    # Dotted JSON path into ExtractionResult, e.g. "header.nomor_sprint" or
+    # "personel[3].nrp". Kept as a plain string for simplicity — we don't
+    # parse it server-side beyond the allow-list check in the repository.
+    field_path: Mapped[str] = mapped_column(String(256), nullable=False)
+    old_value: Mapped[Any | None] = mapped_column(JSON, nullable=True)
+    new_value: Mapped[Any | None] = mapped_column(JSON, nullable=True)
+    corrected_by: Mapped[str | None] = mapped_column(String(128), nullable=True)
+    reason: Mapped[str | None] = mapped_column(String(512), nullable=True)
+    corrected_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True), nullable=False, default=_utcnow
+    )
+
+    def __repr__(self) -> str:
+        return (
+            f"JobCorrectionRow(job_id={self.job_id!s}, "
+            f"field={self.field_path!r}, by={self.corrected_by!r})"
+        )
--- a/src/ocr_sprint/db/repositories.py
+++ b/src/ocr_sprint/db/repositories.py
@@ -6,6 +6,9 @@ know about sessions, transactions, or the row → schema mapping.

 from __future__ import annotations

+import copy
+import re
+from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import Any
 from uuid import UUID
@@ -13,7 +16,7 @@ from uuid import UUID
 from sqlalchemy import select
 from sqlalchemy.orm import Session

-from ocr_sprint.db.models import JobRow
+from ocr_sprint.db.models import JobCorrectionRow, JobRow
 from ocr_sprint.schemas.document import DocumentStatus, SourceKind


@@ -25,6 +28,110 @@ class JobNotFoundError(LookupError):
    """Raised by API code when GET /documents/{id} hits a missing row."""


+class InvalidFieldPathError(ValueError):
+    """Raised when a PATCH request references an unsupported field path."""
+
+
+class JobAlreadyApprovedError(RuntimeError):
+    """Raised when a PATCH is attempted against an already-approved job."""
+
+
+class JobNotCompletedError(RuntimeError):
+    """Raised when a PATCH/approve is attempted against a job that hasn't
+    produced a ``result`` payload yet (e.g. still pending or failed).
+    """
+
+
+@dataclass(frozen=True)
+class AppliedCorrection:
+    """Internal record of a correction that successfully applied to the
+    in-memory ``result`` dict. The repository turns this into a persisted
+    ``JobCorrectionRow`` after the whole batch is validated.
+    """
+
+    field_path: str
+    old_value: Any
+    new_value: Any
+    reason: str | None
+
+
+# Allow-list of top-level keys we let reviewers edit. Keeps the attack
+# surface small: they can't inject arbitrary fields into the JSON blob.
+_ALLOWED_ROOTS: frozenset[str] = frozenset({"header", "ttd", "personel", "untuk"})
+
+# Matches a single path segment like ``personel[3]`` — supports at most one
+# index per segment, enough for the list fields we care about.
+_SEGMENT_RE = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_]*)(?:\[(\d+)\])?$")
+
+
+def _split_path(path: str) -> list[tuple[str, int | None]]:
+    """Parse ``header.nomor_sprint`` or ``personel[2].nrp`` into segments.
+
+    Returns list of ``(name, index_or_none)`` tuples. Raises
+    ``InvalidFieldPathError`` on malformed input so the caller can surface
+    a 400 to the client.
+    """
+    if not path or path.startswith(".") or path.endswith("."):
+        raise InvalidFieldPathError(f"Invalid field path: {path!r}")
+
+    parts = path.split(".")
+    out: list[tuple[str, int | None]] = []
+    for part in parts:
+        match = _SEGMENT_RE.match(part)
+        if match is None:
+            raise InvalidFieldPathError(f"Invalid segment in path: {part!r}")
+        name = match.group(1)
+        idx_raw = match.group(2)
+        idx = int(idx_raw) if idx_raw is not None else None
+        out.append((name, idx))
+
+    if out[0][0] not in _ALLOWED_ROOTS:
+        raise InvalidFieldPathError(
+            f"Field path root {out[0][0]!r} not in allowed roots {sorted(_ALLOWED_ROOTS)!r}"
+        )
+    return out
+
+
+def _apply_path(data: dict[str, Any], path: str, new_value: Any) -> Any:
+    """Apply a single correction to ``data`` in place. Returns the old
+    value so the caller can record it in the audit row.
+
+    Does NOT validate that the new value matches the field's expected
+    type — that's the reviewer's responsibility; the whole point of HITL
+    is to let humans override the model's typing.
+    """
+    segments = _split_path(path)
+    cursor: Any = data
+    for name, idx in segments[:-1]:
+        if not isinstance(cursor, dict) or name not in cursor:
+            raise InvalidFieldPathError(f"Cannot traverse to {path!r}: missing {name!r}")
+        cursor = cursor[name]
+        if idx is not None:
+            if not isinstance(cursor, list) or idx >= len(cursor):
+                raise InvalidFieldPathError(
+                    f"Cannot traverse to {path!r}: index [{idx}] out of range"
+                )
+            cursor = cursor[idx]
+
+    name, idx = segments[-1]
+    if idx is not None:
+        # Terminal segment is a list-element, e.g. ``untuk[2]``.
+        if not isinstance(cursor, dict) or name not in cursor:
+            raise InvalidFieldPathError(f"Cannot apply to {path!r}: missing container {name!r}")
+        container = cursor[name]
+        if not isinstance(container, list) or idx >= len(container):
+            raise InvalidFieldPathError(f"Cannot apply to {path!r}: index [{idx}] out of range")
+        old = container[idx]
+        container[idx] = new_value
+        return old
+
+    if not isinstance(cursor, dict):
+        raise InvalidFieldPathError(f"Cannot apply to {path!r}: parent is not an object")
+    old = cursor.get(name)
+    cursor[name] = new_value
+    return old
+
+
 class JobRepository:
    """SQL-backed repository for `jobs` rows."""

@@ -94,3 +201,139 @@ class JobRepository:
        if row is None:
            raise JobNotFoundError(f"Job not found: {job_id}")
        return row
+
+    # ---------- Phase 6 — HITL ----------
+
+    def apply_corrections(
+        self,
+        job_id: UUID,
+        *,
+        corrections: list[tuple[str, Any, str | None]],
+        corrected_by: str | None,
+    ) -> list[JobCorrectionRow]:
+        """Apply a batch of field corrections atomically.
+
+        ``corrections`` is a list of ``(path, new_value, reason)`` tuples.
+        Returns the persisted audit rows so the caller can surface them in
+        the response.
+
+        Raises
+        ------
+        JobNotFoundError
+            If the row doesn't exist.
+        JobNotCompletedError
+            If the job hasn't produced a result yet (status pending /
+            processing / failed).
+        JobAlreadyApprovedError
+            If the job has been approved — edits are locked.
+        InvalidFieldPathError
+            If any path is malformed or references a disallowed root.
+        """
+        row = self._get_or_raise(job_id)
+        if row.result is None:
+            raise JobNotCompletedError(
+                f"Job {job_id} has no result to correct (status={row.status})"
+            )
+        if row.approved:
+            raise JobAlreadyApprovedError(f"Job {job_id} is already approved; edits are locked")
+
+        # Deep-copy so we can roll back in memory if any correction fails.
+        # The underlying JSON column will only be re-assigned once every
+        # path applied cleanly.
+        working = copy.deepcopy(row.result)
+        applied: list[AppliedCorrection] = []
+        for path, new_value, reason in corrections:
+            old_value = _apply_path(working, path, new_value)
+            applied.append(
+                AppliedCorrection(
+                    field_path=path, old_value=old_value, new_value=new_value, reason=reason
+                )
+            )
+
+        # Persist audit rows first; if they fail the session rollback also
+        # undoes the result-column update we're about to do.
+        persisted: list[JobCorrectionRow] = []
+        for event in applied:
+            row_event = JobCorrectionRow(
+                job_id=job_id,
+                field_path=event.field_path,
+                old_value=event.old_value,
+                new_value=event.new_value,
+                corrected_by=corrected_by,
+                reason=event.reason,
+            )
+            self.session.add(row_event)
+            persisted.append(row_event)
+
+        # Clear review flags that the correction has resolved. Right now we
+        # only auto-clear MISSING_FIELD when any corrected field previously
+        # held a null/empty value — the reviewer explicitly filled a gap.
+        row.result = working
+        row.review_flags = _recompute_flags(
+            original_flags=list(row.review_flags or []),
+            applied=applied,
+            working_result=working,
+        )
+        row.updated_at = _utcnow()
+
+        self.session.flush()
+        return persisted
+
+    def list_corrections(self, job_id: UUID) -> list[JobCorrectionRow]:
+        """Return the full audit trail for ``job_id`` in chronological order."""
+        # ``get_or_raise`` so callers get a 404 instead of an empty list
+        # when the job itself doesn't exist.
+        self._get_or_raise(job_id)
+        stmt = (
+            select(JobCorrectionRow)
+            .where(JobCorrectionRow.job_id == job_id)
+            .order_by(JobCorrectionRow.corrected_at, JobCorrectionRow.id)
+        )
+        return list(self.session.scalars(stmt))
+
+    def approve(self, job_id: UUID, *, reviewed_by: str | None) -> JobRow:
+        """Mark a job as approved. Idempotent — re-approving is a no-op
+        that keeps the original ``reviewed_at`` (so the audit trail stays
+        intact).
+        """
+        row = self._get_or_raise(job_id)
+        if row.result is None:
+            raise JobNotCompletedError(
+                f"Job {job_id} has no result to approve (status={row.status})"
+            )
+        if row.approved:
+            return row
+        row.approved = True
+        row.reviewed_by = reviewed_by
+        row.reviewed_at = _utcnow()
+        row.updated_at = row.reviewed_at
+        return row
+
+
+def _recompute_flags(
+    *,
+    original_flags: list[str],
+    applied: list[AppliedCorrection],
+    working_result: dict[str, Any],
+) -> list[str]:
+    """Update review flags in light of the corrections just applied.
+
+    Keeps the policy simple on purpose:
+    * ``missing_field`` is removed if after the edit every required
+      header field is non-empty.
+    * Other flags stay untouched — the reviewer should either correct the
+      underlying issue (which this helper can detect) or explicitly
+      approve the result as-is (which bypasses the flag list).
+    """
+    flags = list(original_flags)
+    if "missing_field" in flags:
+        header = working_result.get("header") or {}
+        filled = all(bool(header.get(key)) for key in ("nomor_sprint", "satuan_penerbit"))
+        if filled:
+            flags = [f for f in flags if f != "missing_field"]
+
+    # ``applied`` isn't used directly in this MVP rule, but we keep the
+    # parameter so future policies can inspect exactly what changed
+    # without re-diffing the blob.
+    _ = applied
+    return flags
--- a/src/ocr_sprint/schemas/document.py
+++ b/src/ocr_sprint/schemas/document.py
@@ -55,3 +55,7 @@ class DocumentResponse(BaseModel):
    data: ExtractionResult | None = None
    review_flags: list[str] = Field(default_factory=list)
    error: str | None = None
+    # Phase 6 — HITL review state.
+    approved: bool = False
+    reviewed_by: str | None = None
+    reviewed_at: datetime | None = None
--- a/src/ocr_sprint/schemas/review.py
+++ b/src/ocr_sprint/schemas/review.py
@@ -0,0 +1,62 @@
+"""Request / response schemas for the HITL review endpoints (Phase 6).
+
+The API surface is deliberately small:
+
+* ``CorrectionRequest`` — body of ``PATCH /documents/{id}``. A list of
+  ``FieldCorrection`` entries; each one is applied atomically (all-or-
+  nothing) and recorded in the audit trail.
+* ``CorrectionEventResponse`` — single row in ``GET /documents/{id}/history``.
+* ``ApprovalResponse`` — echo back after ``POST /documents/{id}/approve``.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+
+class FieldCorrection(BaseModel):
+    """One field-level correction.
+
+    ``path`` is a dotted JSON path into ``ExtractionResult``. Supported
+    roots: ``header``, ``ttd``, ``personel[n]`` (n is a 0-based index),
+    ``untuk``. The path is validated by the repository before being
+    applied; unknown roots return 400.
+    """
+
+    path: str = Field(..., description="Dotted JSON path, e.g. 'header.nomor_sprint'.")
+    value: Any = Field(..., description="New value (any JSON-serialisable payload).")
+    reason: str | None = Field(
+        None, max_length=512, description="Optional free-form reason for the correction."
+    )
+
+
+class CorrectionRequest(BaseModel):
+    """PATCH body — one or more field corrections, applied atomically."""
+
+    corrections: list[FieldCorrection] = Field(..., min_length=1)
+
+
+class CorrectionEventResponse(BaseModel):
+    """One row of the audit log surfaced by GET /history."""
+
+    id: int
+    job_id: UUID
+    field_path: str
+    old_value: Any | None = None
+    new_value: Any | None = None
+    corrected_by: str | None = None
+    reason: str | None = None
+    corrected_at: datetime
+
+
+class ApprovalResponse(BaseModel):
+    """Echo returned after a job is approved."""
+
+    job_id: UUID
+    approved: bool
+    reviewed_by: str | None = None
+    reviewed_at: datetime | None = None