"""Top-level extraction result schemas.""" from __future__ import annotations from datetime import date from enum import Enum from pydantic import BaseModel, Field from ocr_sprint.schemas.personnel import PersonnelEntry class ReviewFlag(str, Enum): """Reasons a document was routed to human review.""" LOW_OCR_CONFIDENCE = "low_ocr_confidence" MISSING_FIELD = "missing_field" INVALID_NRP = "invalid_nrp" UNKNOWN_PANGKAT = "unknown_pangkat" PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch" DATE_PARSE_FAILED = "date_parse_failed" LLM_FALLBACK = "llm_fallback" LLM_UNAVAILABLE = "llm_unavailable" PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback" PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp" INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row" class Signatory(BaseModel): """The official signing the sprint (Penandatangan).""" nama: str | None = None pangkat: str | None = None nrp: str | None = None jabatan: str | None = None class HeaderFields(BaseModel): """Header fields parsed from the top portion of a sprint.""" nomor_sprint: str | None = Field(None, description="e.g. Sprin/123/IV/2025/Reskrim.") tanggal: date | None = Field(None, description="Date the sprint was issued.") satuan_penerbit: str | None = Field(None, description="Issuing unit, e.g. 'Polres Bandung'.") perihal: str | None = None dasar: list[str] = Field(default_factory=list, description="List of legal/operational basis.") class ExtractionResult(BaseModel): """Full structured payload extracted from a single sprint document.""" header: HeaderFields = Field(default_factory=HeaderFields) personel: list[PersonnelEntry] = Field(default_factory=list) untuk: list[str] = Field( default_factory=list, description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.", ) ttd: Signatory = Field(default_factory=Signatory) raw_text: str = Field(default="", description="Concatenated OCR text for debugging.") confidence: float = Field(0.0, ge=0.0, le=1.0) review_flags: list[ReviewFlag] = Field(default_factory=list)