61 lines
2.1 KiB
Python
61 lines
2.1 KiB
Python
"""Top-level extraction result schemas."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
from enum import Enum
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
|
|
|
|
|
class ReviewFlag(str, Enum):
    """Reasons a document was routed to human review.

    The ``str`` mixin makes every member a real string, so a member compares
    equal to its raw value (``ReviewFlag.MISSING_FIELD == "missing_field"``)
    and can be looked up from that value (``ReviewFlag("missing_field")``).
    """

    LOW_OCR_CONFIDENCE = "low_ocr_confidence"
    MISSING_FIELD = "missing_field"
    INVALID_NRP = "invalid_nrp"
    UNKNOWN_PANGKAT = "unknown_pangkat"
    PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch"
    DATE_PARSE_FAILED = "date_parse_failed"
    LLM_FALLBACK = "llm_fallback"
    LLM_UNAVAILABLE = "llm_unavailable"
    PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
    PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp"
    INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"
|
|
|
|
|
|
class Signatory(BaseModel):
    """The official signing the sprint (Penandatangan).

    Every field is optional and defaults to ``None``, so an empty
    ``Signatory()`` is a valid instance.
    """

    # Field names are Indonesian; the glosses below are translations only.
    # NOTE(review): confirm the exact expected formats against the extractor.
    nama: str | None = None  # "name"
    pangkat: str | None = None  # "rank"
    nrp: str | None = None  # presumably the personnel registration number
    jabatan: str | None = None  # "position" / "office held"
|
|
|
|
|
|
class HeaderFields(BaseModel):
    """Header fields parsed from the top portion of a sprint.

    All scalar fields default to ``None``; ``dasar`` defaults to a fresh empty
    list per instance (via ``default_factory``).
    """

    # Defaults are passed by keyword (``default=``) so every ``Field`` call in
    # this module reads the same way.
    nomor_sprint: str | None = Field(
        default=None,
        description="e.g. Sprin/123/IV/2025/Reskrim.",
    )
    tanggal: date | None = Field(
        default=None,
        description="Date the sprint was issued.",
    )
    satuan_penerbit: str | None = Field(
        default=None,
        description="Issuing unit, e.g. 'Polres Bandung'.",
    )
    perihal: str | None = None  # Indonesian: "subject" / "regarding"
    dasar: list[str] = Field(
        default_factory=list,
        description="List of legal/operational basis.",
    )
|
|
|
|
|
|
class ExtractionResult(BaseModel):
    """Full structured payload extracted from a single sprint document.

    Every field has a default, so ``ExtractionResult()`` is constructible with
    no arguments: nested models and lists are built per instance through
    ``default_factory``, ``raw_text`` is empty and ``confidence`` is ``0.0``.
    """

    header: HeaderFields = Field(default_factory=HeaderFields)
    personel: list[PersonnelEntry] = Field(default_factory=list)
    untuk: list[str] = Field(
        default_factory=list,
        description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.",
    )
    # Signatory block; "ttd" is presumably short for "tanda tangan" (signature).
    ttd: Signatory = Field(default_factory=Signatory)
    raw_text: str = Field(default="", description="Concatenated OCR text for debugging.")
    # Validation constrains the score to the closed interval [0.0, 1.0].
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    review_flags: list[ReviewFlag] = Field(default_factory=list)
|