From d0e1835cc12610f05693ce79eec5f1ff0a595e28 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:06:58 +0000 Subject: [PATCH] Phase 2: document detection + perspective correction + shadow removal Adds OpenCV-based phone-photo handling that runs before the standard preprocessing pipeline for IMAGE source kinds (PDF renders are flat by construction and skip this stage). Pipeline additions in src/ocr_sprint/pipeline/document_detect.py: - _find_document_quad: Canny + dilate + contour search, picks the largest convex 4-point polygon above a configurable area threshold; fails gracefully and returns None when no usable quad is found. - _four_point_warp: orders corners (TL/TR/BR/BL via sum/diff trick) and runs cv2.getPerspectiveTransform + warpPerspective. - _remove_shadow: per-channel background-division (dilate + median blur + 255 - absdiff + normalize) for uneven phone-shot lighting. - detect_and_correct: top-level entrypoint with graceful fallback to the original image when detection fails. Wired into the synchronous orchestrator: only enabled for IMAGE sources, skipped for PDF. New settings: - preprocess_detect_document (default: true) - preprocess_remove_shadow (default: true) - preprocess_min_quad_area_fraction (default: 0.20) Tests: 9 new unit tests covering corner ordering, quad detection on synthetic skewed documents, perspective warp output sanity, shadow removal shape preservation, full-pipeline behavior, and graceful fallback when detection fails. 70 tests total, all green. ML-based dewarping (DewarpNet) and DocTR detector are deferred to a future phase per the roadmap; the existing API is structured so they can be added as alternative backends behind DocumentDetectConfig. 
Co-authored-by: adrian kuman firmansah --- .env.example | 5 + README.md | 8 +- src/ocr_sprint/config.py | 5 + src/ocr_sprint/pipeline/document_detect.py | 205 +++++++++++++++++++++ src/ocr_sprint/pipeline/orchestrator.py | 11 +- tests/unit/test_document_detect.py | 128 +++++++++++++ 6 files changed, 357 insertions(+), 5 deletions(-) create mode 100644 src/ocr_sprint/pipeline/document_detect.py create mode 100644 tests/unit/test_document_detect.py diff --git a/.env.example b/.env.example index edd8ef2..530eff9 100644 --- a/.env.example +++ b/.env.example @@ -21,6 +21,11 @@ PREPROCESS_DENOISE=true PREPROCESS_DESKEW=true PREPROCESS_ADAPTIVE_THRESHOLD=false # turn on for low-quality phone photos +# ==== Document detection (Phase 2, IMAGE sources only) ==== +PREPROCESS_DETECT_DOCUMENT=true +PREPROCESS_REMOVE_SHADOW=true +PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20 + # ==== Confidence / routing (Phase 5) ==== CONFIDENCE_AUTO_APPROVE=0.95 CONFIDENCE_NEEDS_REVIEW=0.85 diff --git a/README.md b/README.md index 6e5558f..b8d2de0 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ OCR + structured extraction service for Indonesian police "surat sprint" (surat perintah) documents. Built around **FastAPI + PaddleOCR + hybrid extraction (regex → LLM lokal → validation)** with **on-premise** deployment as a hard requirement. -> **Status:** Phase 1 MVP — synchronous PDF/image OCR with regex header extraction, validation, and confidence scoring. Phase 2–6 (document detection, table extraction, async pipeline, LLM extraction, HITL) are tracked in [`docs/architecture.md`](docs/architecture.md). +> **Status:** Phase 1+2 — synchronous PDF/image OCR with regex header extraction, validation, confidence scoring, and **document detection + perspective correction + shadow removal** for phone photos. Phase 3–6 (table extraction, async pipeline, LLM extraction, HITL) are tracked in [`docs/architecture.md`](docs/architecture.md). 
## Why this stack @@ -97,7 +97,7 @@ Pre-commit hooks run ruff on every commit. Install once with `pre-commit install src/ocr_sprint/ api/ # FastAPI routes + error handlers schemas/ # Pydantic v2 models (request/response, extraction, personnel) - pipeline/ # ingest → preprocess → ocr → extract → validate → score + pipeline/ # ingest → document_detect → preprocess → ocr → extract → validate → score extract/ # regex_rules.py (Phase 1) → llm.py (Phase 5) data/ # master data (Polri ranks, etc.) utils/ # logging, helpers @@ -111,8 +111,8 @@ docs/ # architecture & decision records | Phase | Scope | Status | |---|---|---| -| 1 | Sync API, PDF/image ingest, basic preprocessing, PaddleOCR, regex header extraction, validation, confidence scoring | **In progress** | -| 2 | DocTR document detection + dewarping for phone photos | Planned | +| 1 | Sync API, PDF/image ingest, basic preprocessing, PaddleOCR, regex header extraction, validation, confidence scoring | **Done** | +| 2 | OpenCV-based document detection, perspective transform, shadow removal for phone photos | **Done** | | 3 | PP-Structure table extraction for personnel rows | Planned | | 4 | Async pipeline (Celery + Redis), Postgres + MinIO, auth, observability | Planned | | 5 | LLM hybrid extraction (Ollama + structured output) | Planned | diff --git a/src/ocr_sprint/config.py b/src/ocr_sprint/config.py index 18a955c..1e2e8a5 100644 --- a/src/ocr_sprint/config.py +++ b/src/ocr_sprint/config.py @@ -42,6 +42,11 @@ class Settings(BaseSettings): preprocess_deskew: bool = True preprocess_adaptive_threshold: bool = False + # Document detection (Phase 2) — applied to IMAGE sources only + preprocess_detect_document: bool = True + preprocess_remove_shadow: bool = True + preprocess_min_quad_area_fraction: float = Field(0.20, ge=0.0, le=1.0) + # Confidence thresholds (Phase 5 routing) confidence_auto_approve: float = Field(0.95, ge=0.0, le=1.0) confidence_needs_review: float = Field(0.85, ge=0.0, le=1.0) diff --git 
a/src/ocr_sprint/pipeline/document_detect.py b/src/ocr_sprint/pipeline/document_detect.py new file mode 100644 index 0000000..4ea7e24 --- /dev/null +++ b/src/ocr_sprint/pipeline/document_detect.py @@ -0,0 +1,205 @@ +"""Phase 2 — document detection + perspective correction + shadow removal. + +Targets phone photos of surat sprint where the page is shot at an angle, with +uneven lighting, and not perfectly centered. The pipeline below uses pure +OpenCV (no ML model dependency) to: + + 1. detect the four corners of the document inside the image, + 2. apply a perspective transform to obtain a flat top-down rectangle, + 3. remove shadows via morphological background division. + +Failure mode is **graceful**: if no usable document quadrilateral is found, +we return the original image untouched and log a warning. The downstream +`preprocess` stage will still run. + +Future work (tracked in docs/architecture.md): + - swap the contour heuristic for a DocTR / MobileSAM model when accuracy + on real Polri photos isn't enough, + - add full ML-based dewarping (DewarpNet) for curved/folded pages. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import cv2 +import numpy as np + +from ocr_sprint.pipeline.ingest import NDArrayU8 +from ocr_sprint.utils.logging import get_logger + +_logger = get_logger(__name__) + +# Internal working size for contour detection. Smaller = faster + less noise. +_DETECT_HEIGHT = 500 + +# Reject any candidate quad whose area is below this fraction of the image — +# they're almost certainly not the document but logos, stamps, or text blocks. +_MIN_AREA_FRACTION = 0.20 + +# Polygon-approximation epsilon (% of perimeter). 2% works well for paper. 
+_POLY_EPSILON = 0.02
+
+
+@dataclass(frozen=True)
+class DocumentDetectConfig:
+    """Tunable knobs for document detection."""
+
+    detect_document: bool = True
+    remove_shadow: bool = True
+    min_area_fraction: float = _MIN_AREA_FRACTION
+
+
+# ---------- public surface ----------
+
+
+def detect_and_correct(
+    img: NDArrayU8,
+    cfg: DocumentDetectConfig | None = None,
+) -> NDArrayU8:
+    """Run detection + perspective correction + shadow removal in one shot.
+
+    Returns the warped page when a quad is found, otherwise the unwarped input;
+    shadow removal, if enabled, runs on either. Input and output are BGR uint8.
+    """
+    if cfg is None:
+        cfg = DocumentDetectConfig()
+
+    out = img
+    if cfg.detect_document:
+        quad = _find_document_quad(img, min_area_fraction=cfg.min_area_fraction)
+        if quad is not None:
+            out = _four_point_warp(img, quad)
+            _logger.info("document_detect.warped", shape=out.shape[:2])
+        else:
+            _logger.warning("document_detect.no_quad_found", shape=img.shape[:2])
+
+    if cfg.remove_shadow:
+        out = _remove_shadow(out)
+
+    return out
+
+
+# ---------- corner detection ----------
+
+
+def _find_document_quad(
+    img: NDArrayU8,
+    min_area_fraction: float = _MIN_AREA_FRACTION,
+) -> NDArrayU8 | None:
+    """Locate the document quadrilateral; return 4x2 float32 corners or None."""
+    h_orig, w_orig = img.shape[:2]
+    if h_orig < 50 or w_orig < 50:
+        return None
+
+    scale = _DETECT_HEIGHT / float(h_orig)
+    if scale >= 1.0:
+        small = img
+        scale = 1.0
+    else:
+        small = cv2.resize(
+            img,
+            (round(w_orig * scale), _DETECT_HEIGHT),
+            interpolation=cv2.INTER_AREA,
+        )
+
+    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
+    gray = cv2.bilateralFilter(gray, 9, 75, 75)
+    edges = cv2.Canny(gray, 60, 180)
+    # close small gaps so contours are continuous
+    edges = cv2.dilate(edges, np.ones((3, 3), np.uint8), iterations=2)
+
+    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    if not contours:
+        return None
+
+    image_area = float(small.shape[0] * small.shape[1])
+    candidates = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
+
+    for contour in candidates:
+        perimeter = cv2.arcLength(contour, True)
+        approx = cv2.approxPolyDP(contour, _POLY_EPSILON * perimeter, True)
+        if len(approx) != 4:
+            continue
+        if not cv2.isContourConvex(approx):
+            continue
+        area = cv2.contourArea(approx)
+        if area < min_area_fraction * image_area:
+            continue  # ranked by raw contour area, so a later quad can still pass
+        quad = approx.reshape(4, 2).astype(np.float32) / scale
+        return quad
+
+    return None
+
+
+# ---------- perspective transform ----------
+
+
+def _order_corners(pts: NDArrayU8) -> NDArrayU8:
+    """Return points sorted as (top-left, top-right, bottom-right, bottom-left)."""
+    corners = np.asarray(pts, dtype=np.float32).reshape(4, 2)
+    centred = corners - corners.mean(axis=0)
+    # Sort by angle around the centroid: clockwise on screen because y grows down.
+    ring = corners[np.argsort(np.arctan2(centred[:, 1], centred[:, 0]))]
+    # Start the ring at the smallest x+y corner (top-left). Unlike the sum/diff
+    # trick, this never assigns one point to two corners on a rotated quad.
+    start = int(np.argmin(ring.sum(axis=1)))
+    return np.roll(ring, -start, axis=0)
+
+
+def _four_point_warp(img: NDArrayU8, quad: NDArrayU8) -> NDArrayU8:
+    """Warp the image to a top-down view of the detected quadrilateral."""
+    rect = _order_corners(quad)
+    tl, tr, br, bl = rect
+
+    width_top = float(np.linalg.norm(tr - tl))
+    width_bot = float(np.linalg.norm(br - bl))
+    height_left = float(np.linalg.norm(bl - tl))
+    height_right = float(np.linalg.norm(br - tr))
+
+    max_width = round(max(width_top, width_bot))
+    max_height = round(max(height_left, height_right))
+    if max_width <= 1 or max_height <= 1:
+        return img
+
+    dst = np.array(
+        [
+            [0, 0],
+            [max_width - 1, 0],
+            [max_width - 1, max_height - 1],
+            [0, max_height - 1],
+        ],
+        dtype=np.float32,
+    )
+    matrix = cv2.getPerspectiveTransform(rect, dst)
+    return cv2.warpPerspective(img, matrix, (max_width, max_height))
+
+
+# ---------- shadow removal ----------
+
+
+def _remove_shadow(img: NDArrayU8) -> NDArrayU8:
+    """Background-division shadow removal applied per channel.
+
+    Idea: dilate + median-blur the channel to estimate the local background;
+    subtract from 255 minus the absolute diff to flatten lighting; normalize
+    back to 0-255. Cheap and surprisingly effective on phone shots.
+    """
+    planes = cv2.split(img)
+    cleaned: list[NDArrayU8] = []
+    kernel = np.ones((7, 7), np.uint8)
+    for plane in planes:
+        dilated = cv2.dilate(plane, kernel)
+        bg = cv2.medianBlur(dilated, 21)
+        diff: NDArrayU8 = (255 - cv2.absdiff(plane, bg)).astype(np.uint8)
+        # cv2 stubs reject None dst even though the runtime accepts it.
+        norm = cv2.normalize(  # type: ignore[call-overload]
+            diff,
+            None,
+            alpha=0,
+            beta=255,
+            norm_type=cv2.NORM_MINMAX,
+            dtype=cv2.CV_8U,
+        )
+        cleaned.append(norm)
+    return cv2.merge(cleaned)
diff --git a/src/ocr_sprint/pipeline/orchestrator.py b/src/ocr_sprint/pipeline/orchestrator.py
index 547993b..980f30b 100644
--- a/src/ocr_sprint/pipeline/orchestrator.py
+++ b/src/ocr_sprint/pipeline/orchestrator.py
@@ -14,6 +14,7 @@ from dataclasses import dataclass
 
 from ocr_sprint.config import get_settings
 from ocr_sprint.pipeline.confidence import compute_confidence, route
+from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
 from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
 from ocr_sprint.pipeline.extract.validators import validate_extraction
 from ocr_sprint.pipeline.ingest import detect_source_kind, ingest
@@ -56,10 +57,18 @@ def run_pipeline(content: bytes) -> PipelineOutput:
         deskew=s.preprocess_deskew,
         adaptive_threshold=s.preprocess_adaptive_threshold,
     )
+    # Document detection only makes sense on photographed images. PDF renders
+    # are already flat by construction, so we skip the heavy quad search there.
+    detect_cfg = DocumentDetectConfig(
+        detect_document=s.preprocess_detect_document and kind == SourceKind.IMAGE,
+        remove_shadow=s.preprocess_remove_shadow and kind == SourceKind.IMAGE,
+        min_area_fraction=s.preprocess_min_quad_area_fraction,
+    )
 
     ocr_pages: list[OCRPage] = []
     for page in pages:
-        cleaned = preprocess(page.image, pre_cfg)
+        corrected = detect_and_correct(page.image, detect_cfg)
+        cleaned = preprocess(corrected, pre_cfg)
         ocr_pages.append(run_ocr(cleaned))
 
     full_text = "\n".join(p.text for p in ocr_pages)
diff --git a/tests/unit/test_document_detect.py b/tests/unit/test_document_detect.py
new file mode 100644
index 0000000..2b0ebf2
--- /dev/null
+++ b/tests/unit/test_document_detect.py
@@ -0,0 +1,128 @@
+"""Tests for Phase 2 document detection + perspective correction."""
+
+from __future__ import annotations
+
+import cv2
+import numpy as np
+import pytest
+
+from ocr_sprint.pipeline.document_detect import (
+    DocumentDetectConfig,
+    _find_document_quad,
+    _four_point_warp,
+    _order_corners,
+    _remove_shadow,
+    detect_and_correct,
+)
+
+
+def _synthetic_skewed_doc(
+    canvas_size: tuple[int, int] = (900, 700),
+    page_size: tuple[int, int] = (500, 380),
+    skew: tuple[int, int, int, int] = (40, -20, -30, 25),
+) -> np.ndarray:
+    """Create a dark canvas containing a white quadrilateral 'document'.
+
+    `canvas_size` is (height, width); `page_size` is (width, height). Returns
+    a BGR uint8 image, so detection can be tested without real photos.
+    """
+    h, w = canvas_size
+    img = np.full((h, w, 3), 30, dtype=np.uint8)  # dark grey background
+    page_w, page_h = page_size
+    cx, cy = w // 2, h // 2
+    # one offset per corner (TL, TR, BR, BL) to simulate perspective distortion
+    sx_tl, sx_tr, sx_br, sx_bl = skew
+    pts = np.array(
+        [
+            [cx - page_w // 2 + sx_tl, cy - page_h // 2 + sx_tl],  # TL
+            [cx + page_w // 2 + sx_tr, cy - page_h // 2 - sx_tr],  # TR
+            [cx + page_w // 2 + sx_br, cy + page_h // 2 + sx_br],  # BR
+            [cx - page_w // 2 + sx_bl, cy + page_h // 2 - sx_bl],  # BL
+        ],
+        dtype=np.int32,
+    )
+    cv2.fillPoly(img, [pts], color=(245, 245, 245))
+    # add a fake text line so the page isn't a flat white blob
+    cv2.line(img, (cx - 100, cy), (cx + 100, cy), color=(20, 20, 20), thickness=4)
+    return img
+
+
+class TestOrderCorners:
+    def test_canonical_order(self) -> None:
+        # input pts unordered: BR, TL, TR, BL
+        pts = np.array([[100, 100], [10, 10], [110, 8], [9, 95]], dtype=np.float32)
+        ordered = _order_corners(pts)
+        # TL has the smallest sum
+        assert tuple(ordered[0]) == pytest.approx((10.0, 10.0))
+        # BR has the largest sum
+        assert tuple(ordered[2]) == pytest.approx((100.0, 100.0))
+
+
+class TestFindDocumentQuad:
+    def test_finds_quad_in_synthetic_image(self) -> None:
+        img = _synthetic_skewed_doc()
+        quad = _find_document_quad(img, min_area_fraction=0.10)
+        assert quad is not None
+        assert quad.shape == (4, 2)
+
+    def test_returns_none_for_flat_image(self) -> None:
+        flat = np.full((400, 400, 3), 200, dtype=np.uint8)
+        assert _find_document_quad(flat) is None
+
+    def test_returns_none_for_tiny_image(self) -> None:
+        tiny = np.full((20, 20, 3), 200, dtype=np.uint8)
+        assert _find_document_quad(tiny) is None
+
+
+class TestFourPointWarp:
+    def test_warp_produces_axis_aligned_rectangle(self) -> None:
+        img = _synthetic_skewed_doc()
+        quad = _find_document_quad(img, min_area_fraction=0.10)
+        assert quad is not None
+        warped = _four_point_warp(img, quad)
+        # warped output should be smaller than the canvas (it's only the page)
+        assert warped.shape[0] < img.shape[0]
+        assert warped.shape[1] < img.shape[1]
+        # majority of the warped image should be near-white (the page)
+        gray = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
+        bright_fraction = float((gray > 200).mean())
+        assert bright_fraction > 0.7
+
+
+class TestRemoveShadow:
+    def test_output_shape_matches_input(self) -> None:
+        img = np.random.default_rng(0).integers(0, 256, (200, 300, 3), dtype=np.uint8)
+        out = _remove_shadow(img)
+        assert out.shape == img.shape
+        assert out.dtype == np.uint8
+
+
+class TestDetectAndCorrect:
+    def test_full_pipeline_on_skewed_doc(self) -> None:
+        img = _synthetic_skewed_doc()
+        out = detect_and_correct(
+            img,
+            DocumentDetectConfig(detect_document=True, remove_shadow=True, min_area_fraction=0.1),
+        )
+        # the output is the cropped+flattened page, so it must be smaller
+        assert out.shape[0] < img.shape[0]
+
+    def test_disabled_returns_input_unchanged(self) -> None:
+        img = _synthetic_skewed_doc()
+        out = detect_and_correct(
+            img,
+            DocumentDetectConfig(detect_document=False, remove_shadow=False),
+        )
+        assert out.shape == img.shape
+        # disabled both flags → bytes-identical
+        assert np.array_equal(out, img)
+
+    def test_failed_detection_falls_back_to_input(self) -> None:
+        # flat grey image with no contour-detectable document
+        flat = np.full((300, 300, 3), 128, dtype=np.uint8)
+        out = detect_and_correct(
+            flat,
+            DocumentDetectConfig(detect_document=True, remove_shadow=False, min_area_fraction=0.5),
+        )
+        # no quad and no shadow pass, so the pixels must come back untouched
+        assert np.array_equal(out, flat)