Phase 2: document detection + perspective correction + shadow removal

Adds OpenCV-based phone-photo handling that runs before the standard preprocessing pipeline for IMAGE source kinds (PDF renders are flat by construction and skip this stage).

Pipeline additions in src/ocr_sprint/pipeline/document_detect.py:
- _find_document_quad: Canny + dilate + contour search, picks the largest convex 4-point polygon above a configurable area threshold; fails gracefully and returns None when no usable quad is found.
- _four_point_warp: orders corners (TL/TR/BR/BL via sum/diff trick) and runs cv2.getPerspectiveTransform + warpPerspective.
- _remove_shadow: per-channel background-division (dilate + median blur + 255 - absdiff + normalize) for uneven phone-shot lighting.
- detect_and_correct: top-level entrypoint with graceful fallback to the original image when detection fails.

Wired into the synchronous orchestrator: only enabled for IMAGE sources, skipped for PDF.

New settings:
- preprocess_detect_document (default: true)
- preprocess_remove_shadow (default: true)
- preprocess_min_quad_area_fraction (default: 0.20)

Tests: 9 new unit tests covering corner ordering, quad detection on synthetic skewed documents, perspective warp output sanity, shadow removal shape preservation, full-pipeline behavior, and graceful fallback when detection fails. 70 tests total, all green.

ML-based dewarping (DewarpNet) and DocTR detector are deferred to a future phase per the roadmap; the existing API is structured so they can be added as alternative backends behind DocumentDetectConfig.

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 15:06:58 +00:00
parent ca0c0a0428
commit d0e1835cc1
6 changed files with 357 additions and 5 deletions
--- a/tests/unit/test_document_detect.py
+++ b/tests/unit/test_document_detect.py
@@ -0,0 +1,128 @@
+"""Tests for Phase 2 document detection + perspective correction."""
+
+from __future__ import annotations
+
+import cv2
+import numpy as np
+import pytest
+
+from ocr_sprint.pipeline.document_detect import (
+    DocumentDetectConfig,
+    _find_document_quad,
+    _four_point_warp,
+    _order_corners,
+    _remove_shadow,
+    detect_and_correct,
+)
+
+
+def _synthetic_skewed_doc(
+    canvas_size: tuple[int, int] = (900, 700),
+    page_size: tuple[int, int] = (500, 380),
+    skew: tuple[int, int, int, int] = (40, -20, -30, 25),
+) -> np.ndarray:
+    """Return a dark BGR uint8 canvas holding one bright, skewed 'page'.
+
+    ``canvas_size`` is (height, width) while ``page_size`` is (width, height).
+    ``skew`` gives one offset per corner (TL, TR, BR, BL), added to that
+    corner's x and y, with the y sign flipped for TR and BL.
+    """
+    canvas_h, canvas_w = canvas_size
+    page_w, page_h = page_size
+    d_tl, d_tr, d_br, d_bl = skew
+    cx, cy = canvas_w // 2, canvas_h // 2
+    half_w, half_h = page_w // 2, page_h // 2
+    left, right, top, bottom = cx - half_w, cx + half_w, cy - half_h, cy + half_h
+    # nudging each corner off the centred rectangle fakes perspective distortion
+    corners = [
+        [left + d_tl, top + d_tl],  # TL
+        [right + d_tr, top - d_tr],  # TR
+        [right + d_br, bottom + d_br],  # BR
+        [left + d_bl, bottom - d_bl],  # BL
+    ]
+    canvas = np.full((canvas_h, canvas_w, 3), 30, dtype=np.uint8)  # dark grey
+    cv2.fillPoly(canvas, [np.array(corners, dtype=np.int32)], color=(245, 245, 245))
+    # a dark stroke through the centre stands in for text on the page
+    cv2.line(canvas, (cx - 100, cy), (cx + 100, cy), color=(20, 20, 20), thickness=4)
+    return canvas
+
+
+class TestOrderCorners:
+    def test_canonical_order(self) -> None:
+        """Shuffled corners (BR, TL, TR, BL) must come back as TL, TR, BR, BL."""
+        pts = np.array([[100, 100], [10, 10], [110, 8], [9, 95]], dtype=np.float32)
+        ordered = _order_corners(pts)
+        assert tuple(ordered[0]) == pytest.approx((10.0, 10.0))  # TL: smallest x + y
+        assert tuple(ordered[1]) == pytest.approx((110.0, 8.0))  # TR: largest x - y
+        assert tuple(ordered[2]) == pytest.approx((100.0, 100.0))  # BR: largest x + y
+        assert tuple(ordered[3]) == pytest.approx((9.0, 95.0))  # BL: largest y - x
+
+
+class TestFindDocumentQuad:
+    def test_finds_quad_in_synthetic_image(self) -> None:
+        # the default synthetic page covers roughly 23% of its canvas
+        corners = _find_document_quad(_synthetic_skewed_doc(), min_area_fraction=0.10)
+        assert corners is not None
+        assert corners.shape == (4, 2)  # four 2-D points
+
+    def test_returns_none_for_flat_image(self) -> None:
+        uniform = np.full((400, 400, 3), 200, dtype=np.uint8)  # single colour, no edges
+        assert _find_document_quad(uniform) is None
+
+    def test_returns_none_for_tiny_image(self) -> None:
+        speck = np.full((20, 20, 3), 200, dtype=np.uint8)  # 20x20 and featureless
+        assert _find_document_quad(speck) is None
+
+
+class TestFourPointWarp:
+    def test_warp_produces_axis_aligned_rectangle(self) -> None:
+        canvas = _synthetic_skewed_doc()
+        corners = _find_document_quad(canvas, min_area_fraction=0.10)
+        assert corners is not None
+        flattened = _four_point_warp(canvas, corners)
+        # only the page survives the warp, so both dimensions must shrink
+        out_h, out_w = flattened.shape[:2]
+        assert out_h < canvas.shape[0]
+        assert out_w < canvas.shape[1]
+        # the page is near-white, so bright pixels (> 200) should dominate
+        luma = cv2.cvtColor(flattened, cv2.COLOR_BGR2GRAY)
+        assert float((luma > 200).mean()) > 0.7
+
+
+class TestRemoveShadow:
+    def test_output_shape_matches_input(self) -> None:
+        rng = np.random.default_rng(0)  # fixed seed keeps any failure reproducible
+        img = rng.integers(0, 256, (200, 300, 3), dtype=np.uint8)
+        out = _remove_shadow(img)
+        assert (out.shape, out.dtype) == (img.shape, np.uint8)
+
+
+class TestDetectAndCorrect:
+    def test_full_pipeline_on_skewed_doc(self) -> None:
+        img = _synthetic_skewed_doc()
+        out = detect_and_correct(
+            img,
+            DocumentDetectConfig(detect_document=True, remove_shadow=True, min_area_fraction=0.1),
+        )
+        # the output is the cropped+flattened page, so it must be smaller
+        assert out.shape[0] < img.shape[0]
+
+    def test_disabled_returns_input_unchanged(self) -> None:
+        img = _synthetic_skewed_doc()
+        out = detect_and_correct(
+            img,
+            DocumentDetectConfig(detect_document=False, remove_shadow=False),
+        )
+        assert out.shape == img.shape
+        # both stages disabled → output must be byte-identical to the input
+        assert np.array_equal(out, img)
+
+    def test_failed_detection_falls_back_to_input(self) -> None:
+        # featureless grey frame: there is no page outline to detect
+        flat = np.full((300, 300, 3), 128, dtype=np.uint8)
+        out = detect_and_correct(
+            flat,
+            DocumentDetectConfig(detect_document=True, remove_shadow=False, min_area_fraction=0.5),
+        )
+        assert out.shape == flat.shape
+        assert np.array_equal(out, flat)  # shadow removal is off: pixels must be untouched