Add initial project structure with OCR functionality and dependencies

- Create .gitignore to exclude Python-generated files and virtual environments
- Add .python-version for Python version management
- Implement main OCR script (main.py) to process PDF files and extract text
- Add PDF processing functions in pdf_ocr.py
- Update README.md with project description, requirements, and usage instructions
- Include pyproject.toml for project metadata and dependencies
- Add uv.lock for dependency resolution
2026-03-16 20:18:38 +05:00
commit 546c26157d
7 changed files with 342 additions and 0 deletions
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Callable
+
+import fitz
+import pytesseract
+from PIL import Image
+
+
+def get_total_pages(pdf_path: Path) -> int:
+    """Return total number of pages in a PDF file."""
+    with fitz.open(pdf_path) as document:
+        return document.page_count
+
+
+def ocr_pdf_range(
+    pdf_path: Path,
+    start_page: int,
+    end_page: int,
+    *,
+    lang: str,
+    dpi: int,
+    rotate: int = 0,
+    on_progress: Callable[[int, int, int], None] | None = None,
+) -> list[tuple[int, str]]:
+    """Run OCR for the selected inclusive page range and return extracted text."""
+    scale = dpi / 72.0
+    matrix = fitz.Matrix(scale, scale)
+    total_in_range = end_page - start_page + 1
+    results: list[tuple[int, str]] = []
+
+    with fitz.open(pdf_path) as document:
+        for index, page_number in enumerate(range(start_page, end_page + 1), start=1):
+            if on_progress is not None:
+                on_progress(index, total_in_range, page_number)
+
+            page = document.load_page(page_number - 1)
+            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
+            image = Image.frombytes(
+                "RGB",
+                (pixmap.width, pixmap.height),
+                pixmap.samples,
+            )
+            try:
+                if rotate:
+                    rotated_image = image.rotate(-rotate, expand=True)
+                else:
+                    rotated_image = image
+
+                text = pytesseract.image_to_string(rotated_image, lang=lang)
+            except Exception as exc:
+                raise RuntimeError(f"OCR failed on page {page_number}") from exc
+            finally:
+                if rotate:
+                    rotated_image.close()
+                image.close()
+
+            results.append((page_number, text))
+
+    return results