Files
pdf-reader/pdf_ocr.py
T
k1nq 546c26157d Add initial project structure with OCR functionality and dependencies
- Create .gitignore to exclude Python-generated files and virtual environments
- Add .python-version for Python version management
- Implement main OCR script (main.py) to process PDF files and extract text
- Add PDF processing functions in pdf_ocr.py
- Update README.md with project description, requirements, and usage instructions
- Include pyproject.toml for project metadata and dependencies
- Add uv.lock for dependency resolution
2026-03-16 20:18:38 +05:00

61 lines
1.8 KiB
Python

from __future__ import annotations
from pathlib import Path
from typing import Callable
import fitz
import pytesseract
from PIL import Image
def get_total_pages(pdf_path: Path) -> int:
    """Open the PDF at ``pdf_path`` and report how many pages it contains."""
    with fitz.open(pdf_path) as pdf:
        # Read the count while the document is still open.
        page_total = pdf.page_count
    return page_total
def ocr_pdf_range(
pdf_path: Path,
start_page: int,
end_page: int,
*,
lang: str,
dpi: int,
rotate: int = 0,
on_progress: Callable[[int, int, int], None] | None = None,
) -> list[tuple[int, str]]:
"""Run OCR for the selected inclusive page range and return extracted text."""
scale = dpi / 72.0
matrix = fitz.Matrix(scale, scale)
total_in_range = end_page - start_page + 1
results: list[tuple[int, str]] = []
with fitz.open(pdf_path) as document:
for index, page_number in enumerate(range(start_page, end_page + 1), start=1):
if on_progress is not None:
on_progress(index, total_in_range, page_number)
page = document.load_page(page_number - 1)
pixmap = page.get_pixmap(matrix=matrix, alpha=False)
image = Image.frombytes(
"RGB",
(pixmap.width, pixmap.height),
pixmap.samples,
)
try:
if rotate:
rotated_image = image.rotate(-rotate, expand=True)
else:
rotated_image = image
text = pytesseract.image_to_string(rotated_image, lang=lang)
except Exception as exc:
raise RuntimeError(f"OCR failed on page {page_number}") from exc
finally:
if rotate:
rotated_image.close()
image.close()
results.append((page_number, text))
return results