Add initial project structure with OCR functionality and dependencies
- Create .gitignore to exclude Python-generated files and virtual environments
- Add .python-version for Python version management
- Implement main OCR script (main.py) to process PDF files and extract text
- Add PDF processing functions in pdf_ocr.py
- Update README.md with project description, requirements, and usage instructions
- Include pyproject.toml for project metadata and dependencies
- Add uv.lock for dependency resolution
This commit is contained in:
+61
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
import fitz
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def get_total_pages(pdf_path: Path) -> int:
    """Open the PDF at ``pdf_path`` and report how many pages it contains."""
    with fitz.open(pdf_path) as pdf:
        page_total = pdf.page_count
    return page_total
|
||||
|
||||
|
||||
def ocr_pdf_range(
|
||||
pdf_path: Path,
|
||||
start_page: int,
|
||||
end_page: int,
|
||||
*,
|
||||
lang: str,
|
||||
dpi: int,
|
||||
rotate: int = 0,
|
||||
on_progress: Callable[[int, int, int], None] | None = None,
|
||||
) -> list[tuple[int, str]]:
|
||||
"""Run OCR for the selected inclusive page range and return extracted text."""
|
||||
scale = dpi / 72.0
|
||||
matrix = fitz.Matrix(scale, scale)
|
||||
total_in_range = end_page - start_page + 1
|
||||
results: list[tuple[int, str]] = []
|
||||
|
||||
with fitz.open(pdf_path) as document:
|
||||
for index, page_number in enumerate(range(start_page, end_page + 1), start=1):
|
||||
if on_progress is not None:
|
||||
on_progress(index, total_in_range, page_number)
|
||||
|
||||
page = document.load_page(page_number - 1)
|
||||
pixmap = page.get_pixmap(matrix=matrix, alpha=False)
|
||||
image = Image.frombytes(
|
||||
"RGB",
|
||||
(pixmap.width, pixmap.height),
|
||||
pixmap.samples,
|
||||
)
|
||||
try:
|
||||
if rotate:
|
||||
rotated_image = image.rotate(-rotate, expand=True)
|
||||
else:
|
||||
rotated_image = image
|
||||
|
||||
text = pytesseract.image_to_string(rotated_image, lang=lang)
|
||||
except Exception as exc:
|
||||
raise RuntimeError(f"OCR failed on page {page_number}") from exc
|
||||
finally:
|
||||
if rotate:
|
||||
rotated_image.close()
|
||||
image.close()
|
||||
|
||||
results.append((page_number, text))
|
||||
|
||||
return results
|
||||
Reference in New Issue
Block a user