Update README and main.py to clarify optional page range arguments
This commit is contained in:
@@ -19,6 +19,8 @@ Run OCR for an inclusive 1-based page range and write to a text file:
|
|||||||
|
|
||||||
uv run python main.py --input "input.pdf" --start 5 --end 12 --output "result.txt"
|
uv run python main.py --input "input.pdf" --start 5 --end 12 --output "result.txt"
|
||||||
|
|
||||||
|
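Both `--start` and `--end` are optional: if `--start` is omitted, OCR begins at the first page, and if `--end` is omitted, it runs through the last page. If both are omitted, OCR runs from the first page to the last page.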
|
||||||
|
|
||||||
Optional flags:
|
Optional flags:
|
||||||
|
|
||||||
- --lang (default: rus+eng)
|
- --lang (default: rus+eng)
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
description="Recognize text from selected PDF pages and save to a text file.",
|
description="Recognize text from selected PDF pages and save to a text file.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--input", required=True, type=Path, help="Path to source PDF file.")
|
parser.add_argument("--input", required=True, type=Path, help="Path to source PDF file.")
|
||||||
parser.add_argument("--start", required=True, type=int, help="Start page (1-based, inclusive).")
|
parser.add_argument("--start", type=int, help="Start page (1-based, inclusive). Default: 1.")
|
||||||
parser.add_argument("--end", required=True, type=int, help="End page (1-based, inclusive).")
|
parser.add_argument("--end", type=int, help="End page (1-based, inclusive). Default: last page.")
|
||||||
parser.add_argument("--output", required=True, type=Path, help="Path to output TXT file.")
|
parser.add_argument("--output", required=True, type=Path, help="Path to output TXT file.")
|
||||||
parser.add_argument("--lang", default="rus+eng", help="Tesseract language(s), example: rus+eng.")
|
parser.add_argument("--lang", default="rus+eng", help="Tesseract language(s), example: rus+eng.")
|
||||||
parser.add_argument("--dpi", default=300, type=int, help="Render DPI before OCR. Default: 300.")
|
parser.add_argument("--dpi", default=300, type=int, help="Render DPI before OCR. Default: 300.")
|
||||||
@@ -37,6 +37,17 @@ def validate_args(args: argparse.Namespace) -> int:
|
|||||||
if input_path.suffix.lower() != ".pdf":
|
if input_path.suffix.lower() != ".pdf":
|
||||||
raise ValueError(f"Input file must have .pdf extension: {input_path}")
|
raise ValueError(f"Input file must have .pdf extension: {input_path}")
|
||||||
|
|
||||||
|
total_pages = get_total_pages(input_path)
|
||||||
|
|
||||||
|
if args.start is None and args.end is None:
|
||||||
|
args.start = 1
|
||||||
|
args.end = total_pages
|
||||||
|
else:
|
||||||
|
if args.start is None:
|
||||||
|
args.start = 1
|
||||||
|
if args.end is None:
|
||||||
|
args.end = total_pages
|
||||||
|
|
||||||
if args.start < 1:
|
if args.start < 1:
|
||||||
raise ValueError("--start must be >= 1.")
|
raise ValueError("--start must be >= 1.")
|
||||||
|
|
||||||
@@ -50,7 +61,6 @@ def validate_args(args: argparse.Namespace) -> int:
|
|||||||
if args.rotate not in allowed_rotations:
|
if args.rotate not in allowed_rotations:
|
||||||
raise ValueError("--rotate must be one of: 0, 90, 180, 270, -90, -180, -270.")
|
raise ValueError("--rotate must be one of: 0, 90, 180, 270, -90, -180, -270.")
|
||||||
|
|
||||||
total_pages = get_total_pages(input_path)
|
|
||||||
if args.end > total_pages:
|
if args.end > total_pages:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"--end ({args.end}) is out of range. Document has {total_pages} pages.",
|
f"--end ({args.end}) is out of range. Document has {total_pages} pages.",
|
||||||
|
|||||||
Reference in New Issue
Block a user