commit 6b18868ca6c696c6830d1d3486aa3c7ba2c728a7 Author: Lukas Holzner Date: Tue Dec 2 09:22:42 2025 +0000 Add initial implementation of PDF to Excel table extractor - Create main script for extracting tables from PDF files and saving to Excel format. - Add dependency checks for required libraries and Ghostscript. - Implement functions for extracting tables and saving them to Excel. - Update README with usage instructions and examples. - Add devcontainer configuration for development environment. - Include .gitignore to exclude PDF and Excel files from version control. - Specify required packages in requirements.txt. diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..93248be --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,22 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/python +{ + "name": "Python 3", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "mcr.microsoft.com/devcontainers/python:2-3.13-trixie" + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "pip3 install --user -r requirements.txt", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e2d2551 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pdf +*.xlsx \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b18d5c8 --- /dev/null +++ b/README.md @@ -0,0 +1,131 @@ +# PDF to Excel Table Extractor + +A command-line tool to extract tables from PDF files and save them to Excel format. + +## Features + +- Extract tables from single or multiple pages +- Automatically detects tables with and without visible borders +- Save each table to separate sheets or combine them into one +- Preserves table headers + +## Prerequisites + +This tool requires **Ghostscript** to be installed on your system. + +### Install Ghostscript + +**Ubuntu/Debian:** +```bash +sudo apt-get install ghostscript +``` + +**Fedora/RHEL:** +```bash +sudo dnf install ghostscript +``` + +**macOS (using Homebrew):** +```bash +brew install ghostscript +``` + +**Windows:** +Download and install from: https://www.ghostscript.com/releases/gsdnld.html + +## Installation + +1. Clone or download this repository: + ```bash + cd pdf-to-excel + ``` + +2. Create a virtual environment (recommended): + ```bash + python3 -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + ``` + +3. Install the required Python packages: + ```bash + pip install -r requirements.txt + ``` + +## Usage + +### Basic Usage + +Extract all tables from a PDF: +```bash +python pdf_to_excel.py input.pdf output.xlsx +``` + +### Options + +| Option | Description | +|--------|-------------| +| `-p, --pages` | Specify pages to extract from. Default: `all` | +| `-c, --combine` | Combine all tables into a single sheet | + +### Examples + +Extract tables from all pages: +```bash +python pdf_to_excel.py document.pdf tables.xlsx +``` + +Extract tables from specific pages: +```bash +python pdf_to_excel.py document.pdf tables.xlsx --pages 1,2,3 +``` + +Extract tables from a page range: +```bash +python pdf_to_excel.py document.pdf tables.xlsx --pages 1-5 +``` + +Extract from a single page: +```bash +python pdf_to_excel.py document.pdf tables.xlsx --pages 1 +``` + +Combine all tables into one sheet: +```bash +python pdf_to_excel.py document.pdf tables.xlsx --combine +``` + +## Output + +By default, each extracted table is saved to a separate sheet in the Excel file: +- `Table_1` - First table found +- `Table_2` - Second table found +- etc. + +When using the `--combine` flag, all tables are merged into a single sheet called `Combined_Tables`. + +## Troubleshooting + +### "No tables found" +- The PDF might contain images of tables rather than actual table data. This tool works best with text-based PDFs. +- Try specifying specific pages with the `--pages` option. + +### Ghostscript errors +- Make sure Ghostscript is installed and available in your system PATH. +- On Windows, you may need to restart your terminal after installing Ghostscript. + +### Poor table extraction quality +- The tool automatically tries two extraction methods (lattice and stream). +- For tables with clear borders, the lattice method is used. +- For tables without borders, the stream method is used. + +## Dependencies + +- **camelot-py**: PDF table extraction library +- **pandas**: Data manipulation and Excel export +- **openpyxl**: Excel file writing +- **opencv-python**: Image processing for table detection +- **ghostscript**: PDF rendering (system dependency) + +## License + +MIT License diff --git a/pdf_to_excel.py b/pdf_to_excel.py new file mode 100755 index 0000000..56efd1c --- /dev/null +++ b/pdf_to_excel.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +""" +PDF to Excel Table Extractor + +A CLI tool to extract tables from PDF files and save them to Excel format. +""" + +import argparse +import shutil +import sys +from pathlib import Path + +# Check for required dependencies before importing them +def check_dependencies(): + """Check if all required dependencies are installed.""" + missing_deps = [] + + try: + import camelot + except ImportError: + missing_deps.append("camelot-py[cv]") + + try: + import pandas + except ImportError: + missing_deps.append("pandas") + + try: + import openpyxl + except ImportError: + missing_deps.append("openpyxl") + + try: + import cv2 + except ImportError as e: + if "libGL" in str(e): + print("Error: Missing system library 'libGL.so.1'.") + print("Install it with: sudo apt-get install libgl1") + sys.exit(1) + missing_deps.append("opencv-python") + + if missing_deps: + print("Error: Missing required Python packages:") + for dep in missing_deps: + print(f" - {dep}") + print(f"\nInstall them with: pip install {' '.join(missing_deps)}") + sys.exit(1) + + # Check for Ghostscript + if not shutil.which("gs") and not shutil.which("gswin64c") and not shutil.which("gswin32c"): + print("Error: Ghostscript is not installed or not in PATH.") + print("\nInstall Ghostscript:") + print(" Ubuntu/Debian: sudo apt-get install ghostscript") + print(" Fedora/RHEL: sudo dnf install ghostscript") + print(" macOS: brew install ghostscript") + print(" Windows: https://www.ghostscript.com/releases/gsdnld.html") + sys.exit(1) + +check_dependencies() + +import camelot +import pandas as pd + + +def extract_tables_from_pdf(pdf_path: str, pages: str = "all") -> list: + """ + Extract tables from a PDF file. + + Args: + pdf_path: Path to the PDF file + pages: Page numbers to extract from (default: "all") + Can be "all", "1", "1,2,3", or "1-3" + + Returns: + List of pandas DataFrames containing the extracted tables + """ + try: + # Try lattice method first (works better for tables with visible borders) + tables = camelot.read_pdf(pdf_path, pages=pages, flavor="lattice") + + if len(tables) == 0: + # Fall back to stream method (works for tables without visible borders) + print("No tables found with lattice method, trying stream method...") + tables = camelot.read_pdf(pdf_path, pages=pages, flavor="stream") + + return tables + except FileNotFoundError: + print(f"Error: PDF file '{pdf_path}' not found.") + sys.exit(1) + except PermissionError: + print(f"Error: Permission denied when accessing '{pdf_path}'.") + sys.exit(1) + except Exception as e: + error_msg = str(e).lower() + if "ghostscript" in error_msg: + print("Error: Ghostscript error occurred.") + print("Make sure Ghostscript is properly installed and accessible.") + print(f"Details: {e}") + elif "password" in error_msg or "encrypted" in error_msg: + print("Error: The PDF appears to be password-protected or encrypted.") + print("Please provide an unencrypted PDF file.") + elif "invalid" in error_msg and "page" in error_msg: + print(f"Error: Invalid page specification '{pages}'.") + print("Use 'all', a single page number (e.g., '1'), ") + print("a comma-separated list (e.g., '1,2,3'), or a range (e.g., '1-5').") + elif "no tables" in error_msg: + print("Error: No tables could be detected in the PDF.") + print("The PDF might contain images of tables rather than actual table data.") + else: + print(f"Error extracting tables: {e}") + sys.exit(1) + + +def save_tables_to_excel(tables, output_path: str, separate_sheets: bool = True) -> None: + """ + Save extracted tables to an Excel file. + + Args: + tables: List of camelot Table objects + output_path: Path to the output Excel file + separate_sheets: If True, save each table to a separate sheet + """ + if len(tables) == 0: + print("No tables found in the PDF.") + print("Tips:") + print(" - The PDF might contain images of tables rather than actual table data") + print(" - Try specifying different pages with the --pages option") + sys.exit(1) + + # Check if output directory exists + output_dir = Path(output_path).parent + if output_dir and not output_dir.exists(): + try: + output_dir.mkdir(parents=True, exist_ok=True) + print(f"Created output directory: {output_dir}") + except PermissionError: + print(f"Error: Permission denied when creating directory '{output_dir}'.") + sys.exit(1) + except Exception as e: + print(f"Error creating output directory: {e}") + sys.exit(1) + + # Check if output file already exists and is writable + output_file = Path(output_path) + if output_file.exists(): + try: + # Test if we can write to the file + with open(output_file, 'a'): + pass + except PermissionError: + print(f"Error: Cannot write to '{output_path}'. File may be open in another program.") + sys.exit(1) + + try: + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + if separate_sheets: + for i, table in enumerate(tables): + df = table.df + # Use first row as header + if len(df) > 0: + df.columns = df.iloc[0] + df = df[1:] + df.to_excel(writer, sheet_name=f"Table_{i+1}", index=False) + print(f"Table {i+1}: {len(df)} rows extracted (Page {table.page})") + else: + # Combine all tables into one sheet + all_dfs = [] + for table in tables: + df = table.df + if len(df) > 0: + df.columns = df.iloc[0] + df = df[1:] + all_dfs.append(df) + + if all_dfs: + combined_df = pd.concat(all_dfs, ignore_index=True) + combined_df.to_excel(writer, sheet_name="Combined_Tables", index=False) + print(f"Combined {len(tables)} tables into one sheet with {len(combined_df)} total rows") + else: + print("Warning: All tables were empty.") + except PermissionError: + print(f"Error: Permission denied when writing to '{output_path}'.") + print("Make sure the file is not open in another program.") + sys.exit(1) + except Exception as e: + print(f"Error saving Excel file: {e}") + sys.exit(1) + + +def validate_pages_arg(pages: str) -> bool: + """ + Validate the pages argument format. + + Args: + pages: Pages specification string + + Returns: + True if valid, False otherwise + """ + if pages.lower() == "all": + return True + + # Check for valid formats: "1", "1,2,3", "1-5", "1,3-5,7" + import re + pattern = r'^(\d+(-\d+)?)(,\d+(-\d+)?)*$' + return bool(re.match(pattern, pages.replace(" ", ""))) + + +def main(): + parser = argparse.ArgumentParser( + description="Extract tables from PDF files and save to Excel format.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s input.pdf output.xlsx + %(prog)s input.pdf output.xlsx --pages 1,2,3 + %(prog)s input.pdf output.xlsx --pages 1-5 + %(prog)s input.pdf output.xlsx --combine + """ + ) + + parser.add_argument( + "input_pdf", + help="Path to the input PDF file" + ) + + parser.add_argument( + "output_excel", + help="Path to the output Excel file" + ) + + parser.add_argument( + "-p", "--pages", + default="all", + help="Pages to extract tables from (default: all). " + "Can be 'all', '1', '1,2,3', or '1-5'" + ) + + parser.add_argument( + "-c", "--combine", + action="store_true", + help="Combine all tables into a single sheet instead of separate sheets" + ) + + args = parser.parse_args() + + # Validate input file + input_path = Path(args.input_pdf) + if not input_path.exists(): + print(f"Error: Input file '{args.input_pdf}' does not exist.") + sys.exit(1) + + if not input_path.is_file(): + print(f"Error: '{args.input_pdf}' is not a file.") + sys.exit(1) + + if not input_path.suffix.lower() == ".pdf": + print(f"Warning: Input file may not be a PDF (extension: {input_path.suffix})") + + # Check if input file is readable + try: + with open(input_path, 'rb') as f: + # Read first few bytes to check if it's a valid PDF + header = f.read(5) + if header != b'%PDF-': + print(f"Warning: File does not appear to be a valid PDF (missing PDF header).") + except PermissionError: + print(f"Error: Permission denied when reading '{args.input_pdf}'.") + sys.exit(1) + except Exception as e: + print(f"Error reading input file: {e}") + sys.exit(1) + + # Validate pages argument + if not validate_pages_arg(args.pages): + print(f"Error: Invalid page specification '{args.pages}'.") + print("Valid formats:") + print(" 'all' - Extract from all pages") + print(" '1' - Extract from page 1") + print(" '1,2,3' - Extract from pages 1, 2, and 3") + print(" '1-5' - Extract from pages 1 through 5") + print(" '1,3-5,7' - Extract from pages 1, 3-5, and 7") + sys.exit(1) + + # Ensure output has .xlsx extension + output_path = args.output_excel + if not output_path.lower().endswith(".xlsx"): + output_path += ".xlsx" + + print(f"Extracting tables from: {args.input_pdf}") + print(f"Pages: {args.pages}") + + # Extract tables + tables = extract_tables_from_pdf(args.input_pdf, args.pages) + + print(f"Found {len(tables)} table(s)") + + # Save to Excel + save_tables_to_excel(tables, output_path, separate_sheets=not args.combine) + + print(f"Successfully saved to: {output_path}") + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print("\nOperation cancelled by user.") + sys.exit(130) + except Exception as e: + print(f"Unexpected error: {e}") + print("If this problem persists, please report it with the full error message.") + sys.exit(1) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ff1b4c7 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +camelot-py[base]==0.11.0 +pandas>=2.0.0 +openpyxl>=3.1.0 +opencv-python>=4.8.0 +ghostscript>=0.7