- Create main script for extracting tables from PDF files and saving to Excel format. - Add dependency checks for required libraries and Ghostscript. - Implement functions for extracting tables and saving them to Excel. - Update README with usage instructions and examples. - Add devcontainer configuration for development environment. - Include .gitignore to exclude PDF and Excel files from version control. - Specify required packages in requirements.txt.
314 lines
11 KiB
Python
Executable File
314 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
PDF to Excel Table Extractor
|
|
|
|
A CLI tool to extract tables from PDF files and save them to Excel format.
|
|
"""
|
|
|
|
import argparse
|
|
import shutil
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Check for required dependencies before importing them
|
|
def check_dependencies():
|
|
"""Check if all required dependencies are installed."""
|
|
missing_deps = []
|
|
|
|
try:
|
|
import camelot
|
|
except ImportError:
|
|
missing_deps.append("camelot-py[cv]")
|
|
|
|
try:
|
|
import pandas
|
|
except ImportError:
|
|
missing_deps.append("pandas")
|
|
|
|
try:
|
|
import openpyxl
|
|
except ImportError:
|
|
missing_deps.append("openpyxl")
|
|
|
|
try:
|
|
import cv2
|
|
except ImportError as e:
|
|
if "libGL" in str(e):
|
|
print("Error: Missing system library 'libGL.so.1'.")
|
|
print("Install it with: sudo apt-get install libgl1")
|
|
sys.exit(1)
|
|
missing_deps.append("opencv-python")
|
|
|
|
if missing_deps:
|
|
print("Error: Missing required Python packages:")
|
|
for dep in missing_deps:
|
|
print(f" - {dep}")
|
|
print(f"\nInstall them with: pip install {' '.join(missing_deps)}")
|
|
sys.exit(1)
|
|
|
|
# Check for Ghostscript
|
|
if not shutil.which("gs") and not shutil.which("gswin64c") and not shutil.which("gswin32c"):
|
|
print("Error: Ghostscript is not installed or not in PATH.")
|
|
print("\nInstall Ghostscript:")
|
|
print(" Ubuntu/Debian: sudo apt-get install ghostscript")
|
|
print(" Fedora/RHEL: sudo dnf install ghostscript")
|
|
print(" macOS: brew install ghostscript")
|
|
print(" Windows: https://www.ghostscript.com/releases/gsdnld.html")
|
|
sys.exit(1)
|
|
|
|
check_dependencies()
|
|
|
|
import camelot
|
|
import pandas as pd
|
|
|
|
|
|
def extract_tables_from_pdf(pdf_path: str, pages: str = "all") -> list:
|
|
"""
|
|
Extract tables from a PDF file.
|
|
|
|
Args:
|
|
pdf_path: Path to the PDF file
|
|
pages: Page numbers to extract from (default: "all")
|
|
Can be "all", "1", "1,2,3", or "1-3"
|
|
|
|
Returns:
|
|
List of pandas DataFrames containing the extracted tables
|
|
"""
|
|
try:
|
|
# Try lattice method first (works better for tables with visible borders)
|
|
tables = camelot.read_pdf(pdf_path, pages=pages, flavor="lattice")
|
|
|
|
if len(tables) == 0:
|
|
# Fall back to stream method (works for tables without visible borders)
|
|
print("No tables found with lattice method, trying stream method...")
|
|
tables = camelot.read_pdf(pdf_path, pages=pages, flavor="stream")
|
|
|
|
return tables
|
|
except FileNotFoundError:
|
|
print(f"Error: PDF file '{pdf_path}' not found.")
|
|
sys.exit(1)
|
|
except PermissionError:
|
|
print(f"Error: Permission denied when accessing '{pdf_path}'.")
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
error_msg = str(e).lower()
|
|
if "ghostscript" in error_msg:
|
|
print("Error: Ghostscript error occurred.")
|
|
print("Make sure Ghostscript is properly installed and accessible.")
|
|
print(f"Details: {e}")
|
|
elif "password" in error_msg or "encrypted" in error_msg:
|
|
print("Error: The PDF appears to be password-protected or encrypted.")
|
|
print("Please provide an unencrypted PDF file.")
|
|
elif "invalid" in error_msg and "page" in error_msg:
|
|
print(f"Error: Invalid page specification '{pages}'.")
|
|
print("Use 'all', a single page number (e.g., '1'), ")
|
|
print("a comma-separated list (e.g., '1,2,3'), or a range (e.g., '1-5').")
|
|
elif "no tables" in error_msg:
|
|
print("Error: No tables could be detected in the PDF.")
|
|
print("The PDF might contain images of tables rather than actual table data.")
|
|
else:
|
|
print(f"Error extracting tables: {e}")
|
|
sys.exit(1)
|
|
|
|
|
|
def save_tables_to_excel(tables, output_path: str, separate_sheets: bool = True) -> None:
|
|
"""
|
|
Save extracted tables to an Excel file.
|
|
|
|
Args:
|
|
tables: List of camelot Table objects
|
|
output_path: Path to the output Excel file
|
|
separate_sheets: If True, save each table to a separate sheet
|
|
"""
|
|
if len(tables) == 0:
|
|
print("No tables found in the PDF.")
|
|
print("Tips:")
|
|
print(" - The PDF might contain images of tables rather than actual table data")
|
|
print(" - Try specifying different pages with the --pages option")
|
|
sys.exit(1)
|
|
|
|
# Check if output directory exists
|
|
output_dir = Path(output_path).parent
|
|
if output_dir and not output_dir.exists():
|
|
try:
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
print(f"Created output directory: {output_dir}")
|
|
except PermissionError:
|
|
print(f"Error: Permission denied when creating directory '{output_dir}'.")
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
print(f"Error creating output directory: {e}")
|
|
sys.exit(1)
|
|
|
|
# Check if output file already exists and is writable
|
|
output_file = Path(output_path)
|
|
if output_file.exists():
|
|
try:
|
|
# Test if we can write to the file
|
|
with open(output_file, 'a'):
|
|
pass
|
|
except PermissionError:
|
|
print(f"Error: Cannot write to '{output_path}'. File may be open in another program.")
|
|
sys.exit(1)
|
|
|
|
try:
|
|
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
|
|
if separate_sheets:
|
|
for i, table in enumerate(tables):
|
|
df = table.df
|
|
# Use first row as header
|
|
if len(df) > 0:
|
|
df.columns = df.iloc[0]
|
|
df = df[1:]
|
|
df.to_excel(writer, sheet_name=f"Table_{i+1}", index=False)
|
|
print(f"Table {i+1}: {len(df)} rows extracted (Page {table.page})")
|
|
else:
|
|
# Combine all tables into one sheet
|
|
all_dfs = []
|
|
for table in tables:
|
|
df = table.df
|
|
if len(df) > 0:
|
|
df.columns = df.iloc[0]
|
|
df = df[1:]
|
|
all_dfs.append(df)
|
|
|
|
if all_dfs:
|
|
combined_df = pd.concat(all_dfs, ignore_index=True)
|
|
combined_df.to_excel(writer, sheet_name="Combined_Tables", index=False)
|
|
print(f"Combined {len(tables)} tables into one sheet with {len(combined_df)} total rows")
|
|
else:
|
|
print("Warning: All tables were empty.")
|
|
except PermissionError:
|
|
print(f"Error: Permission denied when writing to '{output_path}'.")
|
|
print("Make sure the file is not open in another program.")
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
print(f"Error saving Excel file: {e}")
|
|
sys.exit(1)
|
|
|
|
|
|
def validate_pages_arg(pages: str) -> bool:
|
|
"""
|
|
Validate the pages argument format.
|
|
|
|
Args:
|
|
pages: Pages specification string
|
|
|
|
Returns:
|
|
True if valid, False otherwise
|
|
"""
|
|
if pages.lower() == "all":
|
|
return True
|
|
|
|
# Check for valid formats: "1", "1,2,3", "1-5", "1,3-5,7"
|
|
import re
|
|
pattern = r'^(\d+(-\d+)?)(,\d+(-\d+)?)*$'
|
|
return bool(re.match(pattern, pages.replace(" ", "")))
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Extract tables from PDF files and save to Excel format.",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
%(prog)s input.pdf output.xlsx
|
|
%(prog)s input.pdf output.xlsx --pages 1,2,3
|
|
%(prog)s input.pdf output.xlsx --pages 1-5
|
|
%(prog)s input.pdf output.xlsx --combine
|
|
"""
|
|
)
|
|
|
|
parser.add_argument(
|
|
"input_pdf",
|
|
help="Path to the input PDF file"
|
|
)
|
|
|
|
parser.add_argument(
|
|
"output_excel",
|
|
help="Path to the output Excel file"
|
|
)
|
|
|
|
parser.add_argument(
|
|
"-p", "--pages",
|
|
default="all",
|
|
help="Pages to extract tables from (default: all). "
|
|
"Can be 'all', '1', '1,2,3', or '1-5'"
|
|
)
|
|
|
|
parser.add_argument(
|
|
"-c", "--combine",
|
|
action="store_true",
|
|
help="Combine all tables into a single sheet instead of separate sheets"
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Validate input file
|
|
input_path = Path(args.input_pdf)
|
|
if not input_path.exists():
|
|
print(f"Error: Input file '{args.input_pdf}' does not exist.")
|
|
sys.exit(1)
|
|
|
|
if not input_path.is_file():
|
|
print(f"Error: '{args.input_pdf}' is not a file.")
|
|
sys.exit(1)
|
|
|
|
if not input_path.suffix.lower() == ".pdf":
|
|
print(f"Warning: Input file may not be a PDF (extension: {input_path.suffix})")
|
|
|
|
# Check if input file is readable
|
|
try:
|
|
with open(input_path, 'rb') as f:
|
|
# Read first few bytes to check if it's a valid PDF
|
|
header = f.read(5)
|
|
if header != b'%PDF-':
|
|
print(f"Warning: File does not appear to be a valid PDF (missing PDF header).")
|
|
except PermissionError:
|
|
print(f"Error: Permission denied when reading '{args.input_pdf}'.")
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
print(f"Error reading input file: {e}")
|
|
sys.exit(1)
|
|
|
|
# Validate pages argument
|
|
if not validate_pages_arg(args.pages):
|
|
print(f"Error: Invalid page specification '{args.pages}'.")
|
|
print("Valid formats:")
|
|
print(" 'all' - Extract from all pages")
|
|
print(" '1' - Extract from page 1")
|
|
print(" '1,2,3' - Extract from pages 1, 2, and 3")
|
|
print(" '1-5' - Extract from pages 1 through 5")
|
|
print(" '1,3-5,7' - Extract from pages 1, 3-5, and 7")
|
|
sys.exit(1)
|
|
|
|
# Ensure output has .xlsx extension
|
|
output_path = args.output_excel
|
|
if not output_path.lower().endswith(".xlsx"):
|
|
output_path += ".xlsx"
|
|
|
|
print(f"Extracting tables from: {args.input_pdf}")
|
|
print(f"Pages: {args.pages}")
|
|
|
|
# Extract tables
|
|
tables = extract_tables_from_pdf(args.input_pdf, args.pages)
|
|
|
|
print(f"Found {len(tables)} table(s)")
|
|
|
|
# Save to Excel
|
|
save_tables_to_excel(tables, output_path, separate_sheets=not args.combine)
|
|
|
|
print(f"Successfully saved to: {output_path}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
try:
|
|
main()
|
|
except KeyboardInterrupt:
|
|
print("\nOperation cancelled by user.")
|
|
sys.exit(130)
|
|
except Exception as e:
|
|
print(f"Unexpected error: {e}")
|
|
print("If this problem persists, please report it with the full error message.")
|
|
sys.exit(1)
|