# PDFAlchemy
[PyPI version](https://badge.fury.io/py/pdfalchemy)
[Python 3.9+](https://www.python.org/downloads/)
[License: MIT](https://opensource.org/licenses/MIT)
[Code style: black](https://github.com/psf/black)
A powerful Python library for advanced PDF processing with a focus on image extraction and conversion capabilities.
## Features
- **PDF to PNG Conversion**: Convert PDF pages to high-quality PNG images with customizable DPI
- **Image Extraction**: Extract individual images from PDF pages using advanced computer vision algorithms
- **Flood Fill Algorithm**: Intelligent image detection and separation using morphological operations
- **Size and Aspect Ratio Filtering**: Filter extracted images based on customizable criteria
- **Base64 Encoding**: Convert PDF pages to base64-encoded PNG strings for web applications
- **Command Line Interface**: Easy-to-use CLI for batch processing and automation
- **Type Safety**: Full type hints and Pydantic validation for robust data handling
- **Comprehensive Testing**: Extensive test suite with 28+ test cases
## Installation
### From PyPI (Recommended)
```bash
pip install pdfalchemy
```
### From Source
```bash
git clone https://github.com/jainparul9814/pdfalchemy.git
cd pdfalchemy
# Install base dependencies
pip install -r requirements.txt
# Install in development mode
pip install -e .
# Install development dependencies (optional)
pip install -r requirements-dev.txt
```
## Quick Start
### Python API
```python
from pdfalchemy import PDFProcessor, PNGConversionInput, ImageExtractionInput
# Initialize processor
processor = PDFProcessor()
# Convert PDF to PNG
with open("document.pdf", "rb") as f:
    pdf_bytes = f.read()
png_input = PNGConversionInput(
    pdf_bytes=pdf_bytes,
    dpi=300,  # High resolution
    first_page=1,
    last_page=5
)
png_result = processor.to_png(png_input)
print(f"Converted {png_result.total_pages} pages")
# Extract images from PNG
for i, png_bytes in enumerate(png_result.png_images):
    extraction_input = ImageExtractionInput(
        png_bytes=png_bytes,
        min_width=50,
        min_height=50,
        flood_fill_threshold=0.2,
        noise_reduction=True
    )
    extraction_result = processor.extract_images_from_png(extraction_input)
    print(f"Page {i+1}: Extracted {extraction_result.total_images} images")
```
### Command Line Interface
```bash
# Convert PDF to PNG images
pdfalchemy to-png document.pdf --output ./images/ --dpi 300
# Convert specific pages (range, list, or single page)
pdfalchemy to-png document.pdf --pages 1-5 --dpi 200
pdfalchemy to-png document.pdf --pages 1,3,5 --dpi 200
pdfalchemy to-png document.pdf --pages 3 --dpi 200
# Convert to base64 for web applications
pdfalchemy to-base64 document.pdf --dpi 200 --output images.json
# Extract individual images from PDF pages
pdfalchemy extract-images document.pdf --output ./extracted/ --min-size 100x100
# Extract images with custom filters
pdfalchemy extract-images document.pdf --min-width 50 --max-width 800 --aspect-ratio 0.5-2.0
# Extract images with advanced options
pdfalchemy extract-images document.pdf \
--output ./extracted/ \
--dpi 300 \
--pages 1-5 \
--min-size 100x100 \
--max-size 800x600 \
--aspect-ratio 0.5-2.0 \
--threshold 0.15 \
--format json \
--summary
# Get help for any command
pdfalchemy extract-images --help
```
## Advanced Usage
### Image Extraction with Custom Filters
```python
from pdfalchemy import PDFProcessor, ImageExtractionInput
processor = PDFProcessor()
# Configure image extraction with specific criteria
# png_bytes is the PNG data for a single page, e.g. one item of
# png_result.png_images from the Quick Start example above.
extraction_input = ImageExtractionInput(
    png_bytes=png_bytes,
    min_width=100,  # Minimum width in pixels
    min_height=100,  # Minimum height in pixels
    max_width=800,  # Maximum width in pixels
    max_height=600,  # Maximum height in pixels
    min_aspect_ratio=0.5,  # Minimum aspect ratio (width/height)
    max_aspect_ratio=2.0,  # Maximum aspect ratio
    flood_fill_threshold=0.15,  # Threshold for flood fill algorithm
    noise_reduction=True,  # Enable noise reduction
    separate_connected_regions=True  # Separate connected regions
)
result = processor.extract_images_from_png(extraction_input)
print(f"Extracted {result.total_images} images")
print(f"Filtered out {result.filtered_count} images")
print(f"Processing time: {result.processing_time_ms:.2f} ms")
```
### Batch Processing
```python
from pathlib import Path
from pdfalchemy import PDFProcessor, PNGConversionInput
processor = PDFProcessor()
pdf_files = Path("./pdfs/").glob("*.pdf")
for pdf_file in pdf_files:
    print(f"Processing {pdf_file}")
    with open(pdf_file, "rb") as f:
        pdf_bytes = f.read()
    png_input = PNGConversionInput(
        pdf_bytes=pdf_bytes,
        dpi=200
    )
    result = processor.to_png(png_input)
    print(f" Converted {result.total_pages} pages")
```
### Base64 Conversion for Web Applications
```python
from pdfalchemy import PDFProcessor, PNGConversionInput
processor = PDFProcessor()
with open("document.pdf", "rb") as f:
    pdf_bytes = f.read()
png_input = PNGConversionInput(
    pdf_bytes=pdf_bytes,
    dpi=200
)
# Get base64 encoded PNG images
base64_images = processor.to_png_base64(png_input)
# Use in web applications
for i, base64_str in enumerate(base64_images):
    html_img_tag = f'<img src="data:image/png;base64,{base64_str}" alt="Page {i+1}">'
    print(html_img_tag)
```
## Command Line Interface
PDFAlchemy provides a powerful command-line interface for batch processing and automation.
### Available Commands
#### `to-png` - Convert PDF to PNG Images
```bash
pdfalchemy to-png <pdf_file> [options]
```
**Options:**
- `--output, -o`: Output directory for PNG files
- `--dpi`: DPI resolution (default: 200, range: 72-1200)
- `--pages`: Page range (e.g., '1-5', '1,3,5', or '3')
**Examples:**
```bash
# Convert all pages
pdfalchemy to-png document.pdf --output ./images/
# Convert specific pages with high resolution
pdfalchemy to-png document.pdf --dpi 300 --pages 1-5 --output ./high_res/
# Convert single page
pdfalchemy to-png document.pdf --pages 3 --output ./single_page/
```
#### `to-base64` - Convert PDF to Base64 Encoded PNG
```bash
pdfalchemy to-base64 <pdf_file> [options]
```
**Options:**
- `--output, -o`: Output file for base64 data (JSON format)
- `--dpi`: DPI resolution (default: 200)
- `--pages`: Page range (e.g., '1-5', '1,3,5', or '3')
**Examples:**
```bash
# Convert to base64 for web applications
pdfalchemy to-base64 document.pdf --dpi 200 --output images.json
# Convert specific pages
pdfalchemy to-base64 document.pdf --pages 1-3 --output selected_pages.json
```
#### `extract-images` - Extract Individual Images from PDF
```bash
pdfalchemy extract-images <pdf_file> [options]
```
**Basic Options:**
- `--output, -o`: Output directory for extracted images
- `--dpi`: DPI resolution for conversion (default: 200)
- `--pages`: Page range (e.g., '1-5', '1,3,5', or '3')
**Size Filtering:**
- `--min-size`: Minimum size in pixels (e.g., '100x100')
- `--max-size`: Maximum size in pixels (e.g., '800x600')
- `--min-width`: Minimum width in pixels
- `--min-height`: Minimum height in pixels
- `--max-width`: Maximum width in pixels
- `--max-height`: Maximum height in pixels
**Advanced Filtering:**
- `--aspect-ratio`: Aspect ratio range (e.g., '0.5-2.0')
- `--threshold`: Flood fill threshold (0.0-1.0, default: 0.1)
- `--no-noise-reduction`: Disable noise reduction
- `--no-separate-regions`: Disable connected region separation
- `--sort-order`: Sort order for extracted images ('top-bottom', 'left-right', 'reading-order', default: 'top-bottom')
**Output Options:**
- `--format`: Output format ('png' or 'json', default: 'png')
- `--summary`: Show detailed extraction summary
**Examples:**
```bash
# Basic image extraction
pdfalchemy extract-images document.pdf --output ./extracted/ --min-size 100x100
# Advanced filtering with custom sort order
pdfalchemy extract-images document.pdf \
--output ./filtered/ \
--min-width 50 \
--max-width 800 \
--aspect-ratio 0.5-2.0 \
--threshold 0.15 \
--sort-order reading-order
# JSON output with summary
pdfalchemy extract-images document.pdf \
--output ./json_output/ \
--format json \
--summary \
--pages 1-5
# High-resolution extraction with custom filters
pdfalchemy extract-images document.pdf \
--dpi 300 \
--output ./high_res_extracted/ \
--min-size 200x200 \
--max-size 1200x800 \
--aspect-ratio 0.8-1.5 \
--threshold 0.2 \
--no-noise-reduction
```
### Page Range Formats
The `--pages` option supports multiple formats:
- **Range**: `1-5` (pages 1 through 5)
- **List**: `1,3,5` (pages 1, 3, and 5)
- **Single**: `3` (page 3 only)
### Output Formats
#### PNG Format
- Saves individual PNG files for each extracted image
- File naming: `page_001_image_001.png`, `page_001_image_002.png`, etc.
- Suitable for visual inspection and further processing
#### JSON Format
- Saves all extracted images as base64-encoded data in a JSON file
- Includes metadata: page number, image index, size in bytes
- Suitable for web applications and programmatic access
### Sort Order Options
The `--sort-order` parameter controls how extracted images are ordered:
- **`top-bottom`** (default): Sort by y-coordinate first (top to bottom), then by x-coordinate (left to right)
- **`left-right`**: Sort by x-coordinate first (left to right), then by y-coordinate (top to bottom)
- **`reading-order`**: Group images by approximate rows and sort each row left-to-right, then sort rows top-to-bottom
### Performance Tips
1. **Use appropriate DPI**: Higher DPI provides better quality but increases processing time
2. **Filter early**: Use size and aspect ratio filters to reduce processing overhead
3. **Batch processing**: Process multiple files in scripts for automation
4. **Memory management**: For large PDFs, consider processing page ranges
## Configuration
PDFAlchemy uses Pydantic models for configuration and validation. All input and output models include comprehensive validation and type checking.
## Data Models
### PNGConversionInput
- `pdf_bytes`: PDF data as byte array
- `dpi`: Resolution in DPI (72-1200, default: 200)
- `first_page`: First page to convert (1-indexed, optional)
- `last_page`: Last page to convert (1-indexed, optional)
### PNGConversionOutput
- `png_images`: List of PNG images as byte arrays
- `total_pages`: Total number of pages converted
- `dpi_used`: DPI used for conversion
- `page_range`: Page range converted (e.g., '1-5')
- `total_size_bytes`: Total size of all PNG images
### ImageExtractionInput
- `png_bytes`: PNG image data as byte array
- `min_width/min_height`: Minimum dimensions for extracted images
- `max_width/max_height`: Maximum dimensions for extracted images
- `min_aspect_ratio/max_aspect_ratio`: Aspect ratio constraints
- `flood_fill_threshold`: Threshold for flood fill algorithm (0.0-1.0)
- `noise_reduction`: Enable noise reduction
- `separate_connected_regions`: Attempt to separate connected regions
- `sort_order`: Sort order for extracted images ('top-bottom', 'left-right', 'reading-order')
### ImageExtractionOutput
- `extracted_images`: List of base64 encoded extracted images
- `total_images`: Total number of extracted images
- `filtered_count`: Number of images filtered out
- `processing_time_ms`: Processing time in milliseconds
- `total_size_bytes`: Total size of all extracted images
## Development
### Setup Development Environment
```bash
git clone https://github.com/jainparul9814/pdfalchemy.git
cd pdfalchemy
# Install all dependencies
pip install -r requirements-dev.txt
# Install in development mode
pip install -e .
# Setup pre-commit hooks
pre-commit install
```
### Running Tests
```bash
# Run all tests
pytest
# Run core tests with verbose output
pytest tests/test_core.py -v
# Run with coverage
pytest --cov=src.pdfalchemy.core --cov-report=term-missing
```
### Code Quality
```bash
# Format code
black src/ tests/
# Sort imports
isort src/ tests/
# Type checking
mypy src/
# Linting
flake8 src/ tests/
```
### Sample Scripts
Check the `sample_test_scripts/` directory for working examples:
```bash
python sample_test_scripts/test_image_extraction.py
```
### Building and Publishing
```bash
# Clean previous builds
rm -rf dist/ build/ *.egg-info
# Build the package
python -m build
# Upload to PyPI
python -m twine upload dist/*
```
**Note**: Make sure you have the required build tools installed:
```bash
pip install build twine
```
## Dependencies
### Core Dependencies
- `pydantic>=2.0.0`: Data validation and settings management
- `pdf2image>=1.16.0`: PDF to image conversion
- `opencv-python>=4.8.0`: Computer vision for image processing
- `Pillow>=9.0.0`: Image processing
- `numpy>=1.21.0`: Numerical computing
### Development Dependencies
- `pytest>=7.0.0`: Testing framework
- `black>=23.0.0`: Code formatting
- `isort>=5.12.0`: Import sorting
- `flake8>=6.0.0`: Linting
- `mypy>=1.0.0`: Type checking
## Contributing
We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details.
1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request
## License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
## Support
- **Issues**: [GitHub Issues](https://github.com/jainparul9814/pdfalchemy/issues)
- **Author**: Parul Jain (jainparul9814@gmail.com)
## Changelog
See [CHANGELOG.md](CHANGELOG.md) for a list of changes and version history.
Raw data
{
"_id": null,
"home_page": "https://github.com/jainparul9814/pdfalchemy",
"name": "pdfalchemy",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.9",
"maintainer_email": null,
"keywords": "pdf, document, processing, extraction, manipulation",
"author": "Parul Jain",
"author_email": "jainparul9814@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/49/cc/d7d3472741d141f41164a7684ec0e39b5d9d6960e84c07b8d6a7ea63ef76/pdfalchemy-0.1.0.tar.gz",
"platform": null,
"description": "# PDFAlchemy\n\n[](https://badge.fury.io/py/pdfalchemy)\n[](https://www.python.org/downloads/)\n[](https://opensource.org/licenses/MIT)\n[](https://github.com/psf/black)\n\nA powerful Python library for advanced PDF processing with focus on image extraction and conversion capabilities.\n\n## Features\n\n- **PDF to PNG Conversion**: Convert PDF pages to high-quality PNG images with customizable DPI\n- **Image Extraction**: Extract individual images from PDF pages using advanced computer vision algorithms\n- **Flood Fill Algorithm**: Intelligent image detection and separation using morphological operations\n- **Size and Aspect Ratio Filtering**: Filter extracted images based on customizable criteria\n- **Base64 Encoding**: Convert PDF pages to base64-encoded PNG strings for web applications\n- **Command Line Interface**: Easy-to-use CLI for batch processing and automation\n- **Type Safety**: Full type hints and Pydantic validation for robust data handling\n- **Comprehensive Testing**: Extensive test suite with 28+ test cases\n\n## Installation\n\n### From PyPI (Recommended)\n\n```bash\npip install pdfalchemy\n```\n\n### From Source\n\n```bash\ngit clone https://github.com/jainparul9814/pdfalchemy.git\ncd pdfalchemy\n\n# Install base dependencies\npip install -r requirements.txt\n\n# Install in development mode\npip install -e .\n\n# Install development dependencies (optional)\npip install -r requirements-dev.txt\n```\n\n## Quick Start\n\n### Python API\n\n```python\nfrom pdfalchemy import PDFProcessor, PNGConversionInput, ImageExtractionInput\n\n# Initialize processor\nprocessor = PDFProcessor()\n\n# Convert PDF to PNG\nwith open(\"document.pdf\", \"rb\") as f:\n pdf_bytes = f.read()\n\npng_input = PNGConversionInput(\n pdf_bytes=pdf_bytes,\n dpi=300, # High resolution\n first_page=1,\n last_page=5\n)\n\npng_result = processor.to_png(png_input)\nprint(f\"Converted {png_result.total_pages} pages\")\n\n# Extract images from PNG\nfor i, png_bytes in 
enumerate(png_result.png_images):\n extraction_input = ImageExtractionInput(\n png_bytes=png_bytes,\n min_width=50,\n min_height=50,\n flood_fill_threshold=0.2,\n noise_reduction=True\n )\n \n extraction_result = processor.extract_images_from_png(extraction_input)\n print(f\"Page {i+1}: Extracted {extraction_result.total_images} images\")\n```\n\n### Command Line Interface\n\n```bash\n# Convert PDF to PNG images\npdfalchemy to-png document.pdf --output ./images/ --dpi 300\n\n# Convert specific pages (range, list, or single page)\npdfalchemy to-png document.pdf --pages 1-5 --dpi 200\npdfalchemy to-png document.pdf --pages 1,3,5 --dpi 200\npdfalchemy to-png document.pdf --pages 3 --dpi 200\n\n# Convert to base64 for web applications\npdfalchemy to-base64 document.pdf --dpi 200 --output images.json\n\n# Extract individual images from PDF pages\npdfalchemy extract-images document.pdf --output ./extracted/ --min-size 100x100\n\n# Extract images with custom filters\npdfalchemy extract-images document.pdf --min-width 50 --max-width 800 --aspect-ratio 0.5-2.0\n\n# Extract images with advanced options\npdfalchemy extract-images document.pdf \\\n --output ./extracted/ \\\n --dpi 300 \\\n --pages 1-5 \\\n --min-size 100x100 \\\n --max-size 800x600 \\\n --aspect-ratio 0.5-2.0 \\\n --threshold 0.15 \\\n --format json \\\n --summary\n\n# Get help for any command\npdfalchemy extract-images --help\n```\n\n## Advanced Usage\n\n### Image Extraction with Custom Filters\n\n```python\nfrom pdfalchemy import PDFProcessor, ImageExtractionInput\n\nprocessor = PDFProcessor()\n\n# Configure image extraction with specific criteria\nextraction_input = ImageExtractionInput(\n png_bytes=png_bytes,\n min_width=100, # Minimum width in pixels\n min_height=100, # Minimum height in pixels\n max_width=800, # Maximum width in pixels\n max_height=600, # Maximum height in pixels\n min_aspect_ratio=0.5, # Minimum aspect ratio (width/height)\n max_aspect_ratio=2.0, # Maximum aspect ratio\n 
flood_fill_threshold=0.15, # Threshold for flood fill algorithm\n noise_reduction=True, # Enable noise reduction\n separate_connected_regions=True # Separate connected regions\n)\n\nresult = processor.extract_images_from_png(extraction_input)\nprint(f\"Extracted {result.total_images} images\")\nprint(f\"Filtered out {result.filtered_count} images\")\nprint(f\"Processing time: {result.processing_time_ms:.2f} ms\")\n```\n\n### Batch Processing\n\n```python\nfrom pathlib import Path\nfrom pdfalchemy import PDFProcessor, PNGConversionInput\n\nprocessor = PDFProcessor()\npdf_files = Path(\"./pdfs/\").glob(\"*.pdf\")\n\nfor pdf_file in pdf_files:\n print(f\"Processing {pdf_file}\")\n \n with open(pdf_file, \"rb\") as f:\n pdf_bytes = f.read()\n \n png_input = PNGConversionInput(\n pdf_bytes=pdf_bytes,\n dpi=200\n )\n \n result = processor.to_png(png_input)\n print(f\" Converted {result.total_pages} pages\")\n```\n\n### Base64 Conversion for Web Applications\n\n```python\nfrom pdfalchemy import PDFProcessor, PNGConversionInput\n\nprocessor = PDFProcessor()\n\nwith open(\"document.pdf\", \"rb\") as f:\n pdf_bytes = f.read()\n\npng_input = PNGConversionInput(\n pdf_bytes=pdf_bytes,\n dpi=200\n)\n\n# Get base64 encoded PNG images\nbase64_images = processor.to_png_base64(png_input)\n\n# Use in web applications\nfor i, base64_str in enumerate(base64_images):\n html_img_tag = f'<img src=\"data:image/png;base64,{base64_str}\" alt=\"Page {i+1}\">'\n print(html_img_tag)\n```\n\n## Command Line Interface\n\nPDFAlchemy provides a powerful command-line interface for batch processing and automation.\n\n### Available Commands\n\n#### `to-png` - Convert PDF to PNG Images\n```bash\npdfalchemy to-png <pdf_file> [options]\n```\n\n**Options:**\n- `--output, -o`: Output directory for PNG files\n- `--dpi`: DPI resolution (default: 200, range: 72-1200)\n- `--pages`: Page range (e.g., '1-5', '1,3,5', or '3')\n\n**Examples:**\n```bash\n# Convert all pages\npdfalchemy to-png document.pdf --output 
./images/\n\n# Convert specific pages with high resolution\npdfalchemy to-png document.pdf --dpi 300 --pages 1-5 --output ./high_res/\n\n# Convert single page\npdfalchemy to-png document.pdf --pages 3 --output ./single_page/\n```\n\n#### `to-base64` - Convert PDF to Base64 Encoded PNG\n```bash\npdfalchemy to-base64 <pdf_file> [options]\n```\n\n**Options:**\n- `--output, -o`: Output file for base64 data (JSON format)\n- `--dpi`: DPI resolution (default: 200)\n- `--pages`: Page range (e.g., '1-5', '1,3,5', or '3')\n\n**Examples:**\n```bash\n# Convert to base64 for web applications\npdfalchemy to-base64 document.pdf --dpi 200 --output images.json\n\n# Convert specific pages\npdfalchemy to-base64 document.pdf --pages 1-3 --output selected_pages.json\n```\n\n#### `extract-images` - Extract Individual Images from PDF\n```bash\npdfalchemy extract-images <pdf_file> [options]\n```\n\n**Basic Options:**\n- `--output, -o`: Output directory for extracted images\n- `--dpi`: DPI resolution for conversion (default: 200)\n- `--pages`: Page range (e.g., '1-5', '1,3,5', or '3')\n\n**Size Filtering:**\n- `--min-size`: Minimum size in pixels (e.g., '100x100')\n- `--max-size`: Maximum size in pixels (e.g., '800x600')\n- `--min-width`: Minimum width in pixels\n- `--min-height`: Minimum height in pixels\n- `--max-width`: Maximum width in pixels\n- `--max-height`: Maximum height in pixels\n\n**Advanced Filtering:**\n- `--aspect-ratio`: Aspect ratio range (e.g., '0.5-2.0')\n- `--threshold`: Flood fill threshold (0.0-1.0, default: 0.1)\n- `--no-noise-reduction`: Disable noise reduction\n- `--no-separate-regions`: Disable connected region separation\n- `--sort-order`: Sort order for extracted images ('top-bottom', 'left-right', 'reading-order', default: 'top-bottom')\n\n**Output Options:**\n- `--format`: Output format ('png' or 'json', default: 'png')\n- `--summary`: Show detailed extraction summary\n\n**Examples:**\n```bash\n# Basic image extraction\npdfalchemy extract-images document.pdf 
--output ./extracted/ --min-size 100x100\n\n# Advanced filtering with custom sort order\npdfalchemy extract-images document.pdf \\\n --output ./filtered/ \\\n --min-width 50 \\\n --max-width 800 \\\n --aspect-ratio 0.5-2.0 \\\n --threshold 0.15 \\\n --sort-order reading-order\n\n# JSON output with summary\npdfalchemy extract-images document.pdf \\\n --output ./json_output/ \\\n --format json \\\n --summary \\\n --pages 1-5\n\n# High-resolution extraction with custom filters\npdfalchemy extract-images document.pdf \\\n --dpi 300 \\\n --output ./high_res_extracted/ \\\n --min-size 200x200 \\\n --max-size 1200x800 \\\n --aspect-ratio 0.8-1.5 \\\n --threshold 0.2 \\\n --no-noise-reduction\n```\n\n### Page Range Formats\n\nThe `--pages` option supports multiple formats:\n- **Range**: `1-5` (pages 1 through 5)\n- **List**: `1,3,5` (pages 1, 3, and 5)\n- **Single**: `3` (page 3 only)\n\n### Output Formats\n\n#### PNG Format\n- Saves individual PNG files for each extracted image\n- File naming: `page_001_image_001.png`, `page_001_image_002.png`, etc.\n- Suitable for visual inspection and further processing\n\n#### JSON Format\n- Saves all extracted images as base64-encoded data in a JSON file\n- Includes metadata: page number, image index, size in bytes\n- Suitable for web applications and programmatic access\n\n### Sort Order Options\n\nThe `--sort-order` parameter controls how extracted images are ordered:\n\n- **`top-bottom`** (default): Sort by y-coordinate first (top to bottom), then by x-coordinate (left to right)\n- **`left-right`**: Sort by x-coordinate first (left to right), then by y-coordinate (top to bottom)\n- **`reading-order`**: Group images by approximate rows and sort each row left-to-right, then sort rows top-to-bottom\n\n### Performance Tips\n\n1. **Use appropriate DPI**: Higher DPI provides better quality but increases processing time\n2. **Filter early**: Use size and aspect ratio filters to reduce processing overhead\n3. 
**Batch processing**: Process multiple files in scripts for automation\n4. **Memory management**: For large PDFs, consider processing page ranges\n\n## Configuration\n\nPDFAlchemy uses Pydantic models for configuration and validation. All input and output models include comprehensive validation and type checking.\n\n## Data Models\n\n### PNGConversionInput\n- `pdf_bytes`: PDF data as byte array\n- `dpi`: Resolution in DPI (72-1200, default: 200)\n- `first_page`: First page to convert (1-indexed, optional)\n- `last_page`: Last page to convert (1-indexed, optional)\n\n### PNGConversionOutput\n- `png_images`: List of PNG images as byte arrays\n- `total_pages`: Total number of pages converted\n- `dpi_used`: DPI used for conversion\n- `page_range`: Page range converted (e.g., '1-5')\n- `total_size_bytes`: Total size of all PNG images\n\n### ImageExtractionInput\n- `png_bytes`: PNG image data as byte array\n- `min_width/min_height`: Minimum dimensions for extracted images\n- `max_width/max_height`: Maximum dimensions for extracted images\n- `min_aspect_ratio/max_aspect_ratio`: Aspect ratio constraints\n- `flood_fill_threshold`: Threshold for flood fill algorithm (0.0-1.0)\n- `noise_reduction`: Enable noise reduction\n- `separate_connected_regions`: Attempt to separate connected regions\n- `sort_order`: Sort order for extracted images ('top-bottom', 'left-right', 'reading-order')\n\n### ImageExtractionOutput\n- `extracted_images`: List of base64 encoded extracted images\n- `total_images`: Total number of extracted images\n- `filtered_count`: Number of images filtered out\n- `processing_time_ms`: Processing time in milliseconds\n- `total_size_bytes`: Total size of all extracted images\n\n## Development\n\n### Setup Development Environment\n\n```bash\ngit clone https://github.com/jainparul9814/pdfalchemy.git\ncd pdfalchemy\n\n# Install all dependencies\npip install -r requirements-dev.txt\n\n# Install in development mode\npip install -e .\n\n# Setup pre-commit 
hooks\npre-commit install\n```\n\n### Running Tests\n\n```bash\n# Run all tests\npytest\n\n# Run core tests with verbose output\npytest tests/test_core.py -v\n\n# Run with coverage\npytest --cov=src.pdfalchemy.core --cov-report=term-missing\n```\n\n### Code Quality\n\n```bash\n# Format code\nblack src/ tests/\n\n# Sort imports\nisort src/ tests/\n\n# Type checking\nmypy src/\n\n# Linting\nflake8 src/ tests/\n```\n\n### Sample Scripts\n\nCheck the `sample_test_scripts/` directory for working examples:\n\n```bash\npython sample_test_scripts/test_image_extraction.py\n```\n\n### Building and Publishing\n\n```bash\n# Clean previous builds\nrm -rf dist/ build/ *.egg-info\n\n# Build the package\npython -m build\n\n# Upload to PyPI\npython -m twine upload dist/*\n```\n\n**Note**: Make sure you have the required build tools installed:\n```bash\npip install build twine\n```\n\n## Dependencies\n\n### Core Dependencies\n- `pydantic>=2.0.0`: Data validation and settings management\n- `pdf2image>=1.16.0`: PDF to image conversion\n- `opencv-python>=4.8.0`: Computer vision for image processing\n- `Pillow>=9.0.0`: Image processing\n- `numpy>=1.21.0`: Numerical computing\n\n### Development Dependencies\n- `pytest>=7.0.0`: Testing framework\n- `black>=23.0.0`: Code formatting\n- `isort>=5.12.0`: Import sorting\n- `flake8>=6.0.0`: Linting\n- `mypy>=1.0.0`: Type checking\n\n## Contributing\n\nWe welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details.\n\n1. Fork the repository\n2. Create a feature branch (`git checkout -b feature/amazing-feature`)\n3. Commit your changes (`git commit -m 'Add amazing feature'`)\n4. Push to the branch (`git push origin feature/amazing-feature`)\n5. 
Open a Pull Request\n\n## License\n\nThis project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.\n\n## Support\n\n- **Issues**: [GitHub Issues](https://github.com/jainparul9814/pdfalchemy/issues)\n- **Author**: Parul Jain (jainparul9814@gmail.com)\n\n## Changelog\n\nSee [CHANGELOG.md](CHANGELOG.md) for a list of changes and version history. \n",
"bugtrack_url": null,
"license": "MIT",
"summary": "A Python library for advanced PDF manipulation and processing",
"version": "0.1.0",
"project_urls": {
"Bug Tracker": "https://github.com/jainparul9814/pdfalchemy/issues",
"Documentation": "https://pdfalchemy.readthedocs.io/",
"Homepage": "https://github.com/jainparul9814/pdfalchemy",
"Repository": "https://github.com/jainparul9814/pdfalchemy"
},
"split_keywords": [
"pdf",
" document",
" processing",
" extraction",
" manipulation"
],
"urls": [
{
"comment_text": null,
"digests": {
"blake2b_256": "64e64845fb7d2a2e7efa7e6d2e6beaee09984e54f32db0908b39cfcc2eeff793",
"md5": "a6418cba132400461975f71c560a476e",
"sha256": "c7fe4b8ae3c3ed14b7513aabfd9699d5cb59905e2c6611bc94fb22c4851b28ef"
},
"downloads": -1,
"filename": "pdfalchemy-0.1.0-py3-none-any.whl",
"has_sig": false,
"md5_digest": "a6418cba132400461975f71c560a476e",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.9",
"size": 15933,
"upload_time": "2025-07-19T13:36:19",
"upload_time_iso_8601": "2025-07-19T13:36:19.309356Z",
"url": "https://files.pythonhosted.org/packages/64/e6/4845fb7d2a2e7efa7e6d2e6beaee09984e54f32db0908b39cfcc2eeff793/pdfalchemy-0.1.0-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": null,
"digests": {
"blake2b_256": "49ccd7d3472741d141f41164a7684ec0e39b5d9d6960e84c07b8d6a7ea63ef76",
"md5": "0c63ec51fb5d09974232594da4346378",
"sha256": "c73bbec1a238e224728270bb64ca3f4e07c9acebb86656bee8d87e35b3528cf5"
},
"downloads": -1,
"filename": "pdfalchemy-0.1.0.tar.gz",
"has_sig": false,
"md5_digest": "0c63ec51fb5d09974232594da4346378",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.9",
"size": 22812,
"upload_time": "2025-07-19T13:36:21",
"upload_time_iso_8601": "2025-07-19T13:36:21.697350Z",
"url": "https://files.pythonhosted.org/packages/49/cc/d7d3472741d141f41164a7684ec0e39b5d9d6960e84c07b8d6a7ea63ef76/pdfalchemy-0.1.0.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2025-07-19 13:36:21",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "jainparul9814",
"github_project": "pdfalchemy",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "pydantic",
"specs": [
[
">=",
"2.0.0"
]
]
},
{
"name": "pdf2image",
"specs": [
[
">=",
"1.16.0"
]
]
},
{
"name": "Pillow",
"specs": [
[
">=",
"9.0.0"
]
]
},
{
"name": "numpy",
"specs": [
[
">=",
"1.21.0"
]
]
},
{
"name": "opencv-python",
"specs": [
[
">=",
"4.8.0"
]
]
},
{
"name": "pytest",
"specs": [
[
">=",
"7.0.0"
]
]
},
{
"name": "pytest-asyncio",
"specs": [
[
">=",
"0.21.0"
]
]
},
{
"name": "pytest-cov",
"specs": [
[
">=",
"4.0.0"
]
]
},
{
"name": "black",
"specs": [
[
">=",
"23.0.0"
]
]
},
{
"name": "isort",
"specs": [
[
">=",
"5.12.0"
]
]
},
{
"name": "flake8",
"specs": [
[
">=",
"6.0.0"
]
]
},
{
"name": "mypy",
"specs": [
[
">=",
"1.0.0"
]
]
},
{
"name": "pre-commit",
"specs": [
[
">=",
"3.0.0"
]
]
}
],
"lcname": "pdfalchemy"
}