# # Copyright (c) 2024 Odos Matthews
# #
# # Permission is hereby granted, free of charge, to any person obtaining a copy
# # of this software and associated documentation files (the "Software"), to deal
# # in the Software without restriction, including without limitation the rights
# # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# # copies of the Software, and to permit persons to whom the Software is
# # furnished to do so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# SparkForge
A production-ready PySpark + Delta Lake pipeline engine with the Medallion Architecture (Bronze → Silver → Gold). Build scalable data pipelines with built-in parallel execution, comprehensive validation, and enterprise-grade monitoring.
[](https://sparkforge.readthedocs.io/)
[](https://badge.fury.io/py/sparkforge)
[](https://www.python.org/downloads/)
[](https://opensource.org/licenses/MIT)
## 🚀 Quick Start
### Installation
```bash
pip install sparkforge
```
### Minimal Example (3 lines!)
```python
from sparkforge import PipelineBuilder
from pyspark.sql import SparkSession
from pyspark.sql import functions as F  # needed for the validation rules below

spark = SparkSession.builder.appName("MyPipeline").getOrCreate()
builder = PipelineBuilder(spark=spark, schema="my_schema")
# Bronze → Silver → Gold pipeline
pipeline = (builder
.with_bronze_rules(name="events", rules={"user_id": [F.col("user_id").isNotNull()]})
.add_silver_transform(name="clean_events", source_bronze="events",
transform=lambda spark, df, silvers: df.filter(F.col("status") == "active"),
rules={"status": [F.col("status").isNotNull()]}, table_name="clean_events")
.add_gold_transform(name="daily_metrics", transform=lambda spark, silvers:
list(silvers.values())[0].groupBy("date").count(),
rules={"date": [F.col("date").isNotNull()]}, table_name="daily_metrics")
.to_pipeline()
)
result = pipeline.initial_load(bronze_sources={"events": source_df})
```
## 📚 Feature Examples
### Core Features
- **[Hello World](examples/core/hello_world.py)** - Absolute simplest pipeline
- **[Basic Pipeline](examples/core/basic_pipeline.py)** - Standard Bronze → Silver → Gold flow
- **[Step-by-Step Execution](examples/core/step_by_step_execution.py)** - Debug individual steps
### Advanced Features
- **[Multi-Schema Support](examples/advanced/multi_schema_pipeline.py)** - Cross-schema data flows
- **[Dynamic Parallel Execution](examples/advanced/dynamic_parallel_execution.py)** - Advanced parallel processing
- **[Auto-Inference](examples/advanced/auto_infer_source_bronze_simple.py)** - Automatic dependency detection
- **[Column Filtering](examples/specialized/column_filtering_behavior.py)** - Control column preservation
### Use Case Examples
- **[E-commerce Analytics](examples/usecases/ecommerce_analytics.py)** - Order processing, customer analytics
- **[IoT Sensor Pipeline](examples/usecases/iot_sensor_pipeline.py)** - Real-time sensor data processing
- **[Step-by-Step Debugging](examples/usecases/step_by_step_debugging.py)** - Advanced debugging techniques
### Specialized Examples
- **[Bronze Without Datetime](examples/specialized/bronze_no_datetime_example.py)** - Full refresh pipelines
- **[Improved UX](examples/advanced/improved_user_experience.py)** - Enhanced user experience features
### 📖 [Complete Examples Guide](examples/README.md) - Organized by feature categories with learning paths
## 🎯 Key Features
- **🏗️ Medallion Architecture**: Bronze → Silver → Gold data layering with automatic dependency management
- **⚡ Advanced Parallel Execution**: Dynamic worker allocation, intelligent task prioritization, and adaptive optimization
- **🎯 Auto-Inference**: Automatically infers source dependencies, reducing boilerplate by 70%
- **🛠️ Preset Configurations**: One-line setup for development, production, and testing environments
- **🔧 Validation Helpers**: Built-in methods for common validation patterns (not_null, positive_numbers, etc.)
- **📊 Smart Detection**: Automatic timestamp column detection for watermarking
- **🏢 Multi-Schema Support**: Cross-schema data flows for multi-tenant, environment separation, and compliance
- **🔍 Step-by-Step Debugging**: Execute individual pipeline steps independently for troubleshooting
- **✅ Enhanced Data Validation**: Configurable validation thresholds with automatic security validation and performance caching
- **🎛️ Column Filtering Control**: Explicit control over which columns are preserved after validation
- **🔄 Incremental Processing**: Watermarking and incremental updates with Delta Lake
- **💧 Delta Lake Integration**: Full support for ACID transactions, time travel, and schema evolution
## 🛠️ Installation
### Prerequisites
- Python 3.8+
- Java 8+ (for PySpark 3.2.4)
- PySpark 3.2.4+
- Delta Lake 1.2.0+
### Install from PyPI
```bash
pip install sparkforge
```
### Install from Source
```bash
git clone https://github.com/eddiethedean/sparkforge.git
cd sparkforge
pip install -e .
```
### Verify Installation
```python
import sparkforge
print(f"SparkForge version: {sparkforge.__version__}")
```
## 📖 Documentation
**📖 [Complete Documentation](https://sparkforge.readthedocs.io/)** - Professional documentation with search, navigation, and examples
### Quick Links
- **[5-Minute Quick Start](https://sparkforge.readthedocs.io/en/latest/quick_start_5_min.html)** - Get running in under 5 minutes ⭐ **START HERE**
- **[User Guide](https://sparkforge.readthedocs.io/en/latest/user_guide.html)** - Complete guide to all features
- **[API Reference](https://sparkforge.readthedocs.io/en/latest/api_reference.html)** - Complete API documentation
- **[Troubleshooting](https://sparkforge.readthedocs.io/en/latest/troubleshooting.html)** - Common issues and solutions
### Use Case Guides
- **[E-commerce Analytics](https://sparkforge.readthedocs.io/en/latest/usecase_ecommerce.html)** - Order processing, customer analytics
- **[IoT Data Processing](https://sparkforge.readthedocs.io/en/latest/usecase_iot.html)** - Sensor data, anomaly detection
- **[Business Intelligence](https://sparkforge.readthedocs.io/en/latest/usecase_bi.html)** - Dashboards, KPIs, reporting
## 🧪 Testing
Run the comprehensive test suite with 500+ tests:
```bash
# Fast parallel tests (recommended for development)
python tests/run_tests_parallel.py --workers 4
# Run all tests with coverage
pytest --cov=sparkforge --cov-report=html
# Run specific test categories
pytest -m "not slow" # Skip slow tests
pytest -m "delta" # Delta Lake tests only
pytest tests/test_integration_*.py # Integration tests only
```
**Performance Benefits:**
- **4x speedup** for core tests (22s vs 2+ minutes)
- **Smart categorization** of parallel vs sequential tests
- **Zero failures** with reliable parallel execution
- **Optimized test suite** with no duplicate tests
## 🚀 Production Deployment
### Databricks
```python
from sparkforge import PipelineBuilder
# Spark session is automatically available
builder = PipelineBuilder(
spark=spark,
schema="production_schema",
min_bronze_rate=99.0,
min_silver_rate=95.0,
min_gold_rate=90.0,
enable_parallel_silver=True,
max_parallel_workers=8,
verbose=True
)
```
### AWS EMR / Azure Synapse
```python
from sparkforge import PipelineBuilder
# Configure for cloud storage
builder = PipelineBuilder(spark=spark, schema="my_schema")
pipeline = builder.to_pipeline()
result = pipeline.run_incremental(bronze_sources={"events": source_df})
```
## 🤝 Contributing
We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details.
### Quick Start for Contributors
1. **Fork the repository**
2. **Clone your fork**: `git clone https://github.com/YOUR_USERNAME/sparkforge.git`
3. **Install in development mode**: `pip install -e .`
4. **Run fast parallel tests**: `python tests/run_tests_parallel.py --workers 4`
5. **Create a feature branch**: `git checkout -b feature/amazing-feature`
6. **Make your changes and add tests**
7. **Run tests**: `python tests/run_tests_parallel.py --workers 4`
8. **Submit a pull request**
## 📝 License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
## 🏆 Acknowledgments
- Built on top of [Apache Spark](https://spark.apache.org/)
- Powered by [Delta Lake](https://delta.io/)
- Inspired by the Medallion Architecture pattern
- Thanks to the PySpark and Delta Lake communities
---
**Made with ❤️ for the data engineering community**
Raw data
{
"_id": null,
"home_page": "https://github.com/eddiethedean/sparkforge",
"name": "sparkforge",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.8",
"maintainer_email": "Odos Matthews <odosmatthews@gmail.com>",
"keywords": "spark, databricks, pipeline, etl, data-engineering, data-lakehouse, bronze-silver-gold, delta-lake, big-data",
"author": "Odos Matthews",
"author_email": "Odos Matthews <odosmatthews@gmail.com>",
"download_url": "https://files.pythonhosted.org/packages/d6/97/d6bea8385a7c2904744e4dd6179551d16909874e020f30ab6a0950fdbd88/sparkforge-0.4.5.tar.gz",
"platform": null,
"description": "# # Copyright (c) 2024 Odos Matthews\n# #\n# # Permission is hereby granted, free of charge, to any person obtaining a copy\n# # of this software and associated documentation files (the \"Software\"), to deal\n# # in the Software without restriction, including without limitation the rights\n# # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# # copies of the Software, and to permit persons to whom the Software is\n# # furnished to do so, subject to the following conditions:\n# #\n# # The above copyright notice and this permission notice shall be included in all\n# # copies or substantial portions of the Software.\n# #\n# # THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# # SOFTWARE.\n\n# SparkForge\n\nA production-ready PySpark + Delta Lake pipeline engine with the Medallion Architecture (Bronze \u2192 Silver \u2192 Gold). 
Build scalable data pipelines with built-in parallel execution, comprehensive validation, and enterprise-grade monitoring.\n\n[](https://sparkforge.readthedocs.io/)\n[](https://badge.fury.io/py/sparkforge)\n[](https://www.python.org/downloads/)\n[](https://opensource.org/licenses/MIT)\n\n## \ud83d\ude80 Quick Start\n\n### Installation\n```bash\npip install sparkforge\n```\n\n### Minimal Example (3 lines!)\n```python\nfrom sparkforge import PipelineBuilder\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.appName(\"MyPipeline\").getOrCreate()\nbuilder = PipelineBuilder(spark=spark, schema=\"my_schema\")\n\n# Bronze \u2192 Silver \u2192 Gold pipeline\npipeline = (builder\n .with_bronze_rules(name=\"events\", rules={\"user_id\": [F.col(\"user_id\").isNotNull()]})\n .add_silver_transform(name=\"clean_events\", source_bronze=\"events\",\n transform=lambda spark, df, silvers: df.filter(F.col(\"status\") == \"active\"),\n rules={\"status\": [F.col(\"status\").isNotNull()]}, table_name=\"clean_events\")\n .add_gold_transform(name=\"daily_metrics\", transform=lambda spark, silvers:\n list(silvers.values())[0].groupBy(\"date\").count(),\n rules={\"date\": [F.col(\"date\").isNotNull()]}, table_name=\"daily_metrics\")\n .to_pipeline()\n)\n\nresult = pipeline.initial_load(bronze_sources={\"events\": source_df})\n```\n\n## \ud83d\udcda Feature Examples\n\n### Core Features\n- **[Hello World](examples/core/hello_world.py)** - Absolute simplest pipeline\n- **[Basic Pipeline](examples/core/basic_pipeline.py)** - Standard Bronze \u2192 Silver \u2192 Gold flow\n- **[Step-by-Step Execution](examples/core/step_by_step_execution.py)** - Debug individual steps\n\n### Advanced Features\n- **[Multi-Schema Support](examples/advanced/multi_schema_pipeline.py)** - Cross-schema data flows\n- **[Dynamic Parallel Execution](examples/advanced/dynamic_parallel_execution.py)** - Advanced parallel processing\n- 
**[Auto-Inference](examples/advanced/auto_infer_source_bronze_simple.py)** - Automatic dependency detection\n- **[Column Filtering](examples/specialized/column_filtering_behavior.py)** - Control column preservation\n\n### Use Case Examples\n- **[E-commerce Analytics](examples/usecases/ecommerce_analytics.py)** - Order processing, customer analytics\n- **[IoT Sensor Pipeline](examples/usecases/iot_sensor_pipeline.py)** - Real-time sensor data processing\n- **[Step-by-Step Debugging](examples/usecases/step_by_step_debugging.py)** - Advanced debugging techniques\n\n### Specialized Examples\n- **[Bronze Without Datetime](examples/specialized/bronze_no_datetime_example.py)** - Full refresh pipelines\n- **[Improved UX](examples/advanced/improved_user_experience.py)** - Enhanced user experience features\n\n### \ud83d\udcd6 [Complete Examples Guide](examples/README.md) - Organized by feature categories with learning paths\n\n## \ud83c\udfaf Key Features\n\n- **\ud83c\udfd7\ufe0f Medallion Architecture**: Bronze \u2192 Silver \u2192 Gold data layering with automatic dependency management\n- **\u26a1 Advanced Parallel Execution**: Dynamic worker allocation, intelligent task prioritization, and adaptive optimization\n- **\ud83c\udfaf Auto-Inference**: Automatically infers source dependencies, reducing boilerplate by 70%\n- **\ud83d\udee0\ufe0f Preset Configurations**: One-line setup for development, production, and testing environments\n- **\ud83d\udd27 Validation Helpers**: Built-in methods for common validation patterns (not_null, positive_numbers, etc.)\n- **\ud83d\udcca Smart Detection**: Automatic timestamp column detection for watermarking\n- **\ud83c\udfe2 Multi-Schema Support**: Cross-schema data flows for multi-tenant, environment separation, and compliance\n- **\ud83d\udd0d Step-by-Step Debugging**: Execute individual pipeline steps independently for troubleshooting\n- **\u2705 Enhanced Data Validation**: Configurable validation thresholds with automatic security 
validation and performance caching\n- **\ud83c\udf9b\ufe0f Column Filtering Control**: Explicit control over which columns are preserved after validation\n- **\ud83d\udd04 Incremental Processing**: Watermarking and incremental updates with Delta Lake\n- **\ud83d\udca7 Delta Lake Integration**: Full support for ACID transactions, time travel, and schema evolution\n\n## \ud83d\udee0\ufe0f Installation\n\n### Prerequisites\n- Python 3.8+\n- Java 8+ (for PySpark 3.2.4)\n- PySpark 3.2.4+\n- Delta Lake 1.2.0+\n\n### Install from PyPI\n```bash\npip install sparkforge\n```\n\n### Install from Source\n```bash\ngit clone https://github.com/eddiethedean/sparkforge.git\ncd sparkforge\npip install -e .\n```\n\n### Verify Installation\n```python\nimport sparkforge\nprint(f\"SparkForge version: {sparkforge.__version__}\")\n```\n\n## \ud83d\udcd6 Documentation\n\n**\ud83d\udcd6 [Complete Documentation](https://sparkforge.readthedocs.io/)** - Professional documentation with search, navigation, and examples\n\n### Quick Links\n- **[5-Minute Quick Start](https://sparkforge.readthedocs.io/en/latest/quick_start_5_min.html)** - Get running in under 5 minutes \u2b50 **START HERE**\n- **[User Guide](https://sparkforge.readthedocs.io/en/latest/user_guide.html)** - Complete guide to all features\n- **[API Reference](https://sparkforge.readthedocs.io/en/latest/api_reference.html)** - Complete API documentation\n- **[Troubleshooting](https://sparkforge.readthedocs.io/en/latest/troubleshooting.html)** - Common issues and solutions\n\n### Use Case Guides\n- **[E-commerce Analytics](https://sparkforge.readthedocs.io/en/latest/usecase_ecommerce.html)** - Order processing, customer analytics\n- **[IoT Data Processing](https://sparkforge.readthedocs.io/en/latest/usecase_iot.html)** - Sensor data, anomaly detection\n- **[Business Intelligence](https://sparkforge.readthedocs.io/en/latest/usecase_bi.html)** - Dashboards, KPIs, reporting\n\n## \ud83e\uddea Testing\n\nRun the comprehensive test suite 
with 500+ tests:\n\n```bash\n# Fast parallel tests (recommended for development)\npython tests/run_tests_parallel.py --workers 4\n\n# Run all tests with coverage\npytest --cov=sparkforge --cov-report=html\n\n# Run specific test categories\npytest -m \"not slow\" # Skip slow tests\npytest -m \"delta\" # Delta Lake tests only\npytest tests/test_integration_*.py # Integration tests only\n```\n\n**Performance Benefits:**\n- **4x speedup** for core tests (22s vs 2+ minutes)\n- **Smart categorization** of parallel vs sequential tests\n- **Zero failures** with reliable parallel execution\n- **Optimized test suite** with no duplicate tests\n\n## \ud83d\ude80 Production Deployment\n\n### Databricks\n```python\nfrom sparkforge import PipelineBuilder\n\n# Spark session is automatically available\nbuilder = PipelineBuilder(\n spark=spark,\n schema=\"production_schema\",\n min_bronze_rate=99.0,\n min_silver_rate=95.0,\n min_gold_rate=90.0,\n enable_parallel_silver=True,\n max_parallel_workers=8,\n verbose=True\n)\n```\n\n### AWS EMR / Azure Synapse\n```python\nfrom sparkforge import PipelineBuilder\n\n# Configure for cloud storage\nbuilder = PipelineBuilder(spark=spark, schema=\"my_schema\")\npipeline = builder.to_pipeline()\nresult = pipeline.run_incremental(bronze_sources={\"events\": source_df})\n```\n\n## \ud83e\udd1d Contributing\n\nWe welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details.\n\n### Quick Start for Contributors\n1. **Fork the repository**\n2. **Clone your fork**: `git clone https://github.com/eddiethedean/sparkforge.git`\n3. **Install in development mode**: `pip install -e .`\n4. **Run fast parallel tests**: `python tests/run_tests_parallel.py --workers 4`\n5. **Create a feature branch**: `git checkout -b feature/amazing-feature`\n6. **Make your changes and add tests**\n7. **Run tests**: `python tests/run_tests_parallel.py --workers 4`\n8. 
**Submit a pull request**\n\n## \ud83d\udcdd License\n\nThis project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.\n\n## \ud83c\udfc6 Acknowledgments\n\n- Built on top of [Apache Spark](https://spark.apache.org/)\n- Powered by [Delta Lake](https://delta.io/)\n- Inspired by the Medallion Architecture pattern\n- Thanks to the PySpark and Delta Lake communities\n\n---\n\n**Made with \u2764\ufe0f for the data engineering community**\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "A powerful data pipeline builder for Apache Spark and Databricks",
"version": "0.4.5",
"project_urls": {
"Bug Tracker": "https://github.com/eddiethedean/sparkforge/issues",
"Documentation": "https://sparkforge.readthedocs.io/",
"Homepage": "https://github.com/eddiethedean/sparkforge",
"Repository": "https://github.com/eddiethedean/sparkforge"
},
"split_keywords": [
"spark",
" databricks",
" pipeline",
" etl",
" data-engineering",
" data-lakehouse",
" bronze-silver-gold",
" delta-lake",
" big-data"
],
"urls": [
{
"comment_text": null,
"digests": {
"blake2b_256": "770eab3e0970ef4452d74b2d3365f22bada70fc850313a76f47776006782cc7d",
"md5": "89bcd1d99cdf90f0fbb47603a8fcfeda",
"sha256": "86197c37136a48586267a24e3d1b7725f6a0fb223f555e624dcb4740d366f5f6"
},
"downloads": -1,
"filename": "sparkforge-0.4.5-py3-none-any.whl",
"has_sig": false,
"md5_digest": "89bcd1d99cdf90f0fbb47603a8fcfeda",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.8",
"size": 154572,
"upload_time": "2025-09-15T21:36:11",
"upload_time_iso_8601": "2025-09-15T21:36:11.380226Z",
"url": "https://files.pythonhosted.org/packages/77/0e/ab3e0970ef4452d74b2d3365f22bada70fc850313a76f47776006782cc7d/sparkforge-0.4.5-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": null,
"digests": {
"blake2b_256": "d697d6bea8385a7c2904744e4dd6179551d16909874e020f30ab6a0950fdbd88",
"md5": "356475bc0730aab40fea4948da0840b2",
"sha256": "38594ccc0dbb8c04ffcbb7d0fab77f3b9598f6f89982e0e5e5f7842c6760c3d7"
},
"downloads": -1,
"filename": "sparkforge-0.4.5.tar.gz",
"has_sig": false,
"md5_digest": "356475bc0730aab40fea4948da0840b2",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.8",
"size": 209059,
"upload_time": "2025-09-15T21:36:14",
"upload_time_iso_8601": "2025-09-15T21:36:14.786170Z",
"url": "https://files.pythonhosted.org/packages/d6/97/d6bea8385a7c2904744e4dd6179551d16909874e020f30ab6a0950fdbd88/sparkforge-0.4.5.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2025-09-15 21:36:14",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "eddiethedean",
"github_project": "sparkforge",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "pyspark",
"specs": [
[
"==",
"3.2.4"
]
]
},
{
"name": "pydantic",
"specs": [
[
">=",
"1.8.0"
]
]
},
{
"name": "delta-spark",
"specs": [
[
"<",
"2.0.0"
],
[
">=",
"1.2.0"
]
]
},
{
"name": "pandas",
"specs": [
[
">=",
"1.3.0"
]
]
},
{
"name": "numpy",
"specs": [
[
">=",
"1.21.0"
]
]
},
{
"name": "psutil",
"specs": [
[
">=",
"5.8.0"
]
]
},
{
"name": "pyarrow",
"specs": [
[
">=",
"8.0.0"
]
]
},
{
"name": "fastparquet",
"specs": [
[
">=",
"0.8.0"
]
]
}
],
"lcname": "sparkforge"
}