# DataProcessing
A user-friendly Python package for working with CSV data. DataProcessing makes common CSV operations simple and intuitive, with smart defaults and helpful error messages.
## Features
- **Smart Loading**: Auto-detect encodings and delimiters, and handle malformed files
- **Intuitive API**: Chainable methods for filtering, sorting, and data manipulation
- **SQL Support**: Write SQL queries directly on CSV data
- **Live Data**: Connect to databases, APIs, and real-time data streams
- **Helpful Errors**: Clear error messages instead of cryptic pandas errors
- **Smart Defaults**: Works out of the box with minimal configuration
- **Data Exploration**: Quick summaries and data profiling
## Quick Start
```python
from dataprocessing import load, load_from_db, load_from_api, import_live, create_live_stream

# Load CSV with smart defaults
data = load("data.csv")

# Filter and manipulate
filtered = data.where(data['age'] > 25).sort_by("name")

# Or use SQL
filtered = data.sql("SELECT * FROM data WHERE age > 25 ORDER BY name")

# Load from database
db_data = load_from_db('postgresql', 'postgresql://user:pass@localhost/db', 'SELECT * FROM users')

# Load from API
api_data = load_from_api('https://api.example.com', '/data')

# Simple live data import
data = import_live("@https://example.com/live-data.csv")
live_data = create_live_stream(data, interval=60)
results = live_data.sql("SELECT * FROM data LIMIT 10")

# Save with automatic formatting
filtered.save("output.csv")
```
## Installation
```bash
pip install dataprocesslite
```
For faster performance with large datasets:
```bash
pip install "dataprocesslite[fast]"
```
## Basic Usage
### Loading Data
```python
from dataprocessing import load

# Simple loading with auto-detection
data = load("data.csv")

# With custom options
data = load("data.csv", encoding="utf-8", delimiter=";")
```
### Data Manipulation
```python
# Filtering
young_users = data.where(data['age'] < 30)
active_users = data.where(data['status'] == "active")

# Sorting
sorted_data = data.sort_by("name")
sorted_data = data.sort_by("age", ascending=False)

# Column operations
data = data.rename_column("old_name", "new_name")
data = data.select_columns(["name", "age", "email"])
data = data.drop_columns(["unused_column"])

# Adding columns
data = data.add_column("full_name", data["first_name"] + " " + data["last_name"])
```
### Data Exploration
```python
# Quick summary
print(data.summary())

# Data profiling
print(data.profile())

# Preview data
print(data.head())
print(data.tail())
print(data.sample(5))
```
### SQL Support
```python
# Basic queries
result = data.sql("SELECT * FROM data WHERE age > 25")

# Aggregations
summary = data.sql("SELECT COUNT(*) as count, AVG(age) as avg_age FROM data")

# Group by
grouped = data.sql("SELECT city, COUNT(*) as count FROM data GROUP BY city")

# Complex queries
complex_result = data.sql("""
    SELECT
        city,
        COUNT(*) as total_users,
        AVG(age) as avg_age,
        MAX(salary) as max_salary
    FROM data
    WHERE age > 25
    GROUP BY city
    HAVING COUNT(*) > 1
    ORDER BY avg_age DESC
""")
```
### Live Data Connections
```python
from dataprocessing import load_from_db, load_from_api, create_live_stream, CSVData

# Database connections
data = load_from_db('postgresql', 'postgresql://user:pass@localhost/db', 'SELECT * FROM users')

# API connections
data = load_from_api('https://api.example.com', '/users', headers={'Authorization': 'Bearer token'})

# Real-time data streams
def get_sensor_data():
    return {'temperature': 25.5, 'humidity': 60}

stream = create_live_stream(get_sensor_data, interval=1.0)
stream.start()
data = CSVData(stream.get_latest_data())
```
### Simple Live Data Import
```python
from dataprocessing import import_live, create_live_stream

# Super simple syntax
data = import_live("@https://example.com/live-data.csv")
live_data = create_live_stream(data, interval=60)

print(live_data.header)
results = live_data.sql("SELECT * FROM data LIMIT 10")
print(results)
```
### Chaining Operations
```python
data = load("data.csv")
result = (data
    .where(data['age'] > 18)
    .where(data['status'] == "active")
    .sort_by("name")
    .select_columns(["name", "email", "age"])
    .save("filtered_data.csv"))
```
### Data Validation
```python
# Validate data types
data = data.validate_types({
    "age": "int",
    "email": "email",
    "date": "date"
})

# Check for missing values
missing_report = data.check_missing()
```
### Data Cleaning
```python
# Handle missing values
data = data.fill_missing("age", 0)
data = data.drop_missing(["email"])

# Remove duplicates
data = data.drop_duplicates()
```
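
Because every helper returns a new dataset, a full cleaning pass composes naturally into a pipeline. A minimal sketch chaining only the calls shown above; the file names are placeholders:

```python
from dataprocessing import load

# A typical cleaning pass: load, repair missing values, dedupe, save.
# "users.csv" and "users_clean.csv" are placeholder file names.
data = load("users.csv")
data = data.fill_missing("age", 0)    # default missing ages to 0
data = data.drop_missing(["email"])   # drop rows without an email
data = data.drop_duplicates()         # keep one row per unique record
data.save("users_clean.csv")
```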
## Error Handling
DataProcessing provides helpful error messages:
```python
# Instead of: KeyError: 'age'
# You get: Column 'age' not found. Did you mean 'Age'?

# Instead of: UnicodeDecodeError
# You get: Unable to read file encoding. Try specifying encoding='utf-8'
```
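
These friendlier messages surface as ordinary Python exceptions, so they can be handled with a standard `try`/`except`. A minimal sketch; the exact exception classes aren't documented here, so it catches broadly:

```python
from dataprocessing import load

try:
    data = load("data.csv")
    adults = data.where(data['age'] > 18)
except Exception as err:  # specific exception types are not documented here
    # The message is designed to be actionable on its own.
    print(f"Could not process data.csv: {err}")
```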
## Performance
For large datasets, use the fast backend:
```python
from dataprocessing import load

# Uses Polars for faster performance
data = load("large_file.csv", backend="polars")
```
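
Whether the Polars backend actually helps depends on file size and column types, so it's worth measuring on your own data. A minimal timing sketch using only the standard library and the `load()` call shown above; `large_file.csv` is a placeholder:

```python
import time

from dataprocessing import load

def time_load(**kwargs):
    """Time a single load() call; kwargs are passed straight to load()."""
    start = time.perf_counter()
    load("large_file.csv", **kwargs)
    return time.perf_counter() - start

print(f"default backend: {time_load():.2f}s")
print(f"polars backend:  {time_load(backend='polars'):.2f}s")
```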
## Examples
Check out the `examples/` directory for comprehensive usage examples:
- `basic_usage.py` - Basic CSV operations
- `advanced_usage.py` - Advanced data manipulation
- `sql_usage.py` - SQL query examples
- `live_data_usage.py` - Database and API connections
- `simple_live_usage.py` - Simple live data import
## Contributing
Contributions are welcome! Please feel free to submit a Pull Request.
## License
This project is licensed under the MIT License - see the LICENSE file for details.