# dutil
A few data utilities to make the life of a data scientist easier.
## Installation
```shell
pip install dutil
```
## Modules
- `pipeline` (data caching and pipelines)
- `stats` (statistical functions)
- `string` (string manipulations)
- `transform` (data transformations)
- `jupyter` (tools for jupyter notebooks)
### Pipeline
```python
import dutil.pipeline as dpipe
import pandas as pd
import numpy as np
from loguru import logger
# --- Define data transformations via step functions (similar to dask.delayed)
@dpipe.delayed_cached() # lazy computation + caching on disk
def load_1():
df = pd.DataFrame({'a': [1., 2.], 'b': [0.1, np.nan]})
logger.info('Loaded {} records'.format(len(df)))
return df
@dpipe.delayed_cached() # lazy computation + caching on disk
def load_2(timestamp):
df = pd.DataFrame({'a': [0.9, 3.], 'b': [0.001, 1.]})
logger.info('Loaded {} records'.format(len(df)))
return df
@dpipe.delayed_cached() # lazy computation + caching on disk
def compute(x, y, eps):
assert x.shape == y.shape
diff = ((x - y).abs() / (y.abs()+eps)).mean().mean()
logger.info('Difference is computed')
return diff
# Define pipeline dependencies
ts = pd.Timestamp(2019, 1, 1)
eps = 0.01
s1 = load_1()
s2 = load_2(ts)
diff = compute(s1, s2, eps)
# Trigger pipeline execution
print('diff: {:.3f}'.format(dpipe.delayed_compute((diff, ))[0]))
```
### Stats
```python
from dutil.stats import mean_lower, mean_upper
import pandas as pd
ss = pd.Series([0, 1, 5, -1])
mean_lower(ss) # Compute the mean of the 50% smallest elements
mean_upper(ss) # Compute the mean of the 50% largest elements
```
### String
```python
from dutil.string import compare_companies
compare_companies("Aarons Holdings Company Inc.", "Aaron's, Inc.") # Give match rating for two company names
```
### Transform
```python
from dutil.transform import ht
import pandas as pd
df = pd.DataFrame({'a': [0, 2, 2, 4, 6], 'b': [1, 1, 1, 1, 1]})
ht(df) # Return first and last rows of a DataFrame, a Series, or an array
```
### Jupyter
```python
from dutil.jupyter import dht
import pandas as pd
df = pd.DataFrame({'a': [0, 2, 2, 4, 6], 'b': [1, 1, 1, 1, 1]})
dht(df) # Display first and last rows of a DataFrame, a Series, or an array in a Jupyter notebook
```
Raw data
{
"_id": null,
"home_page": "https://github.com/mysterious-ben/dutil",
"name": "dutil",
"maintainer": "",
"docs_url": null,
"requires_python": ">=3.7",
"maintainer_email": "",
"keywords": "",
"author": "Yaroslav Kopotilov",
"author_email": "",
"download_url": "https://files.pythonhosted.org/packages/73/a6/a9b2345a1196aa80e819c47364e3b5bb7a6285bf8f3aeddc9fd3529172e1/dutil-0.2.21.tar.gz",
"platform": "",
"description": "# dutil\n\nA few data utilities to make life of a data scientist easier\n\n## Installation\n\n```shell\npip install dutil\n```\n\n## Modules\n\n- `pipeline` (data caching and pipelines)\n- `stats` (statistical functions)\n- `string` (string manipulations)\n- `transform` (data transformations)\n- `jupyter` (tools for jupyter notebooks)\n\n\n### Pipeline\n\n```python\nimport dutil.pipeline as dpipe\nimport pandas as pd\nimport numpy as np\nfrom loguru import logger\n\n# --- Define data transformations via step functions (similar to dask.delayed)\n\n@dpipe.delayed_cached() # lazy computation + caching on disk\ndef load_1():\n df = pd.DataFrame({'a': [1., 2.], 'b': [0.1, np.nan]})\n logger.info('Loaded {} records'.format(len(df)))\n return df\n\n@dpipe.delayed_cached() # lazy computation + caching on disk\ndef load_2(timestamp):\n df = pd.DataFrame({'a': [0.9, 3.], 'b': [0.001, 1.]})\n logger.info('Loaded {} records'.format(len(df)))\n return df\n\n@dpipe.delayed_cached() # lazy computation + caching on disk\ndef compute(x, y, eps):\n assert x.shape == y.shape\n diff = ((x - y).abs() / (y.abs()+eps)).mean().mean()\n logger.info('Difference is computed')\n return diff\n\n# Define pipeline dependencies\nts = pd.Timestamp(2019, 1, 1)\neps = 0.01\ns1 = load_1()\ns2 = load_2(ts)\ndiff = compute(s1, s2, eps)\n\n# Trigger pipeline execution\nprint('diff: {:.3f}'.format(dpipe.delayed_compute((diff, ))[0]))\n```\n\n### Stats\n\n```python\nfrom dutil.stats import mean_lower, mean_upper\nimport pandas as pd\nss = pd.Series([0, 1, 5, -1])\nmean_lower(ss) # Compute mean among 50% smallest elements\nmean_upper(ss) # Compute mean among 50% biggest elements\n```\n\n### String\n\n```python\nfrom dutil.string import compare_companies\ncompare_companies(\"Aarons Holdings Company Inc.\", \"Aaron's, Inc.\") # Give match rating for two company names\n```\n\n### Transform\n\n```python\nfrom dutil.transform import ht\nimport pandas as pd\ndf = pd.DataFrame({'a': [0, 2, 2, 4, 
6], 'b': [1, 1, 1, 1, 1]})\nht(df) # Return first and last rows of a DataFrame, a Series, or an array\n```\n\n### Jupyter\n\n```python\nfrom dutil.jupyter import dht\nimport pandas as pd\ndf = pd.DataFrame({'a': [0, 2, 2, 4, 6], 'b': [1, 1, 1, 1, 1]})\ndht(df) # Display first and last rows of a DataFrame, a Series, or an array in a Jupyter notebook\n```\n\n\n",
"bugtrack_url": null,
"license": "Apache License, Version 2.0",
"summary": "A few useful tools for data wrangling",
"version": "0.2.21",
"split_keywords": [],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "bdabe57413a062af54dccea8beb9cbd0351b488ea75894bbff1fc02d6b625959",
"md5": "00a546df5eea29ed530dc66bdecff192",
"sha256": "ef38e08dcf5556c553595e10988ae512884995d08cc154c3b7b5799b450dddd1"
},
"downloads": -1,
"filename": "dutil-0.2.21-py3-none-any.whl",
"has_sig": false,
"md5_digest": "00a546df5eea29ed530dc66bdecff192",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.7",
"size": 19721,
"upload_time": "2020-12-25T14:24:35",
"upload_time_iso_8601": "2020-12-25T14:24:35.841586Z",
"url": "https://files.pythonhosted.org/packages/bd/ab/e57413a062af54dccea8beb9cbd0351b488ea75894bbff1fc02d6b625959/dutil-0.2.21-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "73a6a9b2345a1196aa80e819c47364e3b5bb7a6285bf8f3aeddc9fd3529172e1",
"md5": "d3f4d8fa73f7289ce15d8642c423c7f6",
"sha256": "1112b4e7077fae7d9e831386da98807691c137a581bce8e3ad5ff8dd6c21f68d"
},
"downloads": -1,
"filename": "dutil-0.2.21.tar.gz",
"has_sig": false,
"md5_digest": "d3f4d8fa73f7289ce15d8642c423c7f6",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.7",
"size": 13372,
"upload_time": "2020-12-25T14:24:36",
"upload_time_iso_8601": "2020-12-25T14:24:36.914734Z",
"url": "https://files.pythonhosted.org/packages/73/a6/a9b2345a1196aa80e819c47364e3b5bb7a6285bf8f3aeddc9fd3529172e1/dutil-0.2.21.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2020-12-25 14:24:36",
"github": true,
"gitlab": false,
"bitbucket": false,
"github_user": "mysterious-ben",
"github_project": "dutil",
"travis_ci": false,
"coveralls": false,
"github_actions": true,
"requirements": [
{
"name": "dask",
"specs": [
[
"==",
"2.29.0"
]
]
},
{
"name": "dill",
"specs": [
[
"==",
"0.3.2"
]
]
},
{
"name": "fuzzywuzzy",
"specs": [
[
"==",
"0.18.0"
]
]
},
{
"name": "ipython",
"specs": [
[
"==",
"7.19.0"
]
]
},
{
"name": "loguru",
"specs": [
[
"==",
"0.5.0"
]
]
},
{
"name": "numpy",
"specs": [
[
"==",
"1.18.4"
]
]
},
{
"name": "pandas",
"specs": [
[
"==",
"1.0.4"
]
]
},
{
"name": "pyarrow",
"specs": [
[
"==",
"1.0.1"
]
]
},
{
"name": "pytest",
"specs": [
[
"==",
"5.4.2"
]
]
},
{
"name": "python_Levenshtein",
"specs": [
[
"==",
"0.12.0"
]
]
},
{
"name": "xxhash",
"specs": [
[
"==",
"2.0.0"
]
]
}
],
"lcname": "dutil"
}