# Data Playbook
:book: Playbooks for data. Open, process, and save table-based data.
[![Workflow Status](https://github.com/kellerza/data-playbook/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/kellerza/data-playbook/actions)
[![codecov](https://codecov.io/gh/kellerza/data-playbook/branch/master/graph/badge.svg)](https://codecov.io/gh/kellerza/data-playbook)
Automate repetitive tasks on table-based data. Includes various input and output tasks.
Install: `pip install dataplaybook`
Use the `@task` and `@playbook` decorators:

```python
from dataplaybook import task, playbook
from dataplaybook.tasks.io_xlsx import read_excel

@task
def print_head(table):
    # Print the first rows of a table.
    for row in table[:5]:
        print(row)
```
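Since tasks are plain Python functions, a playbook can simply call them in sequence. A minimal sketch, assuming a `@playbook`-decorated function just invokes tasks (the in-memory table is illustrative, and the `RowData` type annotations used in the repo are omitted for brevity):

```python
@playbook
def show_data():
    # Build a small in-memory table and print it with the task above.
    rows = [{"id": 1, "name": "one"}, {"id": 2, "name": "two"}]
    print_head(table=rows)
```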
## Tasks
Tasks are implemented as simple Python functions; the modules can be found in the `dataplaybook/tasks` folder.
| Module | Functions |
| :------------------------------------------------------------------------------ | :--------------------------------------------------------------------------------------------------------------- |
| Generic functions to work on tables<br>`dataplaybook.tasks` | build_lookup, build_lookup_dict, combine, ensure_lists, filter_rows, print_table, remove_null, replace, unique, vlookup |
| Fuzzy string matching<br>`dataplaybook.tasks.fuzzy`<br>Requires _pip install fuzzywuzzy_ | fuzzy_match |
| IETF standards extraction<br>`dataplaybook.tasks.ietf` | add_standards_column, extract_standards_from_table |
| GIS tasks<br>`dataplaybook.tasks.gis` | linestring |
| Send mail<br>`dataplaybook.tasks.io_mail` | mail |
| Misc IO tasks<br>`dataplaybook.tasks.io_misc` | file_rotate, glob, read_csv, read_json, read_tab_delim, read_text_regex, wget, write_csv, write_json |
| MongoDB functions<br>`dataplaybook.tasks.io_mongo` | columns_to_list, list_to_columns, mongo_delete_sids, mongo_list_sids, mongo_sync_sids, read_mongo, write_mongo |
| PDF functions. Requires _pdftotext_ on your path<br>`dataplaybook.tasks.io_pdf` | read_pdf_files, read_pdf_pages |
| Read/write Excel files<br>`dataplaybook.tasks.io_xlsx` | read_excel, write_excel |
| Read XML<br>`dataplaybook.tasks.io_xml` | read_xml |
The full list of tasks and their signatures can be printed with the CLI:

```bash
$ dataplaybook --all -vvv
dataplaybook.tasks
- build_lookup "(table: list[RowData], key: str, columns: list[str]) -> RowDataGen"
- build_lookup_dict "(table: list[RowData], key: str | list[str], columns: list[str] | None = None) -> dict[str | tuple, Any]"
- combine "(tables: list[list[RowData]], key: str, columns: list[str], value: Union[Literal[True], str] = True) -> list[RowData]"
- ensure_lists "(tables: Sequence[list[RowData]], columns: Sequence[str]) -> None"
- filter_rows "(table: list[RowData], include: dict[str, str] | None = None, exclude: dict[str, str] | None = None) -> RowDataGen"
- print_table "(*, table: list[RowData] | None = None, tables: dict[str, list[RowData]] | None = None) -> None"
- remove_null "(tables: Sequence[list[RowData]]) -> None"
- replace "(table: list[RowData], replace_dict: dict[str, str], columns: list[str]) -> None"
- unique "(table: list[RowData], key: str) -> RowDataGen"
- vlookup "(table0: list[RowData], acro: list[RowData], columns: list[str]) -> None"
dataplaybook.tasks.fuzzy
- fuzzy_match "(table1: list[RowData], table2: list[RowData], t1_column: str, t2_column: str, t1_target_column: str) -> None"
dataplaybook.tasks.ietf
- add_standards_column "(table: list[RowData], columns: list[str], rfc_col: str) -> None"
- extract_standards_from_table "(table: list[RowData], extract_columns: list[str], include_columns: list[str] | None = None, name: str = '', line_offset: int = 1) -> RowDataGen"
dataplaybook.tasks.gis
- linestring "(table: list[RowData], lat_a: str = 'latA', lat_b: str = 'latB', lon_a: str = 'lonA', lon_b: str = 'lonB', linestring_column: str = 'linestring', error: str = '22 -22') -> list[RowData]"
dataplaybook.tasks.io_mail
- mail "(to_addrs: list[str] | str, from_addr: str, subject: str, server: str, files: list[str] | None = None, priority: int = 4, body: str | None = '', html: str | None = '', cc_addrs: list[str] | None = None, bcc_addrs: list[str] | None = None) -> None"
dataplaybook.tasks.io_misc
- file_rotate "(file: str, count: int = 3) -> None"
- glob "(patterns: list[str]) -> RowDataGen"
- read_csv "(file: str, columns: dict[str, str] | None = None) -> RowDataGen"
- read_json "(file: str) -> list[RowData]"
- read_tab_delim "(file: str, headers: list[str]) -> RowDataGen"
- read_text_regex "(filename: str, newline: Pattern, fields: Optional[Pattern]) -> RowDataGen"
- wget "(url: str, file: str, age: int = 172800) -> None"
- write_csv "(table: list[RowData], file: str, header: list[str] | None = None) -> None"
- write_json "(data: dict[str, list[RowData]] | list[RowData], file: str, only_var: bool = False) -> None"
dataplaybook.tasks.io_mongo
- columns_to_list "(table: 'list[RowData]', *, list_column: 'str', columns: 'Columns') -> 'None'"
- list_to_columns "(table: 'list[RowData]', *, list_column: 'str', columns: 'Columns') -> 'None'"
- mongo_delete_sids "(*, mdb: 'MongoURI', sids: 'list[str]') -> 'None'"
- mongo_list_sids "(mdb: 'MongoURI') -> 'list[str]'"
- mongo_sync_sids "(*, mdb_local: 'MongoURI', mdb_remote: 'MongoURI', ignore_remote: 'Sequence[str] | None' = None, only_sync_sids: 'Sequence[str] | None' = None) -> 'None'"
- read_mongo "(mdb: 'MongoURI', *, set_id: 'str | None' = None) -> 'RowDataGen'"
- write_mongo "(table: 'list[RowData]', mdb: 'MongoURI', *, set_id: 'str | None' = None, force: 'bool' = False) -> 'None'"
dataplaybook.tasks.io_pdf
- read_pdf_files "(folder: str, pattern: str = '*.pdf', *, layout: bool = True, args: list[str] | None = None) -> RowDataGen"
- read_pdf_pages "(filename: str, *, layout: bool = True, args: list[str] | None = None) -> RowDataGen"
dataplaybook.tasks.io_xlsx
- read_excel "(*, tables: dict[str, list[RowData]], file: str, sheets: list[RowData] | None = None) -> list[str]"
- write_excel "(*, tables: dict[str, list[RowData]], file: str, include: list[str] | None = None, header: list[str] | None = None, headers: list[Any] | None = None, ensure_string: bool = False) -> None"
dataplaybook.tasks.io_xml
- read_xml "(tables: dict[str, list[RowData]], file: str, targets: list[str]) -> None"
```
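The signatures above compose into a small read-filter-write pipeline. A sketch using the listed `read_excel`, `filter_rows` and `write_excel` signatures (the file names and the `status` column are hypothetical, and whether tasks can be called outside a running playbook may depend on the version):

```python
from dataplaybook.tasks import filter_rows
from dataplaybook.tasks.io_xlsx import read_excel, write_excel

tables: dict = {}
read_excel(tables=tables, file="input.xlsx")  # assumed to populate `tables`, one table per sheet
for name in list(tables):
    # filter_rows returns a generator, so materialize it back into a list
    tables[name] = list(filter_rows(table=tables[name], include={"status": "active"}))
write_excel(tables=tables, file="output.xlsx")
```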
## Local development
Poetry is used for dependency management. Install Poetry, then install the dependencies (including all extras):
```bash
poetry install -E all
```
pre-commit is used for code formatting and linting. Install pre-commit and run `pre-commit install` to set up the Git hooks:
```bash
pip install pre-commit && pre-commit install
```
Test locally using pre-commit (ruff, codespell, mypy & pylint):
```bash
git add . && pre-commit run --all
poetry run pylint dataplaybook tests
```
## Data Playbook v0 - origins
Data Playbook was created to replace various snippets of code I had lying around. They were all written to make some menial task repeatable, and generally followed a similar structure: load something, process it, and save it. (Process network data for GIS tools, network audits and reporting on router and NMS output, extract IETF standards to complete SOCs, read my bank statements into my Excel budgeting tool, etc.)
For many of these tasks I have specific processing code (`tasks_x.py`, loaded with `modules: [tasks_x]` in the playbook), but in almost all cases the input and output tasks (and configuring their names, etc.) are common. The idea of modular tasks originally came from Home Assistant, where I started learning Python and encountered the idea of "custom components" for adding your own integrations, although one could argue this also has similarities to Ansible playbooks.
In many cases I have a 'loose' coupling to actual file names, using Everything search (`!es search_pattern` in the playbook) to resolve a search pattern to the correct input file.
It shares some parts with Ansible Playbooks, and indeed the name was chosen after I was introduced to them. The task structure was updated in 2019 to match the Ansible Playbooks 2.0/2.5+ format and to allow names. This format also makes it easier to introduce loop mechanisms, etc.
### Comparison to Ansible Playbooks
Data Playbook is intended to create and modify variables in the environment (similar to Ansible's **inventory**). It starts with an empty environment (although you can populate the environment from various sources inside the play).
Although new variables can be created using **register:** in Ansible, Data Playbook functions require their output to be captured through `target:`.
Data Playbook tasks differ from Ansible's **actions**:
- They are mostly not idempotent, since the intention is to modify tables as we go along.
- They can return lists containing rows, or be Python generators that `yield` rows of a table (see the sketch after this list).
- If they don't return any tabular data (a list), the return value is added to the `var` table in the environment.
- Each has a strict voluptuous schema, evaluated when loading and at runtime (e.g. to expand templates), to allow quick troubleshooting.
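As an illustration of the generator style mentioned above, a task can simply `yield` rows. A minimal sketch (the column name is made up; the `RowData` annotations used in the repo are omitted):

```python
@task
def number_rows(table):
    # Yield a copy of each row with an added sequence-number column.
    for no, row in enumerate(table, start=1):
        yield {"no": no, **row}
```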
You could argue all of this can be done with Ansible, but it won't be as elegant, with single-item hosts files, `gather_facts: no` and `delegate_to: localhost` throughout the playbooks. It would likely be only half as much fun trying to force it into my way of thinking.