dataplaybook

Name: dataplaybook
Version: 1.0.20
Home page: https://github.com/kellerza/data-playbook
Summary: Playbooks for data. Open, process and save table based data.
Upload time: 2024-10-02 13:13:27
Author: Johann Kellerman
Requires Python: >3.10
License: Apache-2.0
Keywords: data, tables, excel, mongodb, generators

# Data Playbook

:book: Playbooks for data. Open, process and save table based data.
[![Workflow Status](https://github.com/kellerza/data-playbook/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/kellerza/data-playbook/actions)
[![codecov](https://codecov.io/gh/kellerza/data-playbook/branch/master/graph/badge.svg)](https://codecov.io/gh/kellerza/data-playbook)

Automate repetitive tasks on table-based data. Includes various input and output tasks.

Install: `pip install dataplaybook`

Use the `@task` and `@playbook` decorators:

```python
from dataplaybook import task, playbook
from dataplaybook.tasks.io_xlsx import read_excel, write_excel

@task
def print_hello(*, name: str) -> None:
    print(f"Hello {name}")

@playbook
def my_playbook() -> None:
    print_hello(name="world")
```

## Tasks

Tasks are implemented as simple Python functions; the modules can be found in the `dataplaybook/tasks` folder.

| Module                                                                                   | Functions                                                                                                               |
| :--------------------------------------------------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------- |
| Generic functions to work on tables<br>`dataplaybook.tasks`                              | build_lookup, build_lookup_dict, combine, ensure_lists, filter_rows, print_table, remove_null, replace, unique, vlookup |
| Fuzzy string matching<br>`dataplaybook.tasks.fuzzy`<br>Requires _pip install fuzzywuzzy_ | fuzzy_match                                                                                                             |
| Read/write Excel files<br>`dataplaybook.tasks.io_xlsx`                                   | read_excel, write_excel                                                                                                 |
| Misc IO tasks<br>`dataplaybook.tasks.io_misc`                                            | file_rotate, glob, read_csv, read_json, read_tab_delim, read_text_regex, wget, write_csv, write_json                    |
| MongoDB functions<br>`dataplaybook.tasks.io_mongo`                                       | columns_to_list, list_to_columns, mongo_delete_sids, mongo_list_sids, mongo_sync_sids, read_mongo, write_mongo          |
| PDF functions. Requires _pdftotext_ on your path<br>`dataplaybook.tasks.io_pdf`          | read_pdf_pages, read_pdf_files                                                                                          |
| Read XML<br>`dataplaybook.tasks.io_xml`                                                  | read_xml                                                                                                                |

```bash
$ dataplaybook --all -vvv
dataplaybook.tasks
- build_lookup "(table: list[RowData], key: str, columns: list[str]) -> RowDataGen"
- build_lookup_dict "(table: list[RowData], key: str | list[str], columns: list[str] | None = None) -> dict[str | tuple, Any]"
- combine "(tables: list[list[RowData]], key: str, columns: list[str], value: Union[Literal[True], str] = True) -> list[RowData]"
- ensure_lists "(tables: Sequence[list[RowData]], columns: Sequence[str]) -> None"
- filter_rows "(table: list[RowData], include: dict[str, str] | None = None, exclude: dict[str, str] | None = None) -> RowDataGen"
- print_table "(*, table: list[RowData] | None = None, tables: dict[str, list[RowData]] | None = None) -> None"
- remove_null "(tables: Sequence[list[RowData]]) -> None"
- replace "(table: list[RowData], replace_dict: dict[str, str], columns: list[str]) -> None"
- unique "(table: list[RowData], key: str) -> RowDataGen"
- vlookup "(table0: list[RowData], acro: list[RowData], columns: list[str]) -> None"
dataplaybook.tasks.fuzzy
- fuzzy_match "(table1: list[RowData], table2: list[RowData], t1_column: str, t2_column: str, t1_target_column: str) -> None"
dataplaybook.tasks.ietf
- add_standards_column "(table: list[RowData], columns: list[str], rfc_col: str) -> None"
- extract_standards_from_table "(table: list[RowData], extract_columns: list[str], include_columns: list[str] | None = None, name: str = '', line_offset: int = 1) -> RowDataGen"
dataplaybook.tasks.gis
- linestring "(table: list[RowData], lat_a: str = 'latA', lat_b: str = 'latB', lon_a: str = 'lonA', lon_b: str = 'lonB', linestring_column: str = 'linestring', error: str = '22 -22') -> list[RowData]"
dataplaybook.tasks.io_mail
- mail "(to_addrs: list[str] | str, from_addr: str, subject: str, server: str, files: list[str] | None = None, priority: int = 4, body: str | None = '', html: str | None = '', cc_addrs: list[str] | None = None, bcc_addrs: list[str] | None = None) -> None"
dataplaybook.tasks.io_misc
- file_rotate "(file: str, count: int = 3) -> None"
- glob "(patterns: list[str]) -> RowDataGen"
- read_csv "(file: str, columns: dict[str, str] | None = None) -> RowDataGen"
- read_json "(file: str) -> list[RowData]"
- read_tab_delim "(file: str, headers: list[str]) -> RowDataGen"
- read_text_regex "(filename: str, newline: Pattern, fields: Optional[Pattern]) -> RowDataGen"
- wget "(url: str, file: str, age: int = 172800) -> None"
- write_csv "(table: list[RowData], file: str, header: list[str] | None = None) -> None"
- write_json "(data: dict[str, list[RowData]] | list[RowData], file: str, only_var: bool = False) -> None"
dataplaybook.tasks.io_mongo
- columns_to_list "(table: 'list[RowData]', *, list_column: 'str', columns: 'Columns') -> 'None'"
- list_to_columns "(table: 'list[RowData]', *, list_column: 'str', columns: 'Columns') -> 'None'"
- mongo_delete_sids "(*, mdb: 'MongoURI', sids: 'list[str]') -> 'None'"
- mongo_list_sids "(mdb: 'MongoURI') -> 'list[str]'"
- mongo_sync_sids "(*, mdb_local: 'MongoURI', mdb_remote: 'MongoURI', ignore_remote: 'Sequence[str] | None' = None, only_sync_sids: 'Sequence[str] | None' = None) -> 'None'"
- read_mongo "(mdb: 'MongoURI', *, set_id: 'str | None' = None) -> 'RowDataGen'"
- write_mongo "(table: 'list[RowData]', mdb: 'MongoURI', *, set_id: 'str | None' = None, force: 'bool' = False) -> 'None'"
dataplaybook.tasks.io_pdf
- read_pdf_files "(folder: str, pattern: str = '*.pdf', *, layout: bool = True, args: list[str] | None = None) -> RowDataGen"
- read_pdf_pages "(filename: str, *, layout: bool = True, args: list[str] | None = None) -> RowDataGen"
dataplaybook.tasks.io_xlsx
- read_excel "(*, tables: dict[str, list[RowData]], file: str, sheets: list[RowData] | None = None) -> list[str]"
- write_excel "(*, tables: dict[str, list[RowData]], file: str, include: list[str] | None = None, header: list[str] | None = None, headers: list[Any] | None = None, ensure_string: bool = False) -> None"
dataplaybook.tasks.io_xml
- read_xml "(tables: dict[str, list[RowData]], file: str, targets: list[str]) -> None"
```
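
As an example of chaining these tasks, the sketch below reads a workbook, filters one of its tables and writes the result. It follows the signatures in the listing above, but the file name `inventory.xlsx` and the table names `devices` and `active` are made up, and it assumes tasks can be called as plain functions inside a playbook:

```python
from dataplaybook import playbook
from dataplaybook.tasks import filter_rows
from dataplaybook.tasks.io_xlsx import read_excel, write_excel

@playbook
def active_devices() -> None:
    # Tables live in a dict of name -> list of rows, matching the
    # dict[str, list[RowData]] parameters in the listing above.
    tables: dict[str, list[dict]] = {}
    read_excel(tables=tables, file="inventory.xlsx")
    # filter_rows returns a generator (RowDataGen); materialize it as a new table
    tables["active"] = list(
        filter_rows(table=tables["devices"], include={"status": "active"})
    )
    write_excel(tables=tables, file="active.xlsx", include=["active"])
```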

## Local development

Poetry is used for dependency management. Install Poetry and run `poetry install` to install the dependencies (use `-E all` to include the optional extras).

```bash
poetry install -E all
```

pre-commit is used for code formatting and linting. Install pre-commit and run `pre-commit install` to install the git hooks.

```bash
pip install pre-commit && pre-commit install
```

Test locally using pre-commit (ruff, codespell, mypy & pylint):

```bash
git add . && pre-commit run --all
poetry run pylint dataplaybook tests
```

## Data Playbook v0 - origins

Data Playbook was created to replace various snippets of code I had lying around. They were all created to ensure repeatability of some menial task, and generally followed a similar structure: load something, process it and save it. (Process network data into GIS tools, network audits & reporting on router & NMS output, extract IETF standards to complete SOCs, read my bank statements into my Excel budgeting tool, etc.)

For many of these tasks I have specific processing code (`tasks_x.py`, loaded with `modules: [tasks_x]` in the playbook), but in almost all cases the input & output tasks (and configuring these names etc.) are common. The idea of modular tasks originally came from Home Assistant, where I started learning Python and encountered the idea of "custom components" to add your own integrations, although one could argue this also has similarities to Ansible playbooks.

In many cases I have a 'loose' coupling to actual file names, using Everything search (`!es search_pattern` in the playbook) to resolve a search pattern to the correct file used for input.

It has some parts in common with Ansible Playbooks; in fact, the name was chosen after I was introduced to Ansible Playbooks. The task structure was updated in 2019 to match the Ansible Playbooks 2.0/2.5+ format and to allow names. This format will also make it easier to introduce loop mechanisms etc.

### Comparison to Ansible Playbooks

Data Playbook is intended to create and modify variables in the environment (similar to **inventory**). It starts with an empty environment (although you can read the environment from various sources inside the play).
While new variables can be created using **register:** in Ansible, Data Playbook functions require their output to be captured through `target:`.
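
To make `target:` concrete, a v0-style YAML step might have looked roughly like the sketch below. Only `modules:`, `name:`, `target:` and the `!es` tag are documented above; the exact shape of the task body is illustrative:

```yaml
modules: [tasks_x]
tasks:
  - name: Load the latest statement
    read_csv:
      file: !es statement*.csv # Everything search resolves the pattern to a file
    target: statement # capture the returned rows as the 'statement' table
```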

Data Playbook tasks are different from Ansible's **actions**:

- They are mostly not idempotent, since the intention is to modify tables as we go along.
- They can return lists containing rows, or be Python generators that `yield` rows of a table (see the sketch after this list).
- If they don't return any tabular data (a list), the return value is added to the `var` table in the environment.
- Each has a strict voluptuous schema, evaluated when loading and during runtime (e.g. to expand templates), to allow quick troubleshooting.
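
As an illustration of the generator style, a minimal sketch follows; `number_rows` is a hypothetical task, not part of the library, and it assumes `@task` accepts generator functions as described above:

```python
from typing import Any, Iterator

from dataplaybook import task

@task
def number_rows(count: int = 3) -> Iterator[dict[str, Any]]:
    # Yield rows one at a time instead of building the whole table;
    # the playbook captures the yielded rows into a table.
    for idx in range(count):
        yield {"idx": idx, "square": idx * idx}
```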

You could argue I could do this with Ansible, but it won't be as elegant, with single-item hosts files, `gather_facts: no` and `delegate_to: localhost` throughout the playbooks. It would likely be only half as much fun trying to force it into my way of thinking.


            
