polarmints


Namepolarmints JSON
Version 0.1.24 PyPI version JSON
download
home_pageNone
Summarysyntactic sugar and additional namespaces for polars
upload_time2024-10-04 21:11:55
maintainerNone
docs_urlNone
authorJohn Smith
requires_python>=3.7
licenseNone
keywords
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
            # Polarmints
Syntactic sugar for [polars](https://docs.pola.rs/user-guide/migration/pandas/) <br>
Apologies: not all features are documented yet, so feel free to explore the codebase.

## Extensions
Extends polars DataFrames with additional namespaces for convenience functions.<br>
example:
```python
import polars as pl
from polarmints import PolarMints, c, DF
# Required for extending DFs with polarmints, even though PolarMints is not explicitly used below.
# Entries of __all__ must be strings (a class object here would break `from <module> import *`).
__all__ = ['PolarMints']

df = DF({
    'a': [1, 2, 3],
    'b': [1, 2, 3],
})

# Same key column 'a' as df, but stored as Int16 so the two frames' join-key dtypes differ.
df2 = DF({
    'a': [1, 2, 3],
    'c': [1, 2, 3],
}, schema_overrides={'a': pl.Int16})

# df.pm: convenience helper funcs
# implicitly converts datatypes before joining two DFs whose column types don't match
joined = df2.pm.join(df, 'a')

# df.pd: this is a contrived example, since it is more efficient to do natively in polars with
# pl.DataFrame.with_columns(...); however, pandas may have other DataFrame and Series methods
# that are not yet implemented in polars.
added_col = df.pd.assign(a2=1)
```

## DAG
Given an input _pl.DataFrame_, each _@node_-decorated method on a subclass of _DagBase_ represents a derived column, which may itself depend on other derived columns. A DAG is required to represent this hierarchy of dependencies, i.e. which columns to derive first and which ones can be derived in parallel. This framework is inspired by [MDF](https://github.com/man-group/mdf) and the gromit dag in [beacon.io](https://www.beacon.io/), except that the nodes represent [polars expressions](https://docs.pola.rs/py-polars/html/reference/expressions/index.html) instead of plain Python.

Example usage:
```python
from polarmints.dag.core import DagBase, node, s
from polarmints import c, DF

class DagExample(DagBase):
    """
    Example DAG. Each @node method returns the expression for one derived column, built from raw
    input columns (c[...]) and/or other derived columns (s[...]). The derived column takes the
    name of the method (see the aliases printed by ordered_exprs in the usage example).
    """

    @node
    def DerivedCol(self):
        """First-order derived column: raw input column 'raw2' plus 2."""
        return c['raw2'] + 2

    @node
    def OverridenCol(self):
        """
        First-order derived column: raw input column 'raw1' plus 1.

        An input column with this same name is replaced by this node's result only when
        override_existing=True is in effect; otherwise the input column is kept.
        NOTE(review): the original text says the flag is given when the instance is initialized,
        while the usage example passes it to select() -- confirm which call sites accept it.
        """
        return c['raw1'] + 1

    @node
    def DerivedCol_2ndOrder(self):
        """
        Second-order derived column: 'OverridenCol' plus raw input column 'raw3'.

        NOTE: 's' and 'c' are effectively the same; 's' is used merely for readability, to
        distinguish derived columns (s) from raw inputs (c).
        """
        return s['OverridenCol'] + c['raw3']

    @node
    def DerivedCol_2ndOrder_B(self):
        """Second-order derived column: sum of the derived columns 'OverridenCol' and 'DerivedCol'."""
        return s['OverridenCol'] + s['DerivedCol']


if __name__ == '__main__':
    # This is an instance rather than the class itself because some usages may require initializing
    # the DAG with instance-specific params when multiple instances are used in the same process.
    example = DagExample()

    # mock inputs; 'OverridenCol' deliberately clashes with the name of a node on DagExample
    df = DF({
        'raw1': [1, 2, 3],
        'raw2': [1, 2, 3],
        'raw3': [1, 2, 3],
        'OverridenCol': [10, 11, 12]
    })

    # add the desired derived columns to the mock inputs using the DAG
    df1 = example.with_cols(df,
        # function signature: *args and **kwargs expressions behave the same way as in
        # pl.DataFrame.with_columns() and .select()
        example.DerivedCol_2ndOrder,
        example.OverridenCol,  # input column is kept here: override_existing is not passed
        'raw2',  # can be mixed with raw pl.Exprs that don't depend on the DAG nodes
        # NOTE(review): the sample output below shows 'raw3' unchanged -- confirm how this is applied
        c['raw3'] + 2,
        
        **{
            'd1': example.DerivedCol,
            'd2': example.DerivedCol_2ndOrder_B,
            'd3': c['raw1'] * c['raw2']
        },
    )
    print(df1)

    """
    shape: (3, 8)
    ┌──────┬──────┬──────┬──────────────┬─────────────────────┬─────┬─────┬─────┐
    │ raw1 ┆ raw2 ┆ raw3 ┆ OverridenCol ┆ DerivedCol_2ndOrder ┆ d1  ┆ d2  ┆ d3  │
    │ ---  ┆ ---  ┆ ---  ┆ ---          ┆ ---                 ┆ --- ┆ --- ┆ --- │
    │ i64  ┆ i64  ┆ i64  ┆ i64          ┆ i64                 ┆ i64 ┆ i64 ┆ i64 │
    ╞══════╪══════╪══════╪══════════════╪═════════════════════╪═════╪═════╪═════╡
    │ 1    ┆ 1    ┆ 1    ┆ 10           ┆ 11                  ┆ 3   ┆ 13  ┆ 1   │
    │ 2    ┆ 2    ┆ 2    ┆ 11           ┆ 13                  ┆ 4   ┆ 15  ┆ 4   │
    │ 3    ┆ 3    ┆ 3    ┆ 12           ┆ 15                  ┆ 5   ┆ 17  ┆ 9   │
    └──────┴──────┴──────┴──────────────┴─────────────────────┴─────┴─────┴─────┘
    """

    # another example, with more params, yielding more implicitly derived columns
    expressions = [
        example.DerivedCol_2ndOrder, example.DerivedCol_2ndOrder_B,
    ]
    df2 = example.select(df, 'raw2', *expressions,
         include_deps=True, # include intermediate dependencies of higher-order nodes as columns in the result DF
         override_existing=True, # override the existing column if a dict key or node name conflicts with a raw input column
    )
    print(df2)

    """
    shape: (3, 5)
    ┌──────┬────────────┬──────────────┬───────────────────────┬─────────────────────┐
    │ raw2 ┆ DerivedCol ┆ OverridenCol ┆ DerivedCol_2ndOrder_B ┆ DerivedCol_2ndOrder │
    │ ---  ┆ ---        ┆ ---          ┆ ---                   ┆ ---                 │
    │ i64  ┆ i64        ┆ i64          ┆ i64                   ┆ i64                 │
    ╞══════╪════════════╪══════════════╪═══════════════════════╪═════════════════════╡
    │ 1    ┆ 3          ┆ 2            ┆ 5                     ┆ 3                   │
    │ 2    ┆ 4          ┆ 3            ┆ 7                     ┆ 5                   │
    │ 3    ┆ 5          ┆ 4            ┆ 9                     ┆ 7                   │
    └──────┴────────────┴──────────────┴───────────────────────┴─────────────────────┘
    """

    # for debugging: examine which derived expressions can be evaluated in parallel at each step
    # (each inner list is one step; later steps depend on columns produced by earlier ones)
    ordered_exprs = example.ordered_exprs(expressions)
    print([[str(e) for e in oe] for oe in ordered_exprs])

    """
    [
        [
            '[(col("raw1")) + (1)].alias("OverridenCol")', 
            '[(col("raw2")) + (2)].alias("DerivedCol")'
        ], [
            '[(col("OverridenCol")) + (col("raw3"))].alias("DerivedCol_2ndOrder")',
            '[(col("OverridenCol")) + (col("DerivedCol"))].alias("DerivedCol_2ndOrder_B")'
        ]
    ]
    """


```

            

Raw data

            {
    "_id": null,
    "home_page": null,
    "name": "polarmints",
    "maintainer": null,
    "docs_url": null,
    "requires_python": ">=3.7",
    "maintainer_email": null,
    "keywords": null,
    "author": "John Smith",
    "author_email": null,
    "download_url": "https://files.pythonhosted.org/packages/66/cc/bc29fb373554419ffe16d7c9aa9c7bf2b5d0798d78ce368da3cedee6945e/polarmints-0.1.24.tar.gz",
    "platform": null,
    "description": "# Polarmints\nSyntactic sugar for [polars](https://docs.pola.rs/user-guide/migration/pandas/) <br>\nApologies, not all features documented so feel free to explore codebase \n\n## Extensions\nextends polars Dataframes with additional namespaces for convenience functions<br>\nexample:\n```python\nimport polars as pl\nfrom polarmints import PolarMints, c, DF\n__all__ = [PolarMints] # required for extending DFs with polarmints, even though not explicitly used  \n\ndf = DF({\n    'a': [1, 2, 3],\n    'b': [1, 2, 3],\n})\n\ndf2 = DF({\n    'a': [1, 2, 3],\n    'c': [1, 2, 3],\n}, schema_overrides={'a': pl.Int16})\n\n# df.pm: convenience helper funcs\njoined = df2.pm.join(df, 'a') # implicitly converts datatypes before joining two DFs whose column types don't match\n\n# this is contrived example since it's more efficient to do in polars: pl.DataFrame.with_column(pl.col('a') + 1) \n# however pandas may have other dataframe and series methods not yet implemented in polars\nadded_col = df.pd.assign(a2=1)\n```\n\n## DAG\nGiven an input _pl.DataFrame_ each _@node_ decorated method on a SubClass of _DagBase_ represents a derived column which could themselves depend on other derived columns. A dag is required to represent this hierarchy of dependencies, i.e. which columns to derive first and which ones can be done in parallel. this framework is inspired by [MDF](https://github.com/man-group/mdf) and the gromit dag in [beacon.io](https://www.beacon.io/) except the nodes represent [polars expressions](https://docs.pola.rs/py-polars/html/reference/expressions/index.html) instead of plain python. 
\n\nExample usage : \n```python\nfrom polarmints.dag.core import DagBase, node, s\nfrom polarmints import c, DF\n\nclass DagExample(DagBase):\n\n    @node\n    def DerivedCol(self):\n        return c['raw2'] + 2\n\n    @node\n    def OverridenCol(self):\n        \"\"\"\n        input column with this name will be overridden by this method if instance is initialized with\n        override_existing=True\n        \"\"\"\n        return c['raw1'] + 1\n\n    @node\n    def DerivedCol_2ndOrder(self):\n        \"\"\"\n        NOTE: 's' and 'c' are effectively the same, 's' is merely for readability to distinguish derived columns (s)\n        from raw inputs (c)\n        \"\"\"\n        return s['OverridenCol'] + c['raw3']\n\n    @node\n    def DerivedCol_2ndOrder_B(self):\n        return s['OverridenCol'] + s['DerivedCol']\n\n\nif __name__ == '__main__':\n    # this is an instance instead of class because some usages may require initializing the dag with instance specific\n    # params when multiple instances are used in the same process.\n    example = DagExample()\n\n    # mock inputs\n    df = DF({\n        'raw1': [1, 2, 3],\n        'raw2': [1, 2, 3],\n        'raw3': [1, 2, 3],\n        'OverridenCol': [10, 11, 12]\n    })\n\n    # select desired derived columns from mock inputs using dag\n    df1 = example.with_cols(df,\n        # func siganture: *args and **kwargs expresisons behave the same way as pl.DataFrame.with_column() and .select()          \n        example.DerivedCol_2ndOrder,\n        example.OverridenCol, #this will not be overridden\n        'raw2',  # can be mixed with raw pl.Exprs that don't depend on the DAG nodes\n        c['raw3'] + 2,\n        \n        **{\n            'd1': example.DerivedCol,\n            'd2': example.DerivedCol_2ndOrder_B,\n            'd3': c['raw1'] * c['raw2']\n        },\n    )\n    print(df1)\n\n    \"\"\"\n    shape: (3, 8)\n    
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2510\n    \u2502 raw1 \u2506 raw2 \u2506 raw3 \u2506 OverridenCol \u2506 DerivedCol_2ndOrder \u2506 d1  \u2506 d2  \u2506 d3  \u2502\n    \u2502 ---  \u2506 ---  \u2506 ---  \u2506 ---          \u2506 ---                 \u2506 --- \u2506 --- \u2506 --- \u2502\n    \u2502 i64  \u2506 i64  \u2506 i64  \u2506 i64          \u2506 i64                 \u2506 i64 \u2506 i64 \u2506 i64 \u2502\n    \u255e\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2561\n    \u2502 1    \u2506 1    \u2506 1    \u2506 10           \u2506 11                  \u2506 3   \u2506 13  \u2506 1   \u2502\n    \u2502 2    \u2506 2    \u2506 2    \u2506 11           \u2506 13                  \u2506 4   \u2506 15  \u2506 4   \u2502\n    \u2502 3    \u2506 3    \u2506 3    \u2506 12           \u2506 15                  \u2506 5   \u2506 17  \u2506 9   \u2502\n    
\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2518\n    \"\"\"\n\n    # another example with more params yielding more implicitly derived columns\n    expressions = [\n        example.DerivedCol_2ndOrder, example.DerivedCol_2ndOrder_B,\n    ]\n    df2 = example.select(df, 'raw2', *expressions,\n         include_deps=True, # include intermediate dependencies as columns in result DF for higher order nodes\n         override_existing=True, # override the existing column if dict key or node name conflicts with raw input column\n    )\n    print(df2)\n\n    \"\"\"\n    shape: (3, 5)\n    \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n    \u2502 raw2 \u2506 DerivedCol \u2506 OverridenCol \u2506 DerivedCol_2ndOrder_B \u2506 DerivedCol_2ndOrder \u2502\n    \u2502 ---  \u2506 ---        \u2506 ---          \u2506 ---                   \u2506 ---                 \u2502\n    \u2502 i64  \u2506 i64        \u2506 i64          \u2506 i64                   \u2506 i64                 \u2502\n    
\u255e\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2561\n    \u2502 1    \u2506 3          \u2506 2            \u2506 5                     \u2506 3                   \u2502\n    \u2502 2    \u2506 4          \u2506 3            \u2506 7                     \u2506 5                   \u2502\n    \u2502 3    \u2506 5          \u2506 4            \u2506 9                     \u2506 7                   \u2502\n    \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n    \"\"\"\n\n    # for debugging: examine which derived expressions can be evaluated in parallel for each step\n    ordered_exprs = example.ordered_exprs(expressions)\n    print([[str(e) for e in oe] for oe in ordered_exprs])\n\n    \"\"\"\n    [\n        [\n            '[(col(\"raw1\")) + (1)].alias(\"OverridenCol\")', \n            '[(col(\"raw2\")) + (2)].alias(\"DerivedCol\")'\n        ], [\n            '[(col(\"OverridenCol\")) + (col(\"raw3\"))].alias(\"DerivedCol_2ndOrder\")',\n            '[(col(\"OverridenCol\")) + (col(\"DerivedCol\"))].alias(\"DerivedCol_2ndOrder_B\")'\n        ]\n    ]\n    \"\"\"\n\n\n```\n",
    "bugtrack_url": null,
    "license": null,
    "summary": "syntactic sugar and additional namespaces for polars",
    "version": "0.1.24",
    "project_urls": null,
    "split_keywords": [],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "4c2f419c21fa106b23b5e5cbcccb2d0657194e2f04647cc34649d7e57d317ceb",
                "md5": "388aaf1b1ba8a10f290978825a31377f",
                "sha256": "fd2a2f0c1bc95a02bcd8b55ac044be141c0a710ecec16ea224ed9ca0e8f2b2d8"
            },
            "downloads": -1,
            "filename": "polarmints-0.1.24-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "388aaf1b1ba8a10f290978825a31377f",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.7",
            "size": 15366,
            "upload_time": "2024-10-04T21:11:54",
            "upload_time_iso_8601": "2024-10-04T21:11:54.112738Z",
            "url": "https://files.pythonhosted.org/packages/4c/2f/419c21fa106b23b5e5cbcccb2d0657194e2f04647cc34649d7e57d317ceb/polarmints-0.1.24-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "66ccbc29fb373554419ffe16d7c9aa9c7bf2b5d0798d78ce368da3cedee6945e",
                "md5": "a52127ea8895013eea7473b7cf8c37b3",
                "sha256": "27fe9b32839a350aa3b42b51ff696bd5c329f2c43b6ffb8754fd30888962f63e"
            },
            "downloads": -1,
            "filename": "polarmints-0.1.24.tar.gz",
            "has_sig": false,
            "md5_digest": "a52127ea8895013eea7473b7cf8c37b3",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.7",
            "size": 12167,
            "upload_time": "2024-10-04T21:11:55",
            "upload_time_iso_8601": "2024-10-04T21:11:55.118890Z",
            "url": "https://files.pythonhosted.org/packages/66/cc/bc29fb373554419ffe16d7c9aa9c7bf2b5d0798d78ce368da3cedee6945e/polarmints-0.1.24.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-10-04 21:11:55",
    "github": false,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "lcname": "polarmints"
}
        
Elapsed time: 0.33362s