polars-ds 0.6.2 (PyPI), uploaded 2024-11-09 21:29:18, requires Python >=3.9
Author: Tianren Qin. Keywords: polars-extension, scientific-computing, data-science.

            <h1 align="center">
  <b>Polars for Data Science</b>
  <br>
</h1>

<p align="center">
  <a href="https://discord.gg/VaYJrrwbNU">Discord</a>
  |
  <a href="https://polars-ds-extension.readthedocs.io/en/latest/">Documentation</a>
  |
  <a href="https://github.com/abstractqqq/polars_ds_extension/blob/main/examples/basics.ipynb">User Guide</a>
  |
  <a href="https://github.com/abstractqqq/polars_ds_extension/blob/main/CONTRIBUTING.md">Want to Contribute?</a>
<br>
<b>pip install polars-ds</b>
</p>

# PDS (polars_ds)

PDS is a modern data science package that

1. is fast and furious
2. is small and lean, with minimal dependencies
3. has an intuitive and concise API (if you know Polars already)
4. has a dataframe-friendly design
5. and covers a wide variety of data science topics, such as simple statistics, linear regression, string edit distances, tabular data transforms, feature extraction, traditional modelling pipelines, and model evaluation metrics.

It stands on the shoulders of the great **Polars** dataframe. See the [examples](./examples/basics.ipynb) for more. Here are some highlights!

```python
import polars as pl
import polars_ds as pds
# Parallel evaluation of multiple ML metrics on different segments of data.
# df is assumed to be a Polars DataFrame with columns "segments", "actual", "predicted".
df.lazy().group_by("segments").agg(
    pds.query_roc_auc("actual", "predicted").alias("roc_auc"),
    pds.query_log_loss("actual", "predicted").alias("log_loss"),
).collect()

shape: (2, 3)
┌──────────┬──────────┬──────────┐
│ segments ┆ roc_auc  ┆ log_loss │
│ ---      ┆ ---      ┆ ---      │
│ str      ┆ f64      ┆ f64      │
╞══════════╪══════════╪══════════╡
│ a        ┆ 0.497745 ┆ 1.006438 │
│ b        ┆ 0.498801 ┆ 0.997226 │
└──────────┴──────────┴──────────┘
```

Tabular Machine Learning Data Transformation Pipeline

```Python
import polars as pl
import polars.selectors as cs
from polars_ds.pipeline import Pipeline, Blueprint

bp = (
    # If we specify a target, the target column will be excluded from all transformations.
    Blueprint(df, name = "example", target = "approved") 
    .lowercase() # lowercase all columns
    .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"]))
    .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") 
    .impute(["existing_emi"], method = "median")
    .append_expr( # generate some features
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
        pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform
    )
    .scale( 
        cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard"
    ) # Scale the columns up to this point. The columns below won't be scaled
    .append_expr( # Add missing flags
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing")
    )
    .one_hot_encode("gender", drop_first=True)
    .woe_encode("city_category")
    .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # another target-based encoding, like woe_encode above
)

pipe: Pipeline = bp.materialize()
# Check out the result in our example notebooks! (examples/pipeline.ipynb)
df_transformed = pipe.transform(df)
df_transformed.head()
```

Get all neighbors within radius r for each point, call them best friends, and count them

```python
df.select(
    pl.col("id"),
    pds.query_radius_ptwise(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in 3d space
        index = pl.col("id"),
        r = 0.1, 
        dist = "sql2", # squared l2
        parallel = True
    ).alias("best friends"),
).with_columns( # -1 to remove the point itself
    (pl.col("best friends").list.len() - 1).alias("best friends count")
).head()

shape: (5, 3)
┌─────┬───────────────────┬────────────────────┐
│ id  ┆ best friends      ┆ best friends count │
│ --- ┆ ---               ┆ ---                │
│ u32 ┆ list[u32]         ┆ u32                │
╞═════╪═══════════════════╪════════════════════╡
│ 0   ┆ [0, 811, … 1435]  ┆ 152                │
│ 1   ┆ [1, 953, … 1723]  ┆ 159                │
│ 2   ┆ [2, 355, … 835]   ┆ 243                │
│ 3   ┆ [3, 102, … 1129]  ┆ 110                │
│ 4   ┆ [4, 1280, … 1543] ┆ 226                │
└─────┴───────────────────┴────────────────────┘
```

Run a linear regression on each category:

```Python
df = pds.random_data(size=5_000, n_cols=0).select(
    pds.random(0.0, 1.0).alias("x1"),
    pds.random(0.0, 1.0).alias("x2"),
    pds.random(0.0, 1.0).alias("x3"),
    pds.random_int(0, 3).alias("categories")
).with_columns(
    y = pl.col("x1") * 0.5 + pl.col("x2") * 0.25 - pl.col("x3") * 0.15 + pds.random() * 0.0001
)

df.group_by("categories").agg(
    pds.query_lstsq(
        "x1", "x2", "x3", 
        target = "y",
        method = "l2",
        l2_reg = 0.05,
        add_bias = False
    ).alias("coeffs")
) 

shape: (3, 2)
┌────────────┬─────────────────────────────────┐
│ categories ┆ coeffs                          │
│ ---        ┆ ---                             │
│ i32        ┆ list[f64]                       │
╞════════════╪═════════════════════════════════╡
│ 0          ┆ [0.499912, 0.250005, -0.149846… │
│ 1          ┆ [0.499922, 0.250004, -0.149856… │
│ 2          ┆ [0.499923, 0.250004, -0.149855… │
└────────────┴─────────────────────────────────┘
```

Various String Edit distances

```Python
df.select( # Compare column "word" to the literal strings given in pl.lit(); column-vs-column comparison is also supported
    pds.str_leven("word", pl.lit("asasasa"), return_sim=True).alias("Levenshtein"),
    pds.str_osa("word", pl.lit("apples"), return_sim=True).alias("Optimal String Alignment"),
    pds.str_jw("word", pl.lit("apples")).alias("Jaro-Winkler"),
)
```

In-dataframe statistical tests

```Python
df.group_by("market_id").agg(
    pds.ttest_ind("var1", "var2", equal_var=False).alias("t-test"),
    pds.chi2("category_1", "category_2").alias("chi2-test"),
    pds.f_test("var1", group = "category_1").alias("f-test")
)

shape: (3, 4)
┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐
│ market_id ┆ t-test               ┆ chi2-test            ┆ f-test              │
│ ---       ┆ ---                  ┆ ---                  ┆ ---                 │
│ i64       ┆ struct[2]            ┆ struct[2]            ┆ struct[2]           │
╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡
│ 0         ┆ {2.072749,0.038272}  ┆ {33.487634,0.588673} ┆ {0.312367,0.869842} │
│ 1         ┆ {0.469946,0.638424}  ┆ {42.672477,0.206119} ┆ {2.148937,0.072536} │
│ 2         ┆ {-1.175325,0.239949} ┆ {28.55723,0.806758}  ┆ {0.506678,0.730849} │
└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘
```

Multiple Convolutions at once!

```Python
# Multiple Convolutions at once
# Modes: `same`, `left` (left-aligned same), `right` (right-aligned same), `valid` or `full`
# Method: `fft`, `direct`
# Currently slower than SciPy but provides parallelism because of Polars
df.select(
    pds.convolve("f", [-1, 0, 0, 0, 1], mode = "full", method = "fft"), # column f with the kernel given here
    pds.convolve("a", [-1, 0, 0, 0, 1], mode = "full", method = "direct"),
    pds.convolve("b", [-1, 0, 0, 0, 1], mode = "full", method = "direct"),
).head()
```

And more!

## Getting Started

```python
import polars_ds as pds
```

To make full use of the Diagnosis module, install the plotting extra:

```bash
pip install "polars_ds[plot]"
```
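
To sanity-check the install, here is a minimal sketch built only from functions shown elsewhere in this README (`pds.random_data`, `pds.random`, `pds.random_int`, `pds.query_roc_auc`):

```python
import polars_ds as pds

# Build a small random frame: a score column and a binary label
df = pds.random_data(size=1_000, n_cols=0).select(
    pds.random(0.0, 1.0).alias("predicted"),
    pds.random_int(0, 2).alias("actual"),
)

# Evaluate a metric as a single expression
df.select(pds.query_roc_auc("actual", "predicted").alias("roc_auc"))
```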

## How Fast is it?

Feel free to take a look at our [benchmark notebook](./benchmarks/benchmarks.ipynb)!

Generally speaking, the more expressions you evaluate simultaneously, the larger the speed advantage of Polars + PDS over Pandas + (SciPy / Sklearn / NumPy). And the more CPU cores your machine has, the bigger that difference will be in favor of Polars + PDS.
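
For instance, scoring several model outputs is a single parallel query in PDS, where the Pandas equivalent would be a sequence of separate metric calls. A sketch (the frame and its column names are made up for illustration):

```python
import polars_ds as pds

# Hypothetical frame: a binary label and three model score columns
df = pds.random_data(size=100_000, n_cols=0).select(
    pds.random_int(0, 2).alias("y"),
    pds.random(0.0, 1.0).alias("pred_a"),
    pds.random(0.0, 1.0).alias("pred_b"),
    pds.random(0.0, 1.0).alias("pred_c"),
)

preds = ["pred_a", "pred_b", "pred_c"]
# All six metric expressions run in one parallel pass over the data,
# rather than six separate SciPy / Sklearn calls
df.lazy().select(
    *(pds.query_roc_auc("y", p).alias(f"roc_auc_{p}") for p in preds),
    *(pds.query_log_loss("y", p).alias(f"log_loss_{p}") for p in preds),
).collect()
```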

Why does speed matter? 

If your code already executes in under 1s, then maybe it doesn't. But as your data grows, the difference between a 5s run and a 1s run adds up quickly over the many iterations of a project. Execution speed becomes an even bigger issue if you are building reports on demand, or if you pay extra for additional compute.

## HELP WANTED!

1. Documentation writing, doc review, and benchmark preparation

## Road Map

1. K-means, K-medoids clustering as expressions and also standalone modules.
2. Other improvement items. See issues.

# Disclaimer

**Currently in Beta. Feel free to submit feature requests in the issues section of the repo. This library will depend only on Python Polars (for most of its core) and will try to stay as stable as possible for polars>=1 (it currently supports polars>=0.20.16, but that support will be dropped soon). Exceptions will be made when a Polars update forces changes in the plugins.**

This package is not tested with Polars streaming mode and is not designed to work with data so big that it has to be streamed.

# Credits

1. The Rust Snowball Stemmer is taken from Tsoding's Seroost project (MIT). See [here](https://github.com/tsoding/seroost)
2. Some statistics functions are taken from Statrs (MIT) and internalized. See [here](https://github.com/statrs-dev/statrs/tree/master)
3. Linear algebra routines are powered partly by [faer](https://crates.io/crates/faer)
4. String similarity metrics are soooo fast because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs)

# Other related Projects

1. Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime)