# sqlframe

- **Version:** 3.13.0
- **Summary:** Turning PySpark Into a Universal DataFrame API
- **Home page:** https://github.com/eakmanrq/sqlframe
- **Author:** Ryan Eakman
- **Requires Python:** >=3.9
- **License:** MIT
- **Uploaded:** 2024-12-29 17:27:41

<div align="center">
  <img src="https://sqlframe.readthedocs.io/en/stable/docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
</div>

SQLFrame implements the PySpark DataFrame API, enabling transformation pipelines to run directly on database engines - no Spark clusters or dependencies required.

SQLFrame currently supports the following engines (many more in development):

* [BigQuery](https://sqlframe.readthedocs.io/en/stable/bigquery/)
* [DuckDB](https://sqlframe.readthedocs.io/en/stable/duckdb)
* [Postgres](https://sqlframe.readthedocs.io/en/stable/postgres)
* [Snowflake](https://sqlframe.readthedocs.io/en/stable/snowflake)
* [Spark](https://sqlframe.readthedocs.io/en/stable/spark)

There are also two engines in development. These engines lack test coverage and robust documentation, but are available for early testing:

* [Redshift](https://sqlframe.readthedocs.io/en/stable/redshift)
* [Databricks](https://sqlframe.readthedocs.io/en/stable/databricks)

SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine (a minimal sketch follows the link below).

* [Standalone](https://sqlframe.readthedocs.io/en/stable/standalone)
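
Below is a minimal sketch of generating SQL with the Standalone session. It assumes `StandaloneSession` follows the same `builder` pattern as the engine sessions shown later in this README:

```python
from sqlframe.standalone import StandaloneSession

session = StandaloneSession.builder.getOrCreate()

# Build a DataFrame from local rows; no engine connection is made.
df = (
    session.createDataFrame([(1, "a"), (2, "b")], schema=["id", "name"])
    .groupBy("name")
    .count()
)

# Render the SQL that would run on an engine, instead of executing it.
print(df.sql())
```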

SQLFrame is great for:

* Users who want a DataFrame API that leverages the full power of their engine to do the processing
* Users who want to quickly run PySpark code locally without the overhead of starting a Spark session
* Users who want a SQL representation of their DataFrame code for debugging or sharing with others
* Users who want to run PySpark DataFrame code without the complexity of using Spark for processing

## Installation

```bash
# BigQuery
pip install "sqlframe[bigquery]"
# DuckDB
pip install "sqlframe[duckdb]"
# Postgres
pip install "sqlframe[postgres]"
# Snowflake
pip install "sqlframe[snowflake]"
# Spark
pip install "sqlframe[spark]"
# Redshift (in development)
pip install "sqlframe[redshift]"
# Databricks (in development)
pip install "sqlframe[databricks]"
# Standalone
pip install sqlframe
```

See specific engine documentation for additional setup instructions.

## Configuration

SQLFrame generates SQL that is consistently accurate but can be complex.
Calling `df.sql(optimize=True)` produces more human-readable SQL.
For details on configuring this output, including leveraging OpenAI to enhance the SQL, see [Generated SQL Configuration](https://sqlframe.readthedocs.io/en/stable/configuration/#generated-sql).
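
For example, given a SQLFrame DataFrame `df` (such as the one built in the Example Usage section below), the two renderings can be compared. This is a minimal sketch; only `optimize=True` appears in this README, so `optimize=False` is an assumption worth verifying against the docs:

```python
# Verbose SQL, close to the literal chain of DataFrame operations
# (assumes optimize=False is accepted).
print(df.sql(optimize=False))

# Simplified, more human-readable SQL.
print(df.sql(optimize=True))
```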

SQLFrame by default uses the Spark dialect for input and output.
This can be changed to make SQLFrame feel more like a native DataFrame API for the engine you are using.
See [Input and Output Dialect Configuration](https://sqlframe.readthedocs.io/en/stable/configuration/#input-and-output-dialect).
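
As a rough illustration only (the config keys `sqlframe.input.dialect` and `sqlframe.output.dialect` are assumptions; confirm the exact names in the linked configuration docs), a session might be configured like this:

```python
from sqlframe.duckdb import DuckDBSession

# ASSUMPTION: the key names below are illustrative, not confirmed by this
# README; see the Input and Output Dialect Configuration docs linked above.
session = (
    DuckDBSession.builder
    .config("sqlframe.input.dialect", "duckdb")
    .config("sqlframe.output.dialect", "duckdb")
    .getOrCreate()
)
```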

## Activating SQLFrame

SQLFrame can either replace pyspark imports or be used alongside them.
To replace pyspark imports, use the [activate function](https://sqlframe.readthedocs.io/en/stable/configuration/#activating-sqlframe) to specify the engine to use.

```python
from sqlframe import activate

# Activate SQLFrame to run directly on DuckDB
activate(engine="duckdb")

from pyspark.sql import SparkSession
session = SparkSession.builder.getOrCreate()
```
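
Once activated, subsequent `pyspark.sql` imports (such as `functions` and `Window`) resolve to SQLFrame's implementations, so existing PySpark pipelines can run against the chosen engine unchanged.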

SQLFrame can also be imported directly, which leaves pyspark imports untouched while providing a more engine-native DataFrame API:

```python
from sqlframe.duckdb import DuckDBSession

session = DuckDBSession.builder.getOrCreate()
```
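
Either way, the resulting session can be used like a PySpark `SparkSession`, with execution pushed down to DuckDB. A small sketch using only the PySpark-compatible API (an in-memory DuckDB database is assumed as the default):

```python
from sqlframe.duckdb import DuckDBSession

session = DuckDBSession.builder.getOrCreate()

# createDataFrame mirrors the PySpark API; the work runs in DuckDB.
df = session.createDataFrame(
    [(1, "a"), (2, "b"), (2, "c")], schema=["id", "name"]
)
df.groupBy("id").count().show()
```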

## Example Usage

```python
from sqlframe import activate

# Activate SQLFrame to run directly on BigQuery
activate(engine="bigquery")

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

session = SparkSession.builder.getOrCreate()
table_path = '"bigquery-public-data".samples.natality'
# Top 5 years with the greatest year-over-year % change in new families with single child
df = (
  session.table(table_path)
  .where(F.col("ever_born") == 1)
  .groupBy("year")
  .agg(F.count("*").alias("num_single_child_families"))
  .withColumn(
    "last_year_num_single_child_families",
    F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
  )
  .withColumn(
    "percent_change",
    (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
    / F.col("last_year_num_single_child_families")
  )
  .orderBy(F.abs(F.col("percent_change")).desc())
  .select(
    F.col("year").alias("year"),
    F.format_number("num_single_child_families", 0).alias("new families single child"),
    F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
  )
  .limit(5)
)
```
```python
>>> df.sql(optimize=True)
WITH `t94228` AS (
  SELECT
    `natality`.`year` AS `year`,
    COUNT(*) AS `num_single_child_families`
  FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
  WHERE
    `natality`.`ever_born` = 1
  GROUP BY
    `natality`.`year`
), `t39093` AS (
  SELECT
    `t94228`.`year` AS `year`,
    `t94228`.`num_single_child_families` AS `num_single_child_families`,
    LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
  FROM `t94228` AS `t94228`
)
SELECT
  `t39093`.`year` AS `year`,
  FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `new families single child`,
  FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
FROM `t39093` AS `t39093`
ORDER BY
  ABS(`percent_change`) DESC
LIMIT 5
```
```python
>>> df.show()
+------+---------------------------+----------------+
| year | new families single child | percent change |
+------+---------------------------+----------------+
| 1989 |         1,650,246         |     25.02      |
| 1974 |          783,448          |     14.49      |
| 1977 |         1,057,379         |     11.38      |
| 1985 |         1,308,476         |     11.15      |
| 1975 |          868,985          |     10.92      |
+------+---------------------------+----------------+
```