|RTD| |License|
Welcome to Spooq!
=================
Spooq is your **PySpark** based helper library for ETL data ingestion pipelines in Data Lakes.
The main components are:
* Extractors
* Transformers
* Loaders
Those components are independent and can be used separately or be plugged-in into a pipeline instance.
You can also use the custom functions from the Mapper transformer directly with PySpark (e.g. ``select`` or ``withColumn``).
Example of Mapper Transformer
=============================
.. code-block:: python
from pyspark.sql import Row
from pyspark.sql import functions as F, types as T
from spooq.transformer import Mapper
from spooq.transformer import mapper_transformations as spq
input_df = spark.createDataFrame(
[
Row(
struct_a=Row(idx="000_123_456", sts="enabled", ts="1597069446000"),
struct_b=Row(itms="1,2,4", sts="whitelisted", ts="2020-08-12T12:43:14+0000"),
struct_c=Row(email="abc@def.com", gndr="F", dt="2020-08-05", cmt="fine"),
),
Row(
struct_a=Row(idx="000_654_321", sts="off", ts="1597069500784"),
struct_b=Row(itms="5", sts="blacklisted", ts="2020-07-01T12:43:14+0000"),
struct_c=Row(email="", gndr="m", dt="2020-06-27", cmt="faulty"),
),
],
schema="""
a: struct<idx string, sts string, ts string>,
b: struct<itms string, sts string, ts string>,
c: struct<email string, gndr string, dt string, cmt string>
"""
)
input_df.printSchema()
root
|-- a: struct (nullable = true)
| |-- idx: string (nullable = true)
| |-- sts: string (nullable = true)
| |-- ts: string (nullable = true)
|-- b: struct (nullable = true)
| |-- itms: string (nullable = true)
| |-- sts: string (nullable = true)
| |-- ts: string (nullable = true)
|-- c: struct (nullable = true)
| |-- email: string (nullable = true)
| |-- gndr: string (nullable = true)
| |-- dt: string (nullable = true)
| |-- cmt: string (nullable = true)
mapping = [
# output_name # source # transformation
("index", "a.idx", spq.to_int), # removes leading zeros and underline characters
("is_enabled", "a.sts", spq.to_bool), # recognizes additional words like "on", "off", "disabled", "enabled", ...
("a_updated_at", "a.ts", spq.to_timestamp), # supports unix timestamps in ms or seconds and strings
("items", "b.itms", spq.str_to_array(cast="int")), # splits a comma delimited string into an array and casts its elements
("block_status", "b.sts", spq.map_values(mapping={"whitelisted": "allowed", "blacklisted": "blocked"})), # applies lookup dictionary
("b_updated_at", "b.ts", spq.to_timestamp), # supports unix timestamps in ms or seconds and strings
("has_email", "c.email", spq.has_value), # interprets also empty strings as no value, although, zeros are values
("gender", "c.gndr", spq.apply(func=F.lower)), # applies provided function to all values
("creation_date", "c.dt", spq.to_timestamp(cast="date")), # explicitly casts result after transformation
("processed_at", F.current_timestamp(), spq.as_is), # source column is a function, no transformation to the results
("comment", "c.cmt", "string"), # no transformation, only cast; alternatively: spq.to_str or spq.as_is(cast="string")
]
output_df = Mapper(mapping).transform(input_df)
output_df.show(truncate=False)
+------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+
|index |is_enabled|a_updated_at |items |block_status|b_updated_at |has_email|gender|creation_date|processed_at |comment|
+------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+
|123456|true |2020-08-10 16:24:06 |[1, 2, 4]|allowed |2020-08-12 14:43:14|true |f |2020-08-05 |2022-08-12 09:17:09.83|fine |
|654321|false |2020-08-10 16:25:00.784|[5] |blocked |2020-07-01 14:43:14|false |m |2020-06-27 |2022-08-12 09:17:09.83|faulty |
+------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+
output_df.printSchema()
root
|-- index: integer (nullable = true)
|-- is_enabled: boolean (nullable = true)
|-- a_updated_at: timestamp (nullable = true)
|-- items: array (nullable = true)
| |-- element: integer (containsNull = true)
|-- block_status: string (nullable = true)
|-- b_updated_at: timestamp (nullable = true)
|-- has_email: boolean (nullable = false)
|-- gender: string (nullable = true)
|-- creation_date: date (nullable = true)
|-- processed_at: timestamp (nullable = false)
|-- comment: string (nullable = true)
Features / Components
=====================
`Transformers <https://spooq.readthedocs.io/en/latest/transformer/overview.html>`_
----------------------------------------------------------------------------------
* `Custom Mapping Transformations <https://spooq.readthedocs.io/en/latest/transformer/mapper.html#module-spooq.transformer.mapper_transformations>`_
* `Exploder <https://spooq.readthedocs.io/en/latest/transformer/exploder.html>`_
* `Filter <https://spooq.readthedocs.io/en/latest/transformer/sieve.html>`_
* `Mapper (Restructuring of complex DataFrames) <https://spooq.readthedocs.io/en/latest/transformer/mapper.html>`_
* `Threshold-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/threshold_cleaner.html>`_
* `Enumeration-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/enum_cleaner.html>`_
* `Newest by Group (Most current record per ID) <https://spooq.readthedocs.io/en/latest/transformer/newest_by_group.html>`_
`Extractors <https://spooq.readthedocs.io/en/latest/extractor/overview.html>`_
----------------------------------------------------------------------------------
* `JSON Files <https://spooq.readthedocs.io/en/latest/extractor/json.html>`_
* `JDBC Source <https://spooq.readthedocs.io/en/latest/extractor/jdbc.html>`_
`Loaders <https://spooq.readthedocs.io/en/latest/loader/overview.html>`_
----------------------------------------------------------------------------------
* `Hive Database <https://spooq.readthedocs.io/en/latest/loader/hive_loader.html>`_
Installation
============
.. code-block:: bash
pip install spooq
Online Documentation
=====================
For more details, please consult the online documentation at |Onlinedocs|
Changelog
============
|Changelog|
Contribution
============
Please see |Contribute| for more information.
License
=========
This library is licensed under the |License|
-------------------------------------------------------------------------------------------------------------
.. |RTD| image:: https://readthedocs.org/projects/spooq/badge/?version=latest
:target: https://spooq.readthedocs.io/en/latest/?badge=latest
:alt: Documentation Status
.. |Onlinedocs| image:: https://about.readthedocs.com/theme/img/logo-wordmark-dark.svg
:target: https://spooq.readthedocs.io/
:alt: Online Documentation
:width: 120px
.. |License| image:: https://img.shields.io/badge/license-MIT-blue.svg
:target: https://github.com/Breaka84/Spooq/blob/master/LICENSE
:alt: Project License
.. |Changelog| image:: https://img.shields.io/badge/CHANGELOG-8A2BE2
:target: https://spooq.readthedocs.io/en/latest/changelog.html
:alt: Changelog
.. |Contribute| image:: https://img.shields.io/badge/CONTRIBUTING-8A2BE2
:target: https://spooq.readthedocs.io/en/latest/contribute.html
:alt: Contribute
Raw data
{
"_id": null,
"home_page": "https://github.com/Breaka84/Spooq",
"name": "Spooq",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": "spooq, spark, hive, cloudera, hadoop, etl, data ingestion, data wrangling, databricks, big data, batch, streaming, data engineering",
"author": "David Hohensinn",
"author_email": "breaka@gmx.at",
"download_url": "https://files.pythonhosted.org/packages/da/bd/fd61fc4440d0e8d8836df7023a939d28ad520f8eaaebd3afbf755db060c1/spooq-3.4.2.tar.gz",
"platform": null,
"description": "|RTD| |License|\n\nWelcome to Spooq!\n=================\n\nSpooq is your **PySpark** based helper library for ETL data ingestion pipeline in Data Lakes.\n\nThe main components are:\n * Extractors\n * Transformers\n * Loaders\n\nThose components are independent and can be used separately or be plugged-in into a pipeline instance.\nYou can also use the custom functions from the Mapper transformer directly with PySpark (f.e. ``select`` or ``withColumn``).\n\nExample of Mapper Transformer\n=============================\n\n.. code-block:: python\n\n from pyspark.sql import Row\n from pyspark.sql import functions as F, types as T\n from spooq.transformer import Mapper\n from spooq.transformer import mapper_transformations as spq\n\n input_df = spark.createDataFrame(\n [\n Row(\n struct_a=Row(idx=\"000_123_456\", sts=\"enabled\", ts=\"1597069446000\"),\n struct_b=Row(itms=\"1,2,4\", sts=\"whitelisted\", ts=\"2020-08-12T12:43:14+0000\"),\n struct_c=Row(email=\"abc@def.com\", gndr=\"F\", dt=\"2020-08-05\", cmt=\"fine\"),\n ),\n Row(\n struct_a=Row(idx=\"000_654_321\", sts=\"off\", ts=\"1597069500784\"),\n struct_b=Row(itms=\"5\", sts=\"blacklisted\", ts=\"2020-07-01T12:43:14+0000\"),\n struct_c=Row(email=\"\", gndr=\"m\", dt=\"2020-06-27\", cmt=\"faulty\"),\n ),\n ],\n schema=\"\"\"\n a: struct<idx string, sts string, ts string>,\n b: struct<itms string, sts string, ts string>,\n c: struct<email string, gndr string, dt string, cmt string>\n \"\"\"\n )\n input_df.printSchema()\n root\n |-- a: struct (nullable = true)\n | |-- idx: string (nullable = true)\n | |-- sts: string (nullable = true)\n | |-- ts: string (nullable = true)\n |-- b: struct (nullable = true)\n | |-- itms: string (nullable = true)\n | |-- sts: string (nullable = true)\n | |-- ts: string (nullable = true)\n |-- c: struct (nullable = true)\n | |-- email: string (nullable = true)\n | |-- gndr: string (nullable = true)\n | |-- dt: string (nullable = true)\n | |-- cmt: string (nullable = 
true)\n\n\n mapping = [\n # output_name # source # transformation\n (\"index\", \"a.idx\", spq.to_int), # removes leading zeros and underline characters\n (\"is_enabled\", \"a.sts\", spq.to_bool), # recognizes additional words like \"on\", \"off\", \"disabled\", \"enabled\", ...\n (\"a_updated_at\", \"a.ts\", spq.to_timestamp), # supports unix timestamps in ms or seconds and strings\n (\"items\", \"b.itms\", spq.str_to_array(cast=\"int\")), # splits a comma delimited string into an array and casts its elements\n (\"block_status\", \"b.sts\", spq.map_values(mapping={\"whitelisted\": \"allowed\", \"blacklisted\": \"blocked\"})), # applies lookup dictionary\n (\"b_updated_at\", \"b.ts\", spq.to_timestamp), # supports unix timestamps in ms or seconds and strings\n (\"has_email\", \"c.email\", spq.has_value), # interprets also empty strings as no value, although, zeros are values\n (\"gender\", \"c.gndr\", spq.apply(func=F.lower)), # applies provided function to all values\n (\"creation_date\", \"c.dt\", spq.to_timestamp(cast=\"date\")), # explicitly casts result after transformation\n (\"processed_at\", F.current_timestamp(), spq.as_is), # source column is a function, no transformation to the results\n (\"comment\", \"c.cmt\", \"string\"), # no transformation, only cast; alternatively: spq.to_str or spq.as_is(cast=\"string\")\n ]\n output_df = Mapper(mapping).transform(input_df)\n\n output_df.show(truncate=False)\n +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+\n |index |is_enabled|a_updated_at |items |block_status|b_updated_at |has_email|gender|creation_date|processed_at |comment|\n +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+\n |123456|true |2020-08-10 16:24:06 |[1, 2, 4]|allowed |2020-08-12 14:43:14|true |f |2020-08-05 |2022-08-12 09:17:09.83|fine |\n 
|654321|false |2020-08-10 16:25:00.784|[5] |blocked |2020-07-01 14:43:14|false |m |2020-06-27 |2022-08-12 09:17:09.83|faulty |\n +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+\n\n\n output_df.printSchema()\n root\n |-- index: integer (nullable = true)\n |-- is_enabled: boolean (nullable = true)\n |-- a_updated_at: timestamp (nullable = true)\n |-- items: array (nullable = true)\n | |-- element: integer (containsNull = true)\n |-- block_status: string (nullable = true)\n |-- b_updated_at: timestamp (nullable = true)\n |-- has_email: boolean (nullable = false)\n |-- gender: string (nullable = true)\n |-- creation_date: date (nullable = true)\n |-- processed_at: timestamp (nullable = false)\n |-- comment: string (nullable = true)\n\nFeatures / Components\n=====================\n\n`Transformers <https://spooq.readthedocs.io/en/latest/transformer/overview.html>`_\n----------------------------------------------------------------------------------\n\n* `Custom Mapping Transformations <https://spooq.readthedocs.io/en/latest/transformer/mapper.html#module-spooq.transformer.mapper_transformations>`_\n* `Exploder <https://spooq.readthedocs.io/en/latest/transformer/exploder.html>`_\n* `Filter <https://spooq.readthedocs.io/en/latest/transformer/sieve.html>`_\n* `Mapper (Restructuring of complex DataFrames) <https://spooq.readthedocs.io/en/latest/transformer/mapper.html>`_\n* `Threshold-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/threshold_cleaner.html>`_\n* `Enumeration-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/enum_cleaner.html>`_\n* `Newest by Group (Most current record per ID) <https://spooq.readthedocs.io/en/latest/transformer/newest_by_group.html>`_\n\n\n`Extractors <https://spooq.readthedocs.io/en/latest/extractor/overview.html>`_\n----------------------------------------------------------------------------------\n \n* `JSON 
Files <https://spooq.readthedocs.io/en/latest/extractor/json.html>`_\n* `JDBC Source <https://spooq.readthedocs.io/en/latest/extractor/jdbc.html>`_\n\n`Loaders <https://spooq.readthedocs.io/en/latest/loader/overview.html>`_\n----------------------------------------------------------------------------------\n \n* `Hive Database <https://spooq.readthedocs.io/en/latest/loader/hive_loader.html>`_\n\nInstallation\n============\n\n.. code-block:: python\n\n pip install spooq\n\n\nOnline Documentation\n=====================\n\nFor a more details please consult the online documentation at |Onlinedocs|\n\nChangelog\n============\n\n|Changelog|\n\nContribution\n============\n\nPlease see |Contribute| for more information.\n\nLicense\n=========\n\nThis library is licensed under the |License|\n\n-------------------------------------------------------------------------------------------------------------\n\n.. |RTD| image:: https://readthedocs.org/projects/spooq/badge/?version=latest\n :target: https://spooq.readthedocs.io/en/latest/?badge=latest\n :alt: Documentation Status\n\n.. |Onlinedocs| image:: https://about.readthedocs.com/theme/img/logo-wordmark-dark.svg\n :target: https://spooq.readthedocs.io/\n :alt: Online Documentation\n :width: 120px\n\n.. |License| image:: https://img.shields.io/badge/license-MIT-blue.svg\n :target: https://github.com/Breaka84/Spooq/blob/master/LICENSE\n :alt: Project License\n\n.. |Changelog| image:: https://img.shields.io/badge/CHANGELOG-8A2BE2\n :target: https://spooq.readthedocs.io/en/latest/changelog.html\n :alt: Changelog\n\n.. |Contribute| image:: https://img.shields.io/badge/CONTRIBUTING-8A2BE2\n :target: https://spooq.readthedocs.io/en/latest/contribute.html\n :alt: Contribute\n",
"bugtrack_url": null,
"license": null,
"summary": "Spooq is a PySpark based helper library for ETL data ingestion pipeline in Data Lakes.",
"version": "3.4.2",
"project_urls": {
"Documentation": "https://spooq.readthedocs.io",
"Homepage": "https://github.com/Breaka84/Spooq"
},
"split_keywords": [
"spooq",
" spark",
" hive",
" cloudera",
" hadoop",
" etl",
" data ingestion",
" data wrangling",
" databricks",
" big data",
" batch",
" streaming",
" data engineering"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "3f09c1661bfc62c5231f6584b8130b31832d7167a6d13ddd80417b497ca73eb5",
"md5": "4352cf06c6750802b87a263eb1c76127",
"sha256": "483f5bd3814e7723fcb9da0f497f0a1fb45cbf43987f8627b33d73e27c2bcc7e"
},
"downloads": -1,
"filename": "Spooq-3.4.2-py3-none-any.whl",
"has_sig": false,
"md5_digest": "4352cf06c6750802b87a263eb1c76127",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 54572,
"upload_time": "2024-08-08T13:28:22",
"upload_time_iso_8601": "2024-08-08T13:28:22.645284Z",
"url": "https://files.pythonhosted.org/packages/3f/09/c1661bfc62c5231f6584b8130b31832d7167a6d13ddd80417b497ca73eb5/Spooq-3.4.2-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "dabdfd61fc4440d0e8d8836df7023a939d28ad520f8eaaebd3afbf755db060c1",
"md5": "409df4df89ddac63b24c18d83b13204b",
"sha256": "46b1bfff8aeb7bc72a7f4d7dd6163330e319b5da33829590e1e5dfc0e6723856"
},
"downloads": -1,
"filename": "spooq-3.4.2.tar.gz",
"has_sig": false,
"md5_digest": "409df4df89ddac63b24c18d83b13204b",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 43470,
"upload_time": "2024-08-08T13:28:24",
"upload_time_iso_8601": "2024-08-08T13:28:24.332836Z",
"url": "https://files.pythonhosted.org/packages/da/bd/fd61fc4440d0e8d8836df7023a939d28ad520f8eaaebd3afbf755db060c1/spooq-3.4.2.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-08-08 13:28:24",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "Breaka84",
"github_project": "Spooq",
"travis_ci": false,
"coveralls": false,
"github_actions": true,
"requirements": [
{
"name": "pandas",
"specs": []
},
{
"name": "future",
"specs": []
}
],
"lcname": "spooq"
}