Spooq


NameSpooq JSON
Version 3.4.0 PyPI version JSON
download
home_pagehttps://github.com/Breaka84/Spooq
SummarySpooq is a PySpark based helper library for ETL data ingestion pipeline in Data Lakes.
upload_time2024-03-20 09:36:41
maintainerNone
docs_urlNone
authorDavid Hohensinn
requires_pythonNone
licenseNone
keywords spooq spark hive cloudera hadoop etl data ingestion data wrangling databricks big data batch streaming data engineering
VCS
bugtrack_url
requirements pandas future
Travis-CI No Travis.
coveralls test coverage No coveralls.
            |RTD| |License|

Welcome to Spooq!
=================

Spooq is your **PySpark** based helper library for ETL data ingestion pipelines in Data Lakes.

The main components are:

* Extractors
* Transformers
* Loaders

Those components are independent and can be used separately or be plugged into a pipeline instance.
You can also use the custom functions from the Mapper transformer directly with PySpark (e.g. ``select`` or ``withColumn``).

Example of Mapper Transformer
=============================

.. code-block:: python

   from pyspark.sql import Row
   from pyspark.sql import functions as F, types as T
   from spooq.transformer import Mapper
   from spooq.transformer import mapper_transformations as spq

   input_df = spark.createDataFrame(
      [
         Row(
            struct_a=Row(idx="000_123_456", sts="enabled", ts="1597069446000"),
            struct_b=Row(itms="1,2,4", sts="whitelisted", ts="2020-08-12T12:43:14+0000"),
            struct_c=Row(email="abc@def.com", gndr="F", dt="2020-08-05", cmt="fine"),
         ),
         Row(
            struct_a=Row(idx="000_654_321", sts="off", ts="1597069500784"),
            struct_b=Row(itms="5", sts="blacklisted", ts="2020-07-01T12:43:14+0000"),
            struct_c=Row(email="", gndr="m", dt="2020-06-27", cmt="faulty"),
         ),
      ],
      schema="""
         a: struct<idx string, sts string, ts string>,
         b: struct<itms string, sts string, ts string>,
         c: struct<email string, gndr string, dt string, cmt string>
      """
   )
   input_df.printSchema()
   root
    |-- a: struct (nullable = true)
    |    |-- idx: string (nullable = true)
    |    |-- sts: string (nullable = true)
    |    |-- ts: string (nullable = true)
    |-- b: struct (nullable = true)
    |    |-- itms: string (nullable = true)
    |    |-- sts: string (nullable = true)
    |    |-- ts: string (nullable = true)
    |-- c: struct (nullable = true)
    |    |-- email: string (nullable = true)
    |    |-- gndr: string (nullable = true)
    |    |-- dt: string (nullable = true)
    |    |-- cmt: string (nullable = true)


   mapping = [
       # output_name     # source                # transformation
      ("index",          "a.idx",                spq.to_int),  # removes leading zeros and underline characters
      ("is_enabled",     "a.sts",                spq.to_bool),  # recognizes additional words like "on", "off", "disabled", "enabled", ...
      ("a_updated_at",   "a.ts",                 spq.to_timestamp),  # supports unix timestamps in ms or seconds and strings
      ("items",          "b.itms",               spq.str_to_array(cast="int")),  # splits a comma delimited string into an array and casts its elements
      ("block_status",   "b.sts",                spq.map_values(mapping={"whitelisted": "allowed", "blacklisted": "blocked"})),  # applies lookup dictionary
      ("b_updated_at",   "b.ts",                 spq.to_timestamp),  # supports unix timestamps in ms or seconds and strings
      ("has_email",      "c.email",              spq.has_value),  # interprets also empty strings as no value, although, zeros are values
      ("gender",         "c.gndr",               spq.apply(func=F.lower)),  # applies provided function to all values
      ("creation_date",  "c.dt",                 spq.to_timestamp(cast="date")),  # explicitly casts result after transformation
      ("processed_at",   F.current_timestamp(),  spq.as_is),  # source column is a function, no transformation to the results
      ("comment",        "c.cmt",                "string"),  # no transformation, only cast; alternatively: spq.to_str or spq.as_is(cast="string")
   ]
   output_df = Mapper(mapping).transform(input_df)

   output_df.show(truncate=False)
   +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+
   |index |is_enabled|a_updated_at           |items    |block_status|b_updated_at       |has_email|gender|creation_date|processed_at          |comment|
   +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+
   |123456|true      |2020-08-10 16:24:06    |[1, 2, 4]|allowed     |2020-08-12 14:43:14|true     |f     |2020-08-05   |2022-08-12 09:17:09.83|fine   |
   |654321|false     |2020-08-10 16:25:00.784|[5]      |blocked     |2020-07-01 14:43:14|false    |m     |2020-06-27   |2022-08-12 09:17:09.83|faulty |
   +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+


   output_df.printSchema()
   root
    |-- index: integer (nullable = true)
    |-- is_enabled: boolean (nullable = true)
    |-- a_updated_at: timestamp (nullable = true)
    |-- items: array (nullable = true)
    |    |-- element: integer (containsNull = true)
    |-- block_status: string (nullable = true)
    |-- b_updated_at: timestamp (nullable = true)
    |-- has_email: boolean (nullable = false)
    |-- gender: string (nullable = true)
    |-- creation_date: date (nullable = true)
    |-- processed_at: timestamp (nullable = false)
    |-- comment: string (nullable = true)

Features / Components
=====================

`Transformers <https://spooq.readthedocs.io/en/latest/transformer/overview.html>`_
----------------------------------------------------------------------------------

* `Custom Mapping Transformations <https://spooq.readthedocs.io/en/latest/transformer/mapper.html#module-spooq.transformer.mapper_transformations>`_
* `Exploder <https://spooq.readthedocs.io/en/latest/transformer/exploder.html>`_
* `Filter <https://spooq.readthedocs.io/en/latest/transformer/sieve.html>`_
* `Mapper (Restructuring of complex DataFrames) <https://spooq.readthedocs.io/en/latest/transformer/mapper.html>`_
* `Threshold-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/threshold_cleaner.html>`_
* `Enumeration-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/enum_cleaner.html>`_
* `Newest by Group (Most current record per ID) <https://spooq.readthedocs.io/en/latest/transformer/newest_by_group.html>`_


`Extractors <https://spooq.readthedocs.io/en/latest/extractor/overview.html>`_
----------------------------------------------------------------------------------
  
* `JSON Files <https://spooq.readthedocs.io/en/latest/extractor/json.html>`_
* `JDBC Source <https://spooq.readthedocs.io/en/latest/extractor/jdbc.html>`_

`Loaders <https://spooq.readthedocs.io/en/latest/loader/overview.html>`_
----------------------------------------------------------------------------------
  
* `Hive Database <https://spooq.readthedocs.io/en/latest/loader/hive_loader.html>`_

Installation
============

.. code-block:: bash

    pip install spooq


Online Documentation
=====================

For more details, please consult the online documentation at |Onlinedocs|

Changelog
============

|Changelog|

Contribution
============

Please see |Contribute| for more information.

License
=========

This library is licensed under the |License|

-------------------------------------------------------------------------------------------------------------

.. |RTD| image:: https://readthedocs.org/projects/spooq/badge/?version=latest
   :target: https://spooq.readthedocs.io/en/latest/?badge=latest
   :alt: Documentation Status

.. |Onlinedocs| image:: https://about.readthedocs.com/theme/img/logo-wordmark-dark.svg
   :target: https://spooq.readthedocs.io/
   :alt: Online Documentation
   :width: 120px

.. |License| image:: https://img.shields.io/badge/license-MIT-blue.svg
   :target: https://github.com/Breaka84/Spooq/blob/master/LICENSE
   :alt: Project License

.. |Changelog| image:: https://img.shields.io/badge/CHANGELOG-8A2BE2
   :target: https://spooq.readthedocs.io/en/latest/changelog.html
   :alt: Changelog

.. |Contribute| image:: https://img.shields.io/badge/CONTRIBUTING-8A2BE2
   :target: https://spooq.readthedocs.io/en/latest/contribute.html
   :alt: Contribute

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/Breaka84/Spooq",
    "name": "Spooq",
    "maintainer": null,
    "docs_url": null,
    "requires_python": null,
    "maintainer_email": null,
    "keywords": "spooq, spark, hive, cloudera, hadoop, etl, data ingestion, data wrangling, databricks, big data, batch, streaming, data engineering",
    "author": "David Hohensinn",
    "author_email": "breaka@gmx.at",
    "download_url": "https://files.pythonhosted.org/packages/6b/16/06e69dbc942e0c20bed8a0e989a3a28ebd9e966b41af79e9884f3bd70892/Spooq-3.4.0.tar.gz",
    "platform": null,
    "description": "|RTD| |License|\n\nWelcome to Spooq!\n=================\n\nSpooq is your **PySpark** based helper library for ETL data ingestion pipeline in Data Lakes.\n\nThe main components are:\n  * Extractors\n  * Transformers\n  * Loaders\n\nThose components are independent and can be used separately or be plugged-in into a pipeline instance.\nYou can also use the custom functions from the Mapper transformer directly with PySpark (f.e. ``select`` or ``withColumn``).\n\nExample of Mapper Transformer\n=============================\n\n.. code-block:: python\n\n   from pyspark.sql import Row\n   from pyspark.sql import functions as F, types as T\n   from spooq.transformer import Mapper\n   from spooq.transformer import mapper_transformations as spq\n\n   input_df = spark.createDataFrame(\n      [\n         Row(\n            struct_a=Row(idx=\"000_123_456\", sts=\"enabled\", ts=\"1597069446000\"),\n            struct_b=Row(itms=\"1,2,4\", sts=\"whitelisted\", ts=\"2020-08-12T12:43:14+0000\"),\n            struct_c=Row(email=\"abc@def.com\", gndr=\"F\", dt=\"2020-08-05\", cmt=\"fine\"),\n         ),\n         Row(\n            struct_a=Row(idx=\"000_654_321\", sts=\"off\", ts=\"1597069500784\"),\n            struct_b=Row(itms=\"5\", sts=\"blacklisted\", ts=\"2020-07-01T12:43:14+0000\"),\n            struct_c=Row(email=\"\", gndr=\"m\", dt=\"2020-06-27\", cmt=\"faulty\"),\n         ),\n      ],\n      schema=\"\"\"\n         a: struct<idx string, sts string, ts string>,\n         b: struct<itms string, sts string, ts string>,\n         c: struct<email string, gndr string, dt string, cmt string>\n      \"\"\"\n   )\n   input_df.printSchema()\n   root\n    |-- a: struct (nullable = true)\n    |    |-- idx: string (nullable = true)\n    |    |-- sts: string (nullable = true)\n    |    |-- ts: string (nullable = true)\n    |-- b: struct (nullable = true)\n    |    |-- itms: string (nullable = true)\n    |    |-- sts: string (nullable = true)\n    |    |-- ts: string 
(nullable = true)\n    |-- c: struct (nullable = true)\n    |    |-- email: string (nullable = true)\n    |    |-- gndr: string (nullable = true)\n    |    |-- dt: string (nullable = true)\n    |    |-- cmt: string (nullable = true)\n\n\n   mapping = [\n       # output_name     # source                # transformation\n      (\"index\",          \"a.idx\",                spq.to_int),  # removes leading zeros and underline characters\n      (\"is_enabled\",     \"a.sts\",                spq.to_bool),  # recognizes additional words like \"on\", \"off\", \"disabled\", \"enabled\", ...\n      (\"a_updated_at\",   \"a.ts\",                 spq.to_timestamp),  # supports unix timestamps in ms or seconds and strings\n      (\"items\",          \"b.itms\",               spq.str_to_array(cast=\"int\")),  # splits a comma delimited string into an array and casts its elements\n      (\"block_status\",   \"b.sts\",                spq.map_values(mapping={\"whitelisted\": \"allowed\", \"blacklisted\": \"blocked\"})),  # applies lookup dictionary\n      (\"b_updated_at\",   \"b.ts\",                 spq.to_timestamp),  # supports unix timestamps in ms or seconds and strings\n      (\"has_email\",      \"c.email\",              spq.has_value),  # interprets also empty strings as no value, although, zeros are values\n      (\"gender\",         \"c.gndr\",               spq.apply(func=F.lower)),  # applies provided function to all values\n      (\"creation_date\",  \"c.dt\",                 spq.to_timestamp(cast=\"date\")),  # explicitly casts result after transformation\n      (\"processed_at\",   F.current_timestamp(),  spq.as_is),  # source column is a function, no transformation to the results\n      (\"comment\",        \"c.cmt\",                \"string\"),  # no transformation, only cast; alternatively: spq.to_str or spq.as_is(cast=\"string\")\n   ]\n   output_df = Mapper(mapping).transform(input_df)\n\n   output_df.show(truncate=False)\n   
+------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+\n   |index |is_enabled|a_updated_at           |items    |block_status|b_updated_at       |has_email|gender|creation_date|processed_at          |comment|\n   +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+\n   |123456|true      |2020-08-10 16:24:06    |[1, 2, 4]|allowed     |2020-08-12 14:43:14|true     |f     |2020-08-05   |2022-08-12 09:17:09.83|fine   |\n   |654321|false     |2020-08-10 16:25:00.784|[5]      |blocked     |2020-07-01 14:43:14|false    |m     |2020-06-27   |2022-08-12 09:17:09.83|faulty |\n   +------+----------+-----------------------+---------+------------+-------------------+---------+------+-------------+----------------------+-------+\n\n\n   output_df.printSchema()\n   root\n    |-- index: integer (nullable = true)\n    |-- is_enabled: boolean (nullable = true)\n    |-- a_updated_at: timestamp (nullable = true)\n    |-- items: array (nullable = true)\n    |    |-- element: integer (containsNull = true)\n    |-- block_status: string (nullable = true)\n    |-- b_updated_at: timestamp (nullable = true)\n    |-- has_email: boolean (nullable = false)\n    |-- gender: string (nullable = true)\n    |-- creation_date: date (nullable = true)\n    |-- processed_at: timestamp (nullable = false)\n    |-- comment: string (nullable = true)\n\nFeatures / Components\n=====================\n\n`Transformers <https://spooq.readthedocs.io/en/latest/transformer/overview.html>`_\n----------------------------------------------------------------------------------\n\n* `Custom Mapping Transformations <https://spooq.readthedocs.io/en/latest/transformer/mapper.html#module-spooq.transformer.mapper_transformations>`_\n* `Exploder <https://spooq.readthedocs.io/en/latest/transformer/exploder.html>`_\n* `Filter 
<https://spooq.readthedocs.io/en/latest/transformer/sieve.html>`_\n* `Mapper (Restructuring of complex DataFrames) <https://spooq.readthedocs.io/en/latest/transformer/mapper.html>`_\n* `Threshold-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/threshold_cleaner.html>`_\n* `Enumeration-based Cleanser <https://spooq.readthedocs.io/en/latest/transformer/enum_cleaner.html>`_\n* `Newest by Group (Most current record per ID) <https://spooq.readthedocs.io/en/latest/transformer/newest_by_group.html>`_\n\n\n`Extractors <https://spooq.readthedocs.io/en/latest/extractor/overview.html>`_\n----------------------------------------------------------------------------------\n  \n* `JSON Files <https://spooq.readthedocs.io/en/latest/extractor/json.html>`_\n* `JDBC Source <https://spooq.readthedocs.io/en/latest/extractor/jdbc.html>`_\n\n`Loaders <https://spooq.readthedocs.io/en/latest/loader/overview.html>`_\n----------------------------------------------------------------------------------\n  \n* `Hive Database <https://spooq.readthedocs.io/en/latest/loader/hive_loader.html>`_\n\nInstallation\n============\n\n.. code-block:: python\n\n    pip install spooq\n\n\nOnline Documentation\n=====================\n\nFor a more details please consult the online documentation at |Onlinedocs|\n\nChangelog\n============\n\n|Changelog|\n\nContribution\n============\n\nPlease see |Contribute| for more information.\n\nLicense\n=========\n\nThis library is licensed under the |License|\n\n-------------------------------------------------------------------------------------------------------------\n\n.. |RTD| image:: https://readthedocs.org/projects/spooq/badge/?version=latest\n   :target: https://spooq.readthedocs.io/en/latest/?badge=latest\n   :alt: Documentation Status\n\n.. |Onlinedocs| image:: https://about.readthedocs.com/theme/img/logo-wordmark-dark.svg\n   :target: https://spooq.readthedocs.io/\n   :alt: Online Documentation\n   :width: 120px\n\n.. 
|License| image:: https://img.shields.io/badge/license-MIT-blue.svg\n   :target: https://github.com/Breaka84/Spooq/blob/master/LICENSE\n   :alt: Project License\n\n.. |Changelog| image:: https://img.shields.io/badge/CHANGELOG-8A2BE2\n   :target: https://spooq.readthedocs.io/en/latest/changelog.html\n   :alt: Changelog\n\n.. |Contribute| image:: https://img.shields.io/badge/CONTRIBUTING-8A2BE2\n   :target: https://spooq.readthedocs.io/en/latest/contribute.html\n   :alt: Contribute\n",
    "bugtrack_url": null,
    "license": null,
    "summary": "Spooq is a PySpark based helper library for ETL data ingestion pipeline in Data Lakes.",
    "version": "3.4.0",
    "project_urls": {
        "Documentation": "https://spooq.readthedocs.io",
        "Homepage": "https://github.com/Breaka84/Spooq"
    },
    "split_keywords": [
        "spooq",
        " spark",
        " hive",
        " cloudera",
        " hadoop",
        " etl",
        " data ingestion",
        " data wrangling",
        " databricks",
        " big data",
        " batch",
        " streaming",
        " data engineering"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "ca9057e9183b7f392504c0a395a33977f59ca3782deb7dbdc606ede15fe433db",
                "md5": "36e87abf3820a9af80235c8014f8b4b6",
                "sha256": "9265fd965bae23868fdd2f5ad97b1fa22e236ad337e4fe618d9443a28c624732"
            },
            "downloads": -1,
            "filename": "Spooq-3.4.0-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "36e87abf3820a9af80235c8014f8b4b6",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 49872,
            "upload_time": "2024-03-20T09:36:39",
            "upload_time_iso_8601": "2024-03-20T09:36:39.431558Z",
            "url": "https://files.pythonhosted.org/packages/ca/90/57e9183b7f392504c0a395a33977f59ca3782deb7dbdc606ede15fe433db/Spooq-3.4.0-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "6b1606e69dbc942e0c20bed8a0e989a3a28ebd9e966b41af79e9884f3bd70892",
                "md5": "85b850205ae4f5bef292c9ad7dfb4fe4",
                "sha256": "18a15d500afa52675396bca56f1afb1de785b895083870f8b05f37be83ce383b"
            },
            "downloads": -1,
            "filename": "Spooq-3.4.0.tar.gz",
            "has_sig": false,
            "md5_digest": "85b850205ae4f5bef292c9ad7dfb4fe4",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 39547,
            "upload_time": "2024-03-20T09:36:41",
            "upload_time_iso_8601": "2024-03-20T09:36:41.440145Z",
            "url": "https://files.pythonhosted.org/packages/6b/16/06e69dbc942e0c20bed8a0e989a3a28ebd9e966b41af79e9884f3bd70892/Spooq-3.4.0.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-03-20 09:36:41",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "Breaka84",
    "github_project": "Spooq",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": true,
    "requirements": [
        {
            "name": "pandas",
            "specs": []
        },
        {
            "name": "future",
            "specs": []
        }
    ],
    "lcname": "spooq"
}
        
Elapsed time: 0.21619s