datadigger


Name: datadigger
Version: 0.11 (PyPI version JSON)
Download
Home page: None
Summary: The package is geared towards automating text-related tasks and is useful for data extraction, web scraping, and text file management.
Upload time: 2025-08-27 13:51:41
Maintainer: None
Docs URL: None
Author: Ramesh Chandra
Requires Python: >=3.6
License: None
Keywords: (none)
VCS: (none)
Bugtrack URL: (none)
Requirements: No requirements were recorded.
Travis-CI: No Travis.
Coveralls test coverage: No coveralls.
            # datadigger

`datadigger` is a Python package designed to simplify text processing tasks, such as extracting, manipulating, and saving text data from various sources. It includes utility functions for working with text, handling files (e.g., reading/writing CSV), interacting with HTML elements via BeautifulSoup, and performing operations like string standardization, element extraction with CSS selectors, and more. 

---

## ✨ Key Features

- πŸ“ **String manipulation**: Clean, standardize, and sanitize text.  
- πŸ“‚ **File handling**: Read/write CSV, TXT, and other files with optional headers and delimiters.  
- 🌐 **HTML parsing**: Extract text or attributes using **CSS selectors** or **XPath**.  
- πŸ“¦ **JSON utilities**: Access deeply nested values and normalize data.  
- 🧹 **Flexible error handling**: Graceful behavior with missing/invalid inputs.  

---

## πŸ“¦ Installation

Install via **pip**:

```bash
pip install datadigger
```

# ================================================================
1. Create a Directory

from datadigger import create_directory
# Creates a directory if it doesn't already exist.


# Example 1: Creating a new directory
create_directory("new_folder")

# Example 2: Creating nested directories
create_directory("parent_folder/sub_folder")

# ================================================================
2. Standardize a String

from datadigger import standardized_string
# This function standardizes the input string by removing escape sequences like \n, \t, and \r, removing HTML tags, collapsing multiple spaces, and trimming leading/trailing spaces.


# Example 1: Standardize a string with newlines, tabs, and HTML tags
input_string_1 = "<html><body>  Hello \nWorld!  \tThis is a test.  </body></html>"
print("Standardized String 1:", standardized_string(input_string_1))

# Example 2: Input string with multiple spaces and line breaks
input_string_2 = "  This   is   a  \n\n   string   with  spaces and \t tabs.  "
print("Standardized String 2:", standardized_string(input_string_2))

# Example 3: Pass an empty string
input_string_3 = ""
print("Standardized String 3:", standardized_string(input_string_3))

# Example 4: Pass None (invalid input)
input_string_4 = None
print("Standardized String 4:", standardized_string(input_string_4))

# ================================================================
3. Remove Common Elements

from datadigger import remove_common_elements

# Example 1: Lists
print(remove_common_elements([1, 2, 3, 4, 5], [3, 4, 6]))
# Output: [1, 2, 5]

# Example 2: Set + Tuple
print(remove_common_elements({1, 2, 3, 4, 5}, (3, 4, 6)))
# Output: {1, 2, 5}

# Example 3: Missing arguments
print(remove_common_elements([1, 2], None))
# Output: "Value not passed for: remove_by"

print(remove_common_elements(None, None))
# Output: "Value not passed for: remove_in, remove_by"


# ================================================================
4. Save to CSV

from datadigger import save_to_csv

list_data = [[1, 'Alice', 23], [2, 'Bob', 30], [3, 'Charlie', 25]]
column_header_list = ['ID', 'Name', 'Age']
output_file_path = 'output_data.csv'

# Default separator (comma)
save_to_csv(list_data, column_header_list, output_file_path)

# Tab separator
save_to_csv(list_data, column_header_list, output_file_path, sep="\t")

# Semicolon separator
save_to_csv(list_data, column_header_list, output_file_path, sep=";")

Output (default, sep=","):
ID,Name,Age
1,Alice,23
2,Bob,30
3,Charlie,25

Output (sep="\t"):
ID  Name    Age
1   Alice   23
2   Bob 30
3   Charlie 25



# ================================================================
5. Read CSV

from datadigger import read_csv

csv_file_path = 'data.csv'
get_value_by_col_name = 'URL'
filter_col_name = 'Category'
include_filter_col_values = ['Tech']

result = read_csv(csv_file_path, get_value_by_col_name, filter_col_name, include_filter_col_values)
print(result)

Sample CSV

Category,URL
Tech,https://tech1.com
Tech,https://tech2.com
Science,https://science1.com

Result

['https://tech1.com', 'https://tech2.com']

# ================================================================
6. Extract JSON Content

from datadigger import get_json_content

json_data = {"user": {"name": "John", "age": 30}}
keys = ["user", "name"]

print(get_json_content(json_data, keys))
# Output: "John"

# ================================================================
7. Extract with CSS Selectors

from bs4 import BeautifulSoup
from datadigger import get_selector_content

html_content = """
<html>
  <body>
    <div class="example">Example Text</div>
    <a href="https://example.com">Link</a>
  </body>
</html>
"""
soup_obj = BeautifulSoup(html_content, "html.parser")

print(get_selector_content(soup_obj=soup_obj, css_selector_ele=".example"))
# [<div class="example">Example Text</div>]

print(get_selector_content(soup_obj=soup_obj, css_selector=".example"))
# "Example Text"

print(get_selector_content(soup_obj=soup_obj, css_selector="a", attr="href"))
# "https://example.com"

print(get_selector_content(soup_obj))
# "Example Text Link"


# ================================================================
8. Extract with XPath

from datadigger import get_xpath_content
from lxml import etree

html_content = """
<html>
    <body>
        <div>
            <h1>Welcome to My Website</h1>
            <p class="description">This is a paragraph.</p>
            <a href="http://example.com" id="example-link">Click here</a>
        </div>
    </body>
</html>
"""

tree = etree.HTML(html_content)

print(get_xpath_content(tree, xpath="//h1"))
# "Welcome to My Website"

print(get_xpath_content(tree, xpath="//a[@id='example-link']", attr="href"))
# "http://example.com"

print(get_xpath_content(tree, xpath="//a", attr="id"))
# "example-link"


# ================================================================
9. Save & Read Files

from datadigger import save_file, read_file

# Save file
save_file("output", "This is a new file.", "example.txt")  
save_file("output", "Appending content.", "example.txt", mode="a")  
save_file("output", "Special characters: Àâüß", "example_latin1.txt", encoding="latin-1")  

# Read file
content = read_file("output/example.txt")
print(content)




            

Raw data

            {
    "_id": null,
    "home_page": null,
    "name": "datadigger",
    "maintainer": null,
    "docs_url": null,
    "requires_python": ">=3.6",
    "maintainer_email": null,
    "keywords": null,
    "author": "Ramesh Chandra",
    "author_email": "rameshsofter@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/f8/ca/a3d880d3bcea12f79628bacd9195ecf5cc2ffbb510ce9c748e53de9ec4e3/datadigger-0.11.tar.gz",
    "platform": null,
    "description": "# datadigger\r\n\r\n`datadigger` is a Python package designed to simplify text processing tasks, such as extracting, manipulating, and saving text data from various sources. It includes utility functions for working with text, handling files (e.g., reading/writing CSV), interacting with HTML elements via BeautifulSoup, and performing operations like string standardization, element extraction with CSS selectors, and more. \r\n\r\n---\r\n\r\n## \u2728 Key Features\r\n\r\n- \ud83d\udcdd **String manipulation**: Clean, standardize, and sanitize text.  \r\n- \ud83d\udcc2 **File handling**: Read/write CSV, TXT, and other files with optional headers and delimiters.  \r\n- \ud83c\udf10 **HTML parsing**: Extract text or attributes using **CSS selectors** or **XPath**.  \r\n- \ud83d\udce6 **JSON utilities**: Access deeply nested values and normalize data.  \r\n- \ud83e\uddf9 **Flexible error handling**: Graceful behavior with missing/invalid inputs.  \r\n\r\n---\r\n\r\n## \ud83d\udce6 Installation\r\n\r\nInstall via **pip**:\r\n\r\n```bash\r\npip install datadigger\r\n\r\n\r\n# ================================================================\r\n1. Create a Directory\r\n\r\nfrom datadigger import create_directory\r\n# Creates a directory if it doesn't already exist.\r\n\r\n\r\n# Example 1: Creating a new directory\r\ncreate_directory(\"new_folder\")\r\n\r\n# Example 2: Creating nested directories\r\ncreate_directory(\"parent_folder/sub_folder\")\r\n\r\n# ================================================================\r\n2. Standardize a String\r\n\r\nfrom datadigger import standardized_string\r\n# This function standardizes the input string by removing escape sequences like \\n, \\t, and \\r, removing HTML tags, collapsing multiple spaces, and trimming leading/trailing spaces.\r\n\r\n\r\n# Example 1: Standardize a string with newlines, tabs, and HTML tags\r\ninput_string_1 = \"<html><body>  Hello \\nWorld!  \\tThis is a test.  
</body></html>\"\r\nprint(\"Standardized String 1:\", standardized_string(input_string_1))\r\n\r\n# Example 2: Input string with multiple spaces and line breaks\r\ninput_string_2 = \"  This   is   a  \\n\\n   string   with  spaces and \\t tabs.  \"\r\nprint(\"Standardized String 2:\", standardized_string(input_string_2))\r\n\r\n# Example 3: Pass an empty string\r\ninput_string_3 = \"\"\r\nprint(\"Standardized String 3:\", standardized_string(input_string_3))\r\n\r\n# Example 4: Pass None (invalid input)\r\ninput_string_4 = None\r\nprint(\"Standardized String 4:\", standardized_string(input_string_4))\r\n\r\n================================================================\r\n3. Remove Common Elements\r\n\r\nfrom datadigger import remove_common_elements\r\n\r\n# Example 1: Lists\r\nprint(remove_common_elements([1, 2, 3, 4, 5], [3, 4, 6]))\r\n# Output: [1, 2, 5]\r\n\r\n# Example 2: Set + Tuple\r\nprint(remove_common_elements({1, 2, 3, 4, 5}, (3, 4, 6)))\r\n# Output: {1, 2, 5}\r\n\r\n# Example 3: Missing arguments\r\nprint(remove_common_elements([1, 2], None))\r\n# Output: \"Value not passed for: remove_by\"\r\n\r\nprint(remove_common_elements(None, None))\r\n# Output: \"Value not passed for: remove_in, remove_by\"\r\n\r\n\r\n================================================================\r\n4. 
Save to CSV\r\n\r\nfrom datadigger import save_to_csv\r\n\r\nlist_data = [[1, 'Alice', 23], [2, 'Bob', 30], [3, 'Charlie', 25]]\r\ncolumn_header_list = ['ID', 'Name', 'Age']\r\noutput_file_path = 'output_data.csv'\r\n\r\n# Default separator (comma)\r\nsave_to_csv(list_data, column_header_list, output_file_path)\r\n\r\n# Tab separator\r\nsave_to_csv(list_data, column_header_list, output_file_path, sep=\"\\t\")\r\n\r\n# Semicolon separator\r\nsave_to_csv(list_data, column_header_list, output_file_path, sep=\";\")\r\n\r\nOutput (default, sep=\",\"):\r\nID,Name,Age\r\n1,Alice,23\r\n2,Bob,30\r\n3,Charlie,25\r\n\r\nOutput (sep=\"\\t\"):\r\nID  Name    Age\r\n1   Alice   23\r\n2   Bob 30\r\n3   Charlie 25\r\n\r\n\r\n\r\n================================================================\r\n5. Read CSV\r\n\r\nfrom datadigger import read_csv\r\n\r\ncsv_file_path = 'data.csv'\r\nget_value_by_col_name = 'URL'\r\nfilter_col_name = 'Category'\r\ninclude_filter_col_values = ['Tech']\r\n\r\nresult = read_csv(csv_file_path, get_value_by_col_name, filter_col_name, include_filter_col_values)\r\nprint(result)\r\n\r\nSample CSV\r\n\r\nCategory,URL\r\nTech,https://tech1.com\r\nTech,https://tech2.com\r\nScience,https://science1.com\r\n\r\nResult\r\n\r\n['https://tech1.com', 'https://tech2.com']\r\n\r\n================================================================\r\n6. Extract JSON Content\r\n\r\nfrom datadigger import get_json_content\r\n\r\njson_data = {\"user\": {\"name\": \"John\", \"age\": 30}}\r\nkeys = [\"user\", \"name\"]\r\n\r\nprint(get_json_content(json_data, keys))\r\n# Output: \"John\"\r\n\r\n================================================================\r\n7. 
Extract with CSS Selectors\r\n\r\nfrom bs4 import BeautifulSoup\r\nfrom datadigger import get_selector_content\r\n\r\nhtml_content = \"\"\"\r\n<html>\r\n  <body>\r\n    <div class=\"example\">Example Text</div>\r\n    <a href=\"https://example.com\">Link</a>\r\n  </body>\r\n</html>\r\n\"\"\"\r\nsoup_obj = BeautifulSoup(html_content, \"html.parser\")\r\n\r\nprint(get_selector_content(soup_obj=soup_obj, css_selector_ele=\".example\"))\r\n# [<div class=\"example\">Example Text</div>]\r\n\r\nprint(get_selector_content(soup_obj=soup_obj, css_selector=\".example\"))\r\n# \"Example Text\"\r\n\r\nprint(get_selector_content(soup_obj=soup_obj, css_selector=\"a\", attr=\"href\"))\r\n# \"https://example.com\"\r\n\r\nprint(get_selector_content(soup_obj))\r\n# \"Example Text Link\"\r\n\r\n\r\n================================================================\r\n8. Extract with XPath\r\n\r\nfrom datadigger import get_xpath_content\r\nfrom lxml import etree\r\n\r\nhtml_content = \"\"\"\r\n<html>\r\n    <body>\r\n        <div>\r\n            <h1>Welcome to My Website</h1>\r\n            <p class=\"description\">This is a paragraph.</p>\r\n            <a href=\"http://example.com\" id=\"example-link\">Click here</a>\r\n        </div>\r\n    </body>\r\n</html>\r\n\"\"\"\r\n\r\ntree = etree.HTML(html_content)\r\n\r\nprint(get_xpath_content(tree, xpath=\"//h1\"))\r\n# \"Welcome to My Website\"\r\n\r\nprint(get_xpath_content(tree, xpath=\"//a[@id='example-link']\", attr=\"href\"))\r\n# \"http://example.com\"\r\n\r\nprint(get_xpath_content(tree, xpath=\"//a\", attr=\"id\"))\r\n# \"example-link\"\r\n\r\n\r\n================================================================\r\n9. 
Save & Read Files\r\n\r\nfrom datadigger import save_file, read_file\r\n\r\n# Save file\r\nsave_file(\"output\", \"This is a new file.\", \"example.txt\")  \r\nsave_file(\"output\", \"Appending content.\", \"example.txt\", mode=\"a\")  \r\nsave_file(\"output\", \"Special characters: \u00e4\u00f6\u00fc\u00df\", \"example_latin1.txt\", encoding=\"latin-1\")  \r\n\r\n# Read file\r\ncontent = read_file(\"output/example.txt\")\r\nprint(content)\r\n\r\n\r\n\r\n",
    "bugtrack_url": null,
    "license": null,
    "summary": "The package is geared towards automating text-related tasks and is useful for data extraction, web scraping, and text file management.",
    "version": "0.11",
    "project_urls": null,
    "split_keywords": [],
    "urls": [
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "6842240ec899e3c8f5aa305dfcb9f08178866c227fde02ad6631e4ca78da0969",
                "md5": "f7bff424060439c04bcd3477b84ad2d2",
                "sha256": "718ab5b22f4bc3530d970bd6c0ac6516c30615fcf78f4d26ebf3caf2009004aa"
            },
            "downloads": -1,
            "filename": "datadigger-0.11-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "f7bff424060439c04bcd3477b84ad2d2",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.6",
            "size": 11741,
            "upload_time": "2025-08-27T13:51:40",
            "upload_time_iso_8601": "2025-08-27T13:51:40.371188Z",
            "url": "https://files.pythonhosted.org/packages/68/42/240ec899e3c8f5aa305dfcb9f08178866c227fde02ad6631e4ca78da0969/datadigger-0.11-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "f8caa3d880d3bcea12f79628bacd9195ecf5cc2ffbb510ce9c748e53de9ec4e3",
                "md5": "de263d12be258b1163c2a920dc3e6e89",
                "sha256": "47dab4fbd7d6031973d33595b0eecdd9fe78f3e5c1034bf4aad1703ece7199c8"
            },
            "downloads": -1,
            "filename": "datadigger-0.11.tar.gz",
            "has_sig": false,
            "md5_digest": "de263d12be258b1163c2a920dc3e6e89",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.6",
            "size": 13555,
            "upload_time": "2025-08-27T13:51:41",
            "upload_time_iso_8601": "2025-08-27T13:51:41.378764Z",
            "url": "https://files.pythonhosted.org/packages/f8/ca/a3d880d3bcea12f79628bacd9195ecf5cc2ffbb510ce9c748e53de9ec4e3/datadigger-0.11.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2025-08-27 13:51:41",
    "github": false,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "lcname": "datadigger"
}
        
Elapsed time: 0.79992s