# Fast hash in 2D Arrays (Numpy/Pandas/lists/tuples)
## pip install arrayhascher
### Tested against Windows / Python 3.11 / Anaconda
## Cython (and a C/C++ compiler) must be installed
```python
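Computes a hash value for each column in a DataFrame/NumPy Array/list/tuple.
With `whole_result=False`, the result holds one hash value per row, which is what the example below relies on when it assigns the result as a new DataFrame column.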
Parameters:
- df (numpy.ndarray, pandas.Series, pandas.DataFrame, list, tuple): 2D (!) Input data to compute hash values for.
- fail_convert_to_string (bool, optional): If True, tries to convert non-string columns to strings after hashing has failed (the original data is not modified).
If False, raises an exception if conversion fails. Default is True.
- whole_result (bool, optional): If True, returns an array of hash values for each element in the DataFrame/NumPy Array/list/tuple.
If False, returns a condensed array of hash values for each column.
Default is False.
Returns:
- numpy.ndarray: If `whole_result` is False, returns a condensed array of hash values for each column.
If `whole_result` is True, returns an array of hash values for each element in the DataFrame/NumPy Array/list/tuple.
Example:
import pandas as pd
from arrayhascher import get_hash_column
def test_drop_duplicates(df,hashdata):
# Example of how to delete duplicates
return df.assign(__XXXX___DELETE____=hashdata).drop_duplicates(subset='__XXXX___DELETE____').drop(
columns='__XXXX___DELETE____')
# With pandas ----------------------------------------------------------------
df = pd.read_csv(
"https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
)
df = pd.concat([df for _ in range(10000)], ignore_index=True)
df = df.sample(len(df))
hashdata = get_hash_column(df, fail_convert_to_string=True, whole_result=False)
# Out[3]:
# array([-4123592378399267822, -20629003135630820, 1205215161148196795,
# ..., 4571993557129865534, -5454081294880889185,
# 2672790383060839465], dtype=int64)
# %timeit test_drop_duplicates(df,hashdata)
# %timeit df.drop_duplicates()
# 947 ms ± 18.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# 2.94 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# Numpy only ----------------------------------------------------------------
hashdata = get_hash_column(df.to_numpy(), fail_convert_to_string=True, whole_result=False)
print(hashdata)
# # array([-4123592378399267822, -20629003135630820, 1205215161148196795,
# # ..., 4571993557129865534, -5454081294880889185,
# # 2672790383060839465], dtype=int64)
# Also works with lists ------------------------------------------------------
get_hash_column(df[:100].to_numpy().tolist(), fail_convert_to_string=True, whole_result=False)
# array([-5436153420663104440, -1384246600780856199, 177114776690388363,
# 788413506175135724, 1442743010667139722, -6386366738900951630,
# -8610361015858259700, 3995349003546064044, 3627302932646306514,
# 3448626572271213155, -1555175565302024830, 3265835764424924148, ....
# And tuples ----------------------------------------------------------------
get_hash_column(tuple(map(tuple, df[:100].to_numpy().tolist())), fail_convert_to_string=True, whole_result=False)
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/arrayhascher",
"name": "arrayhascher",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "cython,arrays,hash,numpy",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/76/00/cba8c25067062fc273ad80979557317526416d53b8896b0304a3084a5c98/arrayhascher-0.11.tar.gz",
"platform": null,
"description": "\r\n# Fast hash in 2D Arrays (Numpy/Pandas/lists/tuples)\r\n\r\n## pip install arrayhascher\r\n\r\n### Tested against Windows / Python 3.11 / Anaconda\r\n\r\n\r\n## Cython (and a C/C++ compiler) must be installed\r\n\r\n\r\n\r\n```python\r\n Computes a hash value for each column in a DataFrame/NumPy Array/list/tuple.\r\n\r\n Parameters:\r\n - df (numpy.ndarray, pandas.Series, pandas.DataFrame, list, tuple): 2D (!) Input data to compute hash values for.\r\n - fail_convert_to_string (bool, optional): If True, tries to convert non-string columns to strings after failed hashing. - The original data won't change!\r\n If False, raises an exception if conversion fails. Default is True.\r\n - whole_result (bool, optional): If True, returns an array of hash values for each element in the DataFrame/NumPy Array/list/tuple.\r\n If False, returns a condensed array of hash values for each column.\r\n Default is False.\r\n\r\n Returns:\r\n - numpy.ndarray: If `whole_result` is False, returns a condensed array of hash values for each column.\r\n If `whole_result` is True, returns an array of hash values for each element in the DataFrame.\r\n\r\n Example:\r\n import pandas as pd\r\n\r\n from arrayhascher import get_hash_column\r\n\r\n def test_drop_duplicates(df,hashdata):\r\n # Example of how to delete duplicates\r\n\r\n return df.assign(__XXXX___DELETE____=hashdata).drop_duplicates(subset='__XXXX___DELETE____').drop(\r\n columns='__XXXX___DELETE____')\r\n\r\n # With pandas ----------------------------------------------------------------\r\n df = pd.read_csv(\r\n \"https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv\"\r\n )\r\n df = pd.concat([df for _ in range(10000)], ignore_index=True)\r\n df = df.sample(len(df))\r\n hashdata = get_hash_column(df, fail_convert_to_string=True, whole_result=False)\r\n # Out[3]:\r\n # array([-4123592378399267822, -20629003135630820, 1205215161148196795,\r\n # ..., 4571993557129865534, 
-5454081294880889185,\r\n # 2672790383060839465], dtype=int64)\r\n\r\n # %timeit test_drop_duplicates(df,hashdata)\r\n # %timeit df.drop_duplicates()\r\n # 947 ms \u00b1 18.1 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)\r\n # 2.94 s \u00b1 10.1 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)\r\n\r\n # Numpy only ----------------------------------------------------------------\r\n hashdata = get_hash_column(df.to_numpy(), fail_convert_to_string=True, whole_result=False)\r\n print(hashdata)\r\n # # array([-4123592378399267822, -20629003135630820, 1205215161148196795,\r\n # # ..., 4571993557129865534, -5454081294880889185,\r\n # # 2672790383060839465], dtype=int64)\r\n\r\n # Works also with lists ------------------------------------------------------\r\n get_hash_column(df[:100].to_numpy().tolist(), fail_convert_to_string=True, whole_result=False)\r\n # array([-5436153420663104440, -1384246600780856199, 177114776690388363,\r\n # 788413506175135724, 1442743010667139722, -6386366738900951630,\r\n # -8610361015858259700, 3995349003546064044, 3627302932646306514,\r\n # 3448626572271213155, -1555175565302024830, 3265835764424924148, ....\r\n # And tuples ----------------------------------------------------------------\r\n tuple(map(tuple, df[:100].to_numpy().tolist()))\r\n```\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Fast hash in 2D Arrays (Numpy/Pandas/lists/tuples)",
"version": "0.11",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/arrayhascher"
},
"split_keywords": [
"cython",
"arrays",
"hash",
"numpy"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "b5b7b815087f49ea87de8abccec4d8f61a637706c4e6ce85e802dd5d3d7d5428",
"md5": "6bb9ac1af988e073c1b44ea29c970087",
"sha256": "5a1aec3a4128f9523f1291ecf310da643a4308489cf69e97f02f1b59fefec9b0"
},
"downloads": -1,
"filename": "arrayhascher-0.11-py3-none-any.whl",
"has_sig": false,
"md5_digest": "6bb9ac1af988e073c1b44ea29c970087",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 87804,
"upload_time": "2023-12-10T04:24:59",
"upload_time_iso_8601": "2023-12-10T04:24:59.664577Z",
"url": "https://files.pythonhosted.org/packages/b5/b7/b815087f49ea87de8abccec4d8f61a637706c4e6ce85e802dd5d3d7d5428/arrayhascher-0.11-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "7600cba8c25067062fc273ad80979557317526416d53b8896b0304a3084a5c98",
"md5": "f2cfe947013ac3de3c149ff9009e2c5f",
"sha256": "342167a6d5cb42da05a568657c4109b0413839da1f6409752fa3ee1ac180b6bb"
},
"downloads": -1,
"filename": "arrayhascher-0.11.tar.gz",
"has_sig": false,
"md5_digest": "f2cfe947013ac3de3c149ff9009e2c5f",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 84545,
"upload_time": "2023-12-10T04:25:01",
"upload_time_iso_8601": "2023-12-10T04:25:01.993390Z",
"url": "https://files.pythonhosted.org/packages/76/00/cba8c25067062fc273ad80979557317526416d53b8896b0304a3084a5c98/arrayhascher-0.11.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-12-10 04:25:01",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "arrayhascher",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "Cython",
"specs": []
},
{
"name": "numpy",
"specs": []
},
{
"name": "pandas",
"specs": []
}
],
"lcname": "arrayhascher"
}