# Calculate overlapping values between two arrays and return the results as a DataFrame
## Tested against Windows 10 / Python 3.10 / Anaconda
## pip install stridesduplicatefinder
#### Problem: you have two lists of different sizes and want to find the overlapping values.
## Using pure Python - works, but is slow
#### all indices / same values
```python
a1=[1,2,3,4,5,6,7]
a2=[0,0,3,1,5,6,8,1,32,]
res1=[(index1, index2, value1, value2) for index1, value1 in enumerate(a1) for index2, value2 in enumerate(a2) if value1 == value2]
print(res1)
# [(0, 3, 1, 1), (0, 7, 1, 1), (2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]
```
#### same indices / same values
```python
res2=[(index1, index2, value1, value2) for index1, value1 in enumerate(a1) for index2, value2 in enumerate(a2) if value1 == value2 and index1==index2]
print(res2)
# [(2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]
```
## Using stridesduplicatefinder - numpy or numexpr
```python
from stridesduplicatefinder import get_overlapping
def test_numexpr():
start = perf_counter()
_ = get_overlapping(
fu="a==b", a=a1, b=a2, numpy_or_numexpr="numexpr", same_index_required=False
)
print(f"numexpr test: {perf_counter() - start}")
print(_)
def test_numpy():
start = perf_counter()
_ = get_overlapping(
fu=lambda a, b: a == b,
a=a1,
b=a2,
numpy_or_numexpr="numpy",
same_index_required=False,
)
print(f"numpy test: {perf_counter() - start}")
print(_)
def python_test():
start = perf_counter()
_ = [(i1, i2, a, b) for i2, a in enumerate(a1) for i1, b in enumerate(a2) if a == b]
print(f"python test: {perf_counter() - start}")
print(_[:10])
a1 = np.random.randint(1, 100, size=(19000,),dtype=np.int64)
a2 = np.random.randint(1, 100, size=(7777,),dtype=np.int64)
from time import perf_counter
python_test()
# python test: 13.229658300006122
test_numpy()
# numpy test: 0.5666937999994843
test_numexpr()
# numexpr test: 0.48387080000247806
```
```text
Calculate overlapping values between two arrays and return the results as a DataFrame.
Parameters:
- fu: function or string to be evaluated as a condition for overlap.
- a: First input array.
- b: Second input array.
- numpy_or_numexpr: 'numpy' or 'numexpr' indicating the evaluation method.
- same_index_required: If True, only return rows where index1 == index2.
Returns:
- A DataFrame with columns 'index1', 'value1', 'index2', 'value2' containing
information about overlapping values.
Example Usage:
- To find overlapping values between two NumPy arrays:
a1 = np.random.randint(1, 10, size=(100000,))
a2 = np.random.randint(1, 10, size=(100,))
df1 = get_overlapping(
fu="a==b", a=a1, b=a2, numpy_or_numexpr="numexpr", same_index_required=True
)
print(df1)
- To find overlapping values using a custom function:
a1 = np.random.randint(1, 10, size=(100000,))
a2 = np.random.randint(1, 10, size=(100,))
df2 = get_overlapping(
fu=lambda a, b: a == b,
a=a1,
b=a2,
numpy_or_numexpr="numpy",
same_index_required=False,
)
print(df2)
- To find overlapping values between two arrays of strings:
a1 = np.array(["aa", "b", "c", "d", "ee11", "f", "gg", "h", "i", "j"])
a1 = np.repeat(a1, 1000)
a2 = np.array(["aa", "b", "c", "ee11", "f", "gg"])
a2 = np.repeat(a2, 1000)
np.random.shuffle(a1)
np.random.shuffle(a2)
df3 = get_overlapping(
fu="a == b",
a=np.char.array(a1).encode("utf-8"),
b=np.char.array(a2).encode("utf-8"),
numpy_or_numexpr="numexpr",
same_index_required=True,
)
print(df3)
# index1 value1 index2 value2
# 0 5 1 5 1
# 1 20 8 20 8
# 2 33 5 33 5
# 3 34 1 34 1
# 4 41 5 41 5
# 5 43 2 43 2
# 6 51 7 51 7
# 7 52 1 52 1
# 8 55 7 55 7
# 9 57 1 57 1
# 10 70 2 70 2
# 11 74 8 74 8
# index1 value1 index2 value2
# 0 0 4 8 4
# 1 0 4 12 4
# 2 0 4 13 4
# 3 0 4 26 4
# 4 0 4 53 4
# ... ... ... ...
# 1112213 99999 9 47 9
# 1112214 99999 9 62 9
# 1112215 99999 9 72 9
# 1112216 99999 9 81 9
# 1112217 99999 9 96 9
# [1112218 rows x 4 columns]
# index1 value1 index2 value2
# 0 1 gg 4 gg
# 1 1 gg 5 gg
# 2 1 gg 10 gg
# 3 1 gg 13 gg
# 4 1 gg 17 gg
# ... ... ... ...
# 5999995 9999 c 5978 c
# 5999996 9999 c 5979 c
# 5999997 9999 c 5990 c
# 5999998 9999 c 5992 c
# 5999999 9999 c 5995 c
# [6000000 rows x 4 columns]
# index1 value1 index2 value2
# 0 31 b'aa' 31 b'aa'
# 1 40 b'b' 40 b'b'
# 2 46 b'aa' 46 b'aa'
# 3 47 b'gg' 47 b'gg'
# 4 65 b'b' 65 b'b'
# .. ... ... ... ...
# 626 5966 b'aa' 5966 b'aa'
# 627 5982 b'f' 5982 b'f'
# 628 5985 b'ee11' 5985 b'ee11'
# 629 5995 b'c' 5995 b'c'
# 630 5996 b'gg' 5996 b'gg'
# [631 rows x 4 columns]
The function computes the overlapping values based on the specified condition (function or string)
and returns a DataFrame with the results. If `same_index_required` is set to True, it filters
the results to include only rows where the indices match.
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/stridesduplicatefinder",
"name": "stridesduplicatefinder",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "numpy,numexpr,strides",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/f0/b1/e8e73fa23e2847aba474f934835167929532d145ea049640cd9e677ffef9/stridesduplicatefinder-0.10.tar.gz",
"platform": null,
"description": "\r\n# Calculate overlapping values between two arrays and return the results as a DataFrame\r\n\r\n## Tested against Windows 10 / Python 3.10 / Anaconda\r\n\r\n## pip install stridesduplicatefinder\r\n\r\n#### Problem: you have to lists of different sizes and want to find the overlapping values. \r\n\r\n## Using pure Python - working, but slow\r\n\r\n#### all indices / same values\r\n\r\n```python\r\na1=[1,2,3,4,5,6,7]\r\na2=[0,0,3,1,5,6,8,1,32,]\r\nres1=[(index1, index2, value1, value2) for index1, value1 in enumerate(a1) for index2, value2 in enumerate(a2) if value1 == value2]\r\nprint(res1)\r\n# [(0, 3, 1, 1), (0, 7, 1, 1), (2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]\r\n```\r\n\r\n#### same indices / same values\r\n\r\n```python\r\nres2=[(index1, index2, value1, value2) for index1, value1 in enumerate(a1) for index2, value2 in enumerate(a2) if value1 == value2 and index1==index2]\r\nprint(res2)\r\n# [(2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]\r\n```\r\n\r\n## Using stridesduplicatefinder - numpy or numexpr\r\n\r\n```python\r\nfrom stridesduplicatefinder import get_overlapping\r\n\r\ndef test_numexpr():\r\n start = perf_counter()\r\n\r\n _ = get_overlapping(\r\n fu=\"a==b\", a=a1, b=a2, numpy_or_numexpr=\"numexpr\", same_index_required=False\r\n )\r\n print(f\"numexpr test: {perf_counter() - start}\")\r\n print(_)\r\n\r\n\r\ndef test_numpy():\r\n start = perf_counter()\r\n _ = get_overlapping(\r\n fu=lambda a, b: a == b,\r\n a=a1,\r\n b=a2,\r\n numpy_or_numexpr=\"numpy\",\r\n same_index_required=False,\r\n )\r\n print(f\"numpy test: {perf_counter() - start}\")\r\n print(_)\r\n\r\n\r\ndef python_test():\r\n start = perf_counter()\r\n _ = [(i1, i2, a, b) for i2, a in enumerate(a1) for i1, b in enumerate(a2) if a == b]\r\n print(f\"python test: {perf_counter() - start}\")\r\n print(_[:10])\r\n\r\n\r\n\r\na1 = np.random.randint(1, 100, size=(19000,),dtype=np.int64)\r\na2 = np.random.randint(1, 100, size=(7777,),dtype=np.int64)\r\nfrom time import 
perf_counter\r\n\r\npython_test()\r\n# python test: 13.229658300006122\r\n\r\ntest_numpy()\r\n# numpy test: 0.5666937999994843\r\n\r\ntest_numexpr()\r\n# numexpr test: 0.48387080000247806\r\n\r\n\r\n```\r\n\r\n\r\n```python\r\n\r\nCalculate overlapping values between two arrays and return the results as a DataFrame.\r\n\r\nParameters:\r\n- fu: function or string to be evaluated as a condition for overlap.\r\n- a: First input array.\r\n- b: Second input array.\r\n- numpy_or_numexpr: 'numpy' or 'numexpr' indicating the evaluation method.\r\n- same_index_required: If True, only return rows where index1 == index2.\r\n\r\nReturns:\r\n- A DataFrame with columns 'index1', 'value1', 'index2', 'value2' containing\r\n information about overlapping values.\r\n\r\nExample Usage:\r\n- To find overlapping values between two NumPy arrays:\r\n \r\n a1 = np.random.randint(1, 10, size=(100000,))\r\n a2 = np.random.randint(1, 10, size=(100,))\r\n df1 = get_overlapping(\r\n\t fu=\"a==b\", a=a1, b=a2, numpy_or_numexpr=\"numexpr\", same_index_required=True\r\n )\r\n print(df1)\r\n \r\n\r\n- To find overlapping values using a custom function:\r\n \r\n a1 = np.random.randint(1, 10, size=(100000,))\r\n a2 = np.random.randint(1, 10, size=(100,))\r\n df2 = get_overlapping(\r\n\t fu=lambda a, b: a == b,\r\n\t a=a1,\r\n\t b=a2,\r\n\t numpy_or_numexpr=\"numpy\",\r\n\t same_index_required=False,\r\n )\r\n print(df2)\r\n \r\n\r\n- To find overlapping values between two arrays of strings:\r\n \r\n a1 = np.array([\"aa\", \"b\", \"c\", \"d\", \"ee11\", \"f\", \"gg\", \"h\", \"i\", \"j\"])\r\n a1 = np.repeat(a1, 1000)\r\n a2 = np.array([\"aa\", \"b\", \"c\", \"ee11\", \"f\", \"gg\"])\r\n a2 = np.repeat(a2, 1000)\r\n np.random.shuffle(a1)\r\n np.random.shuffle(a2)\r\n df3 = get_overlapping(\r\n\t fu=\"a == b\",\r\n\t a=np.char.array(a1).encode(\"utf-8\"),\r\n\t b=np.char.array(a2).encode(\"utf-8\"),\r\n\t numpy_or_numexpr=\"numexpr\",\r\n\t same_index_required=True,\r\n )\r\n print(df3)\r\n \r\n\t\t# 
index1 value1 index2 value2\r\n\t# 0 5 1 5 1\r\n\t# 1 20 8 20 8\r\n\t# 2 33 5 33 5\r\n\t# 3 34 1 34 1\r\n\t# 4 41 5 41 5\r\n\t# 5 43 2 43 2\r\n\t# 6 51 7 51 7\r\n\t# 7 52 1 52 1\r\n\t# 8 55 7 55 7\r\n\t# 9 57 1 57 1\r\n\t# 10 70 2 70 2\r\n\t# 11 74 8 74 8\r\n\r\n\r\n\t# index1 value1 index2 value2\r\n\t# 0 0 4 8 4\r\n\t# 1 0 4 12 4\r\n\t# 2 0 4 13 4\r\n\t# 3 0 4 26 4\r\n\t# 4 0 4 53 4\r\n\t# ... ... ... ...\r\n\t# 1112213 99999 9 47 9\r\n\t# 1112214 99999 9 62 9\r\n\t# 1112215 99999 9 72 9\r\n\t# 1112216 99999 9 81 9\r\n\t# 1112217 99999 9 96 9\r\n\t# [1112218 rows x 4 columns]\r\n\r\n\r\n\t# index1 value1 index2 value2\r\n\t# 0 1 gg 4 gg\r\n\t# 1 1 gg 5 gg\r\n\t# 2 1 gg 10 gg\r\n\t# 3 1 gg 13 gg\r\n\t# 4 1 gg 17 gg\r\n\t# ... ... ... ...\r\n\t# 5999995 9999 c 5978 c\r\n\t# 5999996 9999 c 5979 c\r\n\t# 5999997 9999 c 5990 c\r\n\t# 5999998 9999 c 5992 c\r\n\t# 5999999 9999 c 5995 c\r\n\t# [6000000 rows x 4 columns]\r\n\r\n\r\n\t# index1 value1 index2 value2\r\n\t# 0 31 b'aa' 31 b'aa'\r\n\t# 1 40 b'b' 40 b'b'\r\n\t# 2 46 b'aa' 46 b'aa'\r\n\t# 3 47 b'gg' 47 b'gg'\r\n\t# 4 65 b'b' 65 b'b'\r\n\t# .. ... ... ... ...\r\n\t# 626 5966 b'aa' 5966 b'aa'\r\n\t# 627 5982 b'f' 5982 b'f'\r\n\t# 628 5985 b'ee11' 5985 b'ee11'\r\n\t# 629 5995 b'c' 5995 b'c'\r\n\t# 630 5996 b'gg' 5996 b'gg'\r\n\t# [631 rows x 4 columns]\r\n\r\nThe function computes the overlapping values based on the specified condition (function or string)\r\nand returns a DataFrame with the results. If `same_index_required` is set to True, it filters\r\nthe results to include only rows where the indices match.\r\n```\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Calculate overlapping values between two arrays and return the results as a DataFrame",
"version": "0.10",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/stridesduplicatefinder"
},
"split_keywords": [
"numpy",
"numexpr",
"strides"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "f5906bddc3b23a639ed8f0460b6addad87b643ec06e70240607fc9dd7c11f2d0",
"md5": "07a9a1b3fe332f226a81fa7b07829440",
"sha256": "df2c2ccf6227ecc2c62b74bdba596031942af8668c9fdfceb860a57ba3ad201d"
},
"downloads": -1,
"filename": "stridesduplicatefinder-0.10-py3-none-any.whl",
"has_sig": false,
"md5_digest": "07a9a1b3fe332f226a81fa7b07829440",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 25213,
"upload_time": "2023-09-09T22:20:01",
"upload_time_iso_8601": "2023-09-09T22:20:01.018412Z",
"url": "https://files.pythonhosted.org/packages/f5/90/6bddc3b23a639ed8f0460b6addad87b643ec06e70240607fc9dd7c11f2d0/stridesduplicatefinder-0.10-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "f0b1e8e73fa23e2847aba474f934835167929532d145ea049640cd9e677ffef9",
"md5": "c2f9511cb87d31dcd7ee6c7f2a174766",
"sha256": "10b516d4ed9438eb11b0ddc50ca2f82dfeabf83aeb3ff53c7894eccbe403c576"
},
"downloads": -1,
"filename": "stridesduplicatefinder-0.10.tar.gz",
"has_sig": false,
"md5_digest": "c2f9511cb87d31dcd7ee6c7f2a174766",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 24534,
"upload_time": "2023-09-09T22:20:05",
"upload_time_iso_8601": "2023-09-09T22:20:05.320215Z",
"url": "https://files.pythonhosted.org/packages/f0/b1/e8e73fa23e2847aba474f934835167929532d145ea049640cd9e677ffef9/stridesduplicatefinder-0.10.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-09-09 22:20:05",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "stridesduplicatefinder",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [],
"lcname": "stridesduplicatefinder"
}