stridesduplicatefinder


Name: stridesduplicatefinder JSON
Version: 0.10 PyPI version JSON
download:
home_page: https://github.com/hansalemaos/stridesduplicatefinder
Summary: Calculate overlapping values between two arrays and return the results as a DataFrame
upload_time: 2023-09-09 22:20:05
maintainer:
docs_url: None
author: Johannes Fischer
requires_python:
license: MIT
keywords: numpy numexpr strides
VCS:
bugtrack_url:
requirements: No requirements were recorded.
Travis-CI: No Travis.
coveralls test coverage: No coveralls.
            
# Calculate overlapping values between two arrays and return the results as a DataFrame

## Tested against Windows 10 / Python 3.10 / Anaconda

## pip install stridesduplicatefinder

#### Problem: you have two lists of different sizes and want to find the overlapping values.

## Using pure Python - working, but slow

#### all indices / same values

```python
# Two small sample lists of different lengths.
a1 = [1, 2, 3, 4, 5, 6, 7]
a2 = [0, 0, 3, 1, 5, 6, 8, 1, 32]
# Compare every element of a1 with every element of a2 and keep each
# (position in a1, position in a2, value from a1, value from a2) tuple
# whose two values are equal.
res1 = [
    (pos1, pos2, left, right)
    for pos1, left in enumerate(a1)
    for pos2, right in enumerate(a2)
    if left == right
]
print(res1)
# [(0, 3, 1, 1), (0, 7, 1, 1), (2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]
```

#### same indices / same values

```python
# Compare every element of a1 with every element of a2, keeping a tuple
# only when both the positions and the values are equal.
res2 = [
    (pos1, pos2, left, right)
    for pos1, left in enumerate(a1)
    for pos2, right in enumerate(a2)
    if pos1 == pos2 and left == right
]
print(res2)
# [(2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]
```

## Using stridesduplicatefinder - numpy or numexpr

```python
from stridesduplicatefinder import get_overlapping

def test_numexpr():
    """Time one get_overlapping call that uses a string condition.

    Reads the module-level a1 and a2, passes numpy_or_numexpr="numexpr",
    prints the elapsed seconds and then the value get_overlapping returned.
    """
    began = perf_counter()
    overlap = get_overlapping(
        fu="a==b",
        a=a1,
        b=a2,
        numpy_or_numexpr="numexpr",
        same_index_required=False,
    )
    # The clock is read again before anything is printed, so the printing
    # itself is not part of the measured time.
    print(f"numexpr test: {perf_counter() - began}")
    print(overlap)


def test_numpy():
    """Time one get_overlapping call that uses a callable condition.

    Reads the module-level a1 and a2, passes numpy_or_numexpr="numpy",
    prints the elapsed seconds and then the value get_overlapping returned.
    """
    began = perf_counter()
    overlap = get_overlapping(
        fu=lambda a, b: a == b,
        a=a1,
        b=a2,
        numpy_or_numexpr="numpy",
        same_index_required=False,
    )
    # The clock is read again before anything is printed, so the printing
    # itself is not part of the measured time.
    print(f"numpy test: {perf_counter() - began}")
    print(overlap)


def python_test():
    """Time the pure-Python nested scan over the module-level a1 and a2.

    Builds one (index1, index2, value1, value2) tuple for every pair of
    equal values, where index1/value1 come from a1 and index2/value2 come
    from a2. Prints the elapsed seconds, then the first ten tuples.
    """
    start = perf_counter()
    # index1 walks a1 and index2 walks a2, so the two indices appear in the
    # same order as the two values they belong to.
    matches = [
        (index1, index2, value1, value2)
        for index1, value1 in enumerate(a1)
        for index2, value2 in enumerate(a2)
        if value1 == value2
    ]
    print(f"python test: {perf_counter() - start}")
    print(matches[:10])



# Imports needed by the benchmark: numpy builds the sample arrays and
# perf_counter is the timer the three test functions read.
import numpy as np
from time import perf_counter

# Random integers in [1, 100); the two arrays deliberately differ in length.
a1 = np.random.randint(1, 100, size=(19000,), dtype=np.int64)
a2 = np.random.randint(1, 100, size=(7777,), dtype=np.int64)

# The timings in the comments below are the ones recorded by the author.
python_test()
# python test: 13.229658300006122

test_numpy()
# numpy test: 0.5666937999994843

test_numexpr()
# numexpr test: 0.48387080000247806


```


```python

Calculate overlapping values between two arrays and return the results as a DataFrame.

Parameters:
- fu: function or string to be evaluated as a condition for overlap.
- a: First input array.
- b: Second input array.
- numpy_or_numexpr: 'numpy' or 'numexpr' indicating the evaluation method.
- same_index_required: If True, only return rows where index1 == index2.

Returns:
- A DataFrame with columns 'index1', 'value1', 'index2', 'value2' containing
  information about overlapping values.

Example Usage:
- To find overlapping values between two NumPy arrays:
  
  a1 = np.random.randint(1, 10, size=(100000,))
  a2 = np.random.randint(1, 10, size=(100,))
  df1 = get_overlapping(
	  fu="a==b", a=a1, b=a2, numpy_or_numexpr="numexpr", same_index_required=True
  )
  print(df1)
  

- To find overlapping values using a custom function:
  
  a1 = np.random.randint(1, 10, size=(100000,))
  a2 = np.random.randint(1, 10, size=(100,))
  df2 = get_overlapping(
	  fu=lambda a, b: a == b,
	  a=a1,
	  b=a2,
	  numpy_or_numexpr="numpy",
	  same_index_required=False,
  )
  print(df2)
  

- To find overlapping values between two arrays of strings:
  
  a1 = np.array(["aa", "b", "c", "d", "ee11", "f", "gg", "h", "i", "j"])
  a1 = np.repeat(a1, 1000)
  a2 = np.array(["aa", "b", "c", "ee11", "f", "gg"])
  a2 = np.repeat(a2, 1000)
  np.random.shuffle(a1)
  np.random.shuffle(a2)
  df3 = get_overlapping(
	  fu="a == b",
	  a=np.char.array(a1).encode("utf-8"),
	  b=np.char.array(a2).encode("utf-8"),
	  numpy_or_numexpr="numexpr",
	  same_index_required=True,
  )
  print(df3)
  
	#     index1  value1  index2  value2
	# 0        5       1       5       1
	# 1       20       8      20       8
	# 2       33       5      33       5
	# 3       34       1      34       1
	# 4       41       5      41       5
	# 5       43       2      43       2
	# 6       51       7      51       7
	# 7       52       1      52       1
	# 8       55       7      55       7
	# 9       57       1      57       1
	# 10      70       2      70       2
	# 11      74       8      74       8


	#          index1  value1  index2  value2
	# 0             0       4       8       4
	# 1             0       4      12       4
	# 2             0       4      13       4
	# 3             0       4      26       4
	# 4             0       4      53       4
	#          ...     ...     ...     ...
	# 1112213   99999       9      47       9
	# 1112214   99999       9      62       9
	# 1112215   99999       9      72       9
	# 1112216   99999       9      81       9
	# 1112217   99999       9      96       9
	# [1112218 rows x 4 columns]


	#          index1 value1  index2 value2
	# 0             1     gg       4     gg
	# 1             1     gg       5     gg
	# 2             1     gg      10     gg
	# 3             1     gg      13     gg
	# 4             1     gg      17     gg
	#          ...    ...     ...    ...
	# 5999995    9999      c    5978      c
	# 5999996    9999      c    5979      c
	# 5999997    9999      c    5990      c
	# 5999998    9999      c    5992      c
	# 5999999    9999      c    5995      c
	# [6000000 rows x 4 columns]


	#      index1   value1  index2   value2
	# 0        31    b'aa'      31    b'aa'
	# 1        40     b'b'      40     b'b'
	# 2        46    b'aa'      46    b'aa'
	# 3        47    b'gg'      47    b'gg'
	# 4        65     b'b'      65     b'b'
	# ..      ...      ...     ...      ...
	# 626    5966    b'aa'    5966    b'aa'
	# 627    5982     b'f'    5982     b'f'
	# 628    5985  b'ee11'    5985  b'ee11'
	# 629    5995     b'c'    5995     b'c'
	# 630    5996    b'gg'    5996    b'gg'
	# [631 rows x 4 columns]

The function computes the overlapping values based on the specified condition (function or string)
and returns a DataFrame with the results. If `same_index_required` is set to True, it filters
the results to include only rows where the indices match.
```

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/hansalemaos/stridesduplicatefinder",
    "name": "stridesduplicatefinder",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "numpy,numexpr,strides",
    "author": "Johannes Fischer",
    "author_email": "aulasparticularesdealemaosp@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/f0/b1/e8e73fa23e2847aba474f934835167929532d145ea049640cd9e677ffef9/stridesduplicatefinder-0.10.tar.gz",
    "platform": null,
    "description": "\r\n# Calculate overlapping values between two arrays and return the results as a DataFrame\r\n\r\n## Tested against Windows 10 / Python 3.10 / Anaconda\r\n\r\n## pip install stridesduplicatefinder\r\n\r\n#### Problem: you have to lists of different sizes and want to find the overlapping values. \r\n\r\n## Using pure Python - working, but slow\r\n\r\n#### all indices / same values\r\n\r\n```python\r\na1=[1,2,3,4,5,6,7]\r\na2=[0,0,3,1,5,6,8,1,32,]\r\nres1=[(index1, index2, value1, value2) for index1, value1 in enumerate(a1) for index2, value2 in enumerate(a2) if value1 == value2]\r\nprint(res1)\r\n# [(0, 3, 1, 1), (0, 7, 1, 1), (2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]\r\n```\r\n\r\n#### same indices / same values\r\n\r\n```python\r\nres2=[(index1, index2, value1, value2) for index1, value1 in enumerate(a1) for index2, value2 in enumerate(a2) if value1 == value2 and index1==index2]\r\nprint(res2)\r\n# [(2, 2, 3, 3), (4, 4, 5, 5), (5, 5, 6, 6)]\r\n```\r\n\r\n## Using stridesduplicatefinder - numpy or numexpr\r\n\r\n```python\r\nfrom stridesduplicatefinder import get_overlapping\r\n\r\ndef test_numexpr():\r\n    start = perf_counter()\r\n\r\n    _ = get_overlapping(\r\n        fu=\"a==b\", a=a1, b=a2, numpy_or_numexpr=\"numexpr\", same_index_required=False\r\n    )\r\n    print(f\"numexpr test: {perf_counter() - start}\")\r\n    print(_)\r\n\r\n\r\ndef test_numpy():\r\n    start = perf_counter()\r\n    _ = get_overlapping(\r\n        fu=lambda a, b: a == b,\r\n        a=a1,\r\n        b=a2,\r\n        numpy_or_numexpr=\"numpy\",\r\n        same_index_required=False,\r\n    )\r\n    print(f\"numpy test: {perf_counter() - start}\")\r\n    print(_)\r\n\r\n\r\ndef python_test():\r\n    start = perf_counter()\r\n    _ = [(i1, i2, a, b) for i2, a in enumerate(a1) for i1, b in enumerate(a2) if a == b]\r\n    print(f\"python test: {perf_counter() - start}\")\r\n    print(_[:10])\r\n\r\n\r\n\r\na1 = np.random.randint(1, 100, 
size=(19000,),dtype=np.int64)\r\na2 = np.random.randint(1, 100, size=(7777,),dtype=np.int64)\r\nfrom time import perf_counter\r\n\r\npython_test()\r\n# python test: 13.229658300006122\r\n\r\ntest_numpy()\r\n# numpy test: 0.5666937999994843\r\n\r\ntest_numexpr()\r\n# numexpr test: 0.48387080000247806\r\n\r\n\r\n```\r\n\r\n\r\n```python\r\n\r\nCalculate overlapping values between two arrays and return the results as a DataFrame.\r\n\r\nParameters:\r\n- fu: function or string to be evaluated as a condition for overlap.\r\n- a: First input array.\r\n- b: Second input array.\r\n- numpy_or_numexpr: 'numpy' or 'numexpr' indicating the evaluation method.\r\n- same_index_required: If True, only return rows where index1 == index2.\r\n\r\nReturns:\r\n- A DataFrame with columns 'index1', 'value1', 'index2', 'value2' containing\r\n  information about overlapping values.\r\n\r\nExample Usage:\r\n- To find overlapping values between two NumPy arrays:\r\n  \r\n  a1 = np.random.randint(1, 10, size=(100000,))\r\n  a2 = np.random.randint(1, 10, size=(100,))\r\n  df1 = get_overlapping(\r\n\t  fu=\"a==b\", a=a1, b=a2, numpy_or_numexpr=\"numexpr\", same_index_required=True\r\n  )\r\n  print(df1)\r\n  \r\n\r\n- To find overlapping values using a custom function:\r\n  \r\n  a1 = np.random.randint(1, 10, size=(100000,))\r\n  a2 = np.random.randint(1, 10, size=(100,))\r\n  df2 = get_overlapping(\r\n\t  fu=lambda a, b: a == b,\r\n\t  a=a1,\r\n\t  b=a2,\r\n\t  numpy_or_numexpr=\"numpy\",\r\n\t  same_index_required=False,\r\n  )\r\n  print(df2)\r\n  \r\n\r\n- To find overlapping values between two arrays of strings:\r\n  \r\n  a1 = np.array([\"aa\", \"b\", \"c\", \"d\", \"ee11\", \"f\", \"gg\", \"h\", \"i\", \"j\"])\r\n  a1 = np.repeat(a1, 1000)\r\n  a2 = np.array([\"aa\", \"b\", \"c\", \"ee11\", \"f\", \"gg\"])\r\n  a2 = np.repeat(a2, 1000)\r\n  np.random.shuffle(a1)\r\n  np.random.shuffle(a2)\r\n  df3 = get_overlapping(\r\n\t  fu=\"a == b\",\r\n\t  
a=np.char.array(a1).encode(\"utf-8\"),\r\n\t  b=np.char.array(a2).encode(\"utf-8\"),\r\n\t  numpy_or_numexpr=\"numexpr\",\r\n\t  same_index_required=True,\r\n  )\r\n  print(df3)\r\n  \r\n\t\t#     index1  value1  index2  value2\r\n\t# 0        5       1       5       1\r\n\t# 1       20       8      20       8\r\n\t# 2       33       5      33       5\r\n\t# 3       34       1      34       1\r\n\t# 4       41       5      41       5\r\n\t# 5       43       2      43       2\r\n\t# 6       51       7      51       7\r\n\t# 7       52       1      52       1\r\n\t# 8       55       7      55       7\r\n\t# 9       57       1      57       1\r\n\t# 10      70       2      70       2\r\n\t# 11      74       8      74       8\r\n\r\n\r\n\t#          index1  value1  index2  value2\r\n\t# 0             0       4       8       4\r\n\t# 1             0       4      12       4\r\n\t# 2             0       4      13       4\r\n\t# 3             0       4      26       4\r\n\t# 4             0       4      53       4\r\n\t#          ...     ...     ...     ...\r\n\t# 1112213   99999       9      47       9\r\n\t# 1112214   99999       9      62       9\r\n\t# 1112215   99999       9      72       9\r\n\t# 1112216   99999       9      81       9\r\n\t# 1112217   99999       9      96       9\r\n\t# [1112218 rows x 4 columns]\r\n\r\n\r\n\t#          index1 value1  index2 value2\r\n\t# 0             1     gg       4     gg\r\n\t# 1             1     gg       5     gg\r\n\t# 2             1     gg      10     gg\r\n\t# 3             1     gg      13     gg\r\n\t# 4             1     gg      17     gg\r\n\t#          ...    ...     ...    
...\r\n\t# 5999995    9999      c    5978      c\r\n\t# 5999996    9999      c    5979      c\r\n\t# 5999997    9999      c    5990      c\r\n\t# 5999998    9999      c    5992      c\r\n\t# 5999999    9999      c    5995      c\r\n\t# [6000000 rows x 4 columns]\r\n\r\n\r\n\t#      index1   value1  index2   value2\r\n\t# 0        31    b'aa'      31    b'aa'\r\n\t# 1        40     b'b'      40     b'b'\r\n\t# 2        46    b'aa'      46    b'aa'\r\n\t# 3        47    b'gg'      47    b'gg'\r\n\t# 4        65     b'b'      65     b'b'\r\n\t# ..      ...      ...     ...      ...\r\n\t# 626    5966    b'aa'    5966    b'aa'\r\n\t# 627    5982     b'f'    5982     b'f'\r\n\t# 628    5985  b'ee11'    5985  b'ee11'\r\n\t# 629    5995     b'c'    5995     b'c'\r\n\t# 630    5996    b'gg'    5996    b'gg'\r\n\t# [631 rows x 4 columns]\r\n\r\nThe function computes the overlapping values based on the specified condition (function or string)\r\nand returns a DataFrame with the results. If `same_index_required` is set to True, it filters\r\nthe results to include only rows where the indices match.\r\n```\r\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "Calculate overlapping values between two arrays and return the results as a DataFrame",
    "version": "0.10",
    "project_urls": {
        "Homepage": "https://github.com/hansalemaos/stridesduplicatefinder"
    },
    "split_keywords": [
        "numpy",
        "numexpr",
        "strides"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "f5906bddc3b23a639ed8f0460b6addad87b643ec06e70240607fc9dd7c11f2d0",
                "md5": "07a9a1b3fe332f226a81fa7b07829440",
                "sha256": "df2c2ccf6227ecc2c62b74bdba596031942af8668c9fdfceb860a57ba3ad201d"
            },
            "downloads": -1,
            "filename": "stridesduplicatefinder-0.10-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "07a9a1b3fe332f226a81fa7b07829440",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 25213,
            "upload_time": "2023-09-09T22:20:01",
            "upload_time_iso_8601": "2023-09-09T22:20:01.018412Z",
            "url": "https://files.pythonhosted.org/packages/f5/90/6bddc3b23a639ed8f0460b6addad87b643ec06e70240607fc9dd7c11f2d0/stridesduplicatefinder-0.10-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "f0b1e8e73fa23e2847aba474f934835167929532d145ea049640cd9e677ffef9",
                "md5": "c2f9511cb87d31dcd7ee6c7f2a174766",
                "sha256": "10b516d4ed9438eb11b0ddc50ca2f82dfeabf83aeb3ff53c7894eccbe403c576"
            },
            "downloads": -1,
            "filename": "stridesduplicatefinder-0.10.tar.gz",
            "has_sig": false,
            "md5_digest": "c2f9511cb87d31dcd7ee6c7f2a174766",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 24534,
            "upload_time": "2023-09-09T22:20:05",
            "upload_time_iso_8601": "2023-09-09T22:20:05.320215Z",
            "url": "https://files.pythonhosted.org/packages/f0/b1/e8e73fa23e2847aba474f934835167929532d145ea049640cd9e677ffef9/stridesduplicatefinder-0.10.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-09-09 22:20:05",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "hansalemaos",
    "github_project": "stridesduplicatefinder",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "requirements": [],
    "lcname": "stridesduplicatefinder"
}
        
Elapsed time: 0.12067s