isincython


Name: isincython JSON
Version: 0.13 PyPI version JSON
download
home_page: https://github.com/hansalemaos/isincython
Summary: fast isin() function using Cython (C++) - up to 80 times faster than NumPy/Pandas.
upload_time: 2023-11-30 21:11:17
maintainer:
docs_url: None
author: Johannes Fischer
requires_python:
license: MIT
keywords: isin cython
VCS:
bugtrack_url:
requirements: No requirements were recorded.
Travis-CI: No Travis.
coveralls test coverage: No coveralls.
            
# fast isin() function using Cython (C++) - up to 80 times faster than NumPy/Pandas.

## pip install isincython

### Tested against Python 3.11 / Windows 10



## Cython (and a C/C++ compiler) must be installed to use the optimized Cython implementation.

This module provides functions for efficiently checking if elements in one array
are present in another array. It includes a Cython implementation for improved performance.

Note: The Cython implementation is compiled during the first import, and the compiled
extension module is stored in the same directory as the module. Subsequent imports load
the precompiled extension, so the compilation cost is paid only once.



```python
# Benchmark script: times fast_isin against np.isin on numeric, byte-string
# and unicode arrays, and prints whether both produce identical results.
import timeit
from isincython import generate_random_arrays, fast_isin
import numpy as np

# Each tuple in `arras` is unpacked into generate_random_arrays.
# Presumably the fields are (element count, dtype, lower bound, upper bound)
# -- TODO confirm against the generate_random_arrays signature.
size = 10000000
low = 0
high = 254
arras = [
    (size, "float32", low, high),
    (size, "float64", low, high),
    (size, np.uint8, low, high),
    (size, np.int8, low, high),
    (size, np.int16, low, high),
    (size, np.int32, low, high),
    (size, np.int64, low, high),
    (size, np.uint16, low, high),
    (size, np.uint32, low, high),
    (size, np.uint64, low, high),
]

# Number of executions per timeit measurement; every reported time is the
# total divided by `reps`, i.e. the mean per execution.
reps = 1
for a in arras:
    # arr is the array that gets searched; seq holds the values to look for
    # and is generated with size // 10 and the same remaining arguments.
    arr = generate_random_arrays(*a)
    seq = generate_random_arrays(size // 10, *a[1:])
    # The statement strings are executed by timeit with globals=globals(),
    # so they must use the module-level names `arr` and `seq` verbatim.
    s = """u=fast_isin(arr,seq)"""
    u = fast_isin(arr, seq)
    # u is used to index arr, showing the elements reported as present.
    print("c++", arr[u])
    t1 = timeit.timeit(s, globals=globals(), number=reps) / reps
    print(t1)
    # Same measurement with NumPy's np.isin as the reference.
    s2 = """q=np.isin(arr,seq)"""
    q = np.isin(arr, seq)
    print("numpy", arr[q])

    t2 = timeit.timeit(s2, globals=globals(), number=reps) / reps
    print(t2)
    # Prints True when both implementations agree element-wise.
    print(np.all(q == u))

    print("-----------------")

# Byte-string (dtype "S") sample data: 13 haystack entries to search in.
haystack = np.array(
    [
        b"Cumings",
        b"Heikkinen",
        b"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",
        b"aaa",
        b"bbbb()",
        b"Futrelle",
        b"Allen",
        b"Cumings, Mrs. John Bradley (Florence Briggs Thayer)q",
        b"Braund, Mr. Owen Harris",
        b"Heikkinen, Miss. Laina",
        b"Futrelle, Mrs. Jacques Heath (Lily May Peel)",
        b"Allen, Mr. William Henry",
        b"Braund",
    ],
    dtype="S",
)
# 8 values to look for; several differ from a haystack entry only by a few
# characters (truncated or with inserted letters), next to exact matches.
needels = np.array(
    [
        b"Braund, Mr. Owen Harris",
        b"Cumings, Mrs. John Bradley (Florence Briggs Th",
        b"Heikkinen, Miss. Lxxaina",
        b"Futrelle, Mrs. Jacqxues Heath (Lily May Peel)",
        b"Allen, Mxr. William Henry",
        b"sdfsdd",
        b"aaa",
        b"bbbb()",
    ],
    dtype="S",
)
# Repeat the sample data to get large inputs (13 * 200000 haystack entries,
# 8 * 10000 needle entries) and make sure both arrays are C-contiguous.
haystack = np.ascontiguousarray(np.concatenate([haystack for _ in range(200000)]))
needels = np.ascontiguousarray(np.concatenate([needels for _ in range(10000)]))

# Time both implementations on the byte-string arrays, then compare results.
s = "o = fast_isin(haystack, needels)"
t1 = timeit.timeit(s, globals=globals(), number=reps) / reps
s1 = "o = np.isin(haystack, needels)"
t2 = timeit.timeit(s1, globals=globals(), number=reps) / reps
print(f"c++ {t1}")
print(f"numpy {t2}")
o1 = fast_isin(haystack, needels)
o2 = np.isin(haystack, needels)
print(np.all(o1 == o2))
# Convert both arrays to unicode (dtype "U") and repeat the same timing and
# equality check.
needels = needels.astype("U")
haystack = haystack.astype("U")
s = "o = fast_isin(haystack, needels)"
t1 = timeit.timeit(s, globals=globals(), number=reps) / reps
s1 = "o = np.isin(haystack, needels)"
t2 = timeit.timeit(s1, globals=globals(), number=reps) / reps
print(f"c++ {t1}")
print(f"numpy {t2}")
o1 = fast_isin(haystack, needels)
o2 = np.isin(haystack, needels)
print(np.all(o1 == o2))

# c++ [136.03264   62.5741   156.39038  ...  78.545906 229.14676  186.44472 ]
# 0.39614199999778066
# numpy [136.03264   62.5741   156.39038  ...  78.545906 229.14676  186.44472 ]
# 2.1623376999996253
# True
# -----------------
# c++ []
# 0.4184691000045859
# numpy []
# 2.189824300003238
# True
# -----------------
# c++ [126 128  31 ... 113 190 146]
# 0.011114299995824695
# numpy [126 128  31 ... 113 190 146]
# 0.05381579999811947
# True
# -----------------
# c++ [  23   35   52 ...   54   98 -125]
# 0.010347299998102244
# numpy [  23   35   52 ...   54   98 -125]
# 0.8121466000011424
# True
# -----------------
# c++ [144  29  89 ...  90  34 202]
# 0.012101899999834131
# numpy [144  29  89 ...  90  34 202]
# 0.05841199999849778
# True
# -----------------
# c++ [ 93  51 131 ... 231 147 140]
# 0.013264799999888055
# numpy [ 93  51 131 ... 231 147 140]
# 0.07822610000584973
# True
# -----------------
# c++ [138 158 233 ...  64  82 160]
# 0.018734699995548
# numpy [138 158 233 ...  64  82 160]
# 0.09425780000310624
# True
# -----------------
# c++ [158  17 126 ...  55   7 116]
# 0.011595800002396572
# numpy [158  17 126 ...  55   7 116]
# 0.06014610000420362
# True
# -----------------
# c++ [ 60  12 226 ... 152 190 155]
# 0.013999900002090726
# numpy [ 60  12 226 ... 152 190 155]
# 0.07416449999436736
# True
# -----------------
# c++ [239  84  81 ... 146  85  63]
# 0.026196500002697576
# numpy [239  84  81 ... 146  85  63]
# 0.11476380000385689
# True
# -----------------
# c++ 0.7991062000000966
# numpy 2.1993997000026866
# True
# c++ 1.7051588000031188
# numpy 3.0464809000040987
# True

```

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/hansalemaos/isincython",
    "name": "isincython",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "isin,Cython",
    "author": "Johannes Fischer",
    "author_email": "aulasparticularesdealemaosp@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/c1/d1/62bbeb2467423950753fc8796784290649ff0aecd2447182f13e66b50bef/isincython-0.13.tar.gz",
    "platform": null,
    "description": "\r\n# fast isin() function using Cython (C++) - up to 80 times faster than NumPy/Pandas.\r\n\r\n## pip install isincython\r\n\r\n### Tested against Python 3.11 / Windows 10\r\n\r\n\r\n\r\n## Cython (and a C/C++ compiler) must be installed to use the optimized Cython implementation.\r\n\r\nThis module provides functions for efficiently checking if elements in one array\r\nare present in another array. It includes a Cython implementation for improved performance.\r\n\r\nNote: The Cython implementation is compiled during the first import, and the compiled\r\nextension module is stored in the same directory. Subsequent imports will use the\r\nprecompiled module for improved performance.\r\n\r\n\r\n\r\n```python\r\nimport timeit\r\nfrom isincython import generate_random_arrays, fast_isin\r\nimport numpy as np\r\n\r\nsize = 10000000\r\nlow = 0\r\nhigh = 254\r\narras = [\r\n    (size, \"float32\", low, high),\r\n    (size, \"float64\", low, high),\r\n    (size, np.uint8, low, high),\r\n    (size, np.int8, low, high),\r\n    (size, np.int16, low, high),\r\n    (size, np.int32, low, high),\r\n    (size, np.int64, low, high),\r\n    (size, np.uint16, low, high),\r\n    (size, np.uint32, low, high),\r\n    (size, np.uint64, low, high),\r\n]\r\n\r\nreps = 1\r\nfor a in arras:\r\n    arr = generate_random_arrays(*a)\r\n    seq = generate_random_arrays(size // 10, *a[1:])\r\n    s = \"\"\"u=fast_isin(arr,seq)\"\"\"\r\n    u = fast_isin(arr, seq)\r\n    print(\"c++\", arr[u])\r\n    t1 = timeit.timeit(s, globals=globals(), number=reps) / reps\r\n    print(t1)\r\n    s2 = \"\"\"q=np.isin(arr,seq)\"\"\"\r\n    q = np.isin(arr, seq)\r\n    print(\"numpy\", arr[q])\r\n\r\n    t2 = timeit.timeit(s2, globals=globals(), number=reps) / reps\r\n    print(t2)\r\n    print(np.all(q == u))\r\n\r\n    print(\"-----------------\")\r\n\r\nhaystack = np.array(\r\n    [\r\n        b\"Cumings\",\r\n        b\"Heikkinen\",\r\n        b\"Cumings, Mrs. 
John Bradley (Florence Briggs Thayer)\",\r\n        b\"aaa\",\r\n        b\"bbbb()\",\r\n        b\"Futrelle\",\r\n        b\"Allen\",\r\n        b\"Cumings, Mrs. John Bradley (Florence Briggs Thayer)q\",\r\n        b\"Braund, Mr. Owen Harris\",\r\n        b\"Heikkinen, Miss. Laina\",\r\n        b\"Futrelle, Mrs. Jacques Heath (Lily May Peel)\",\r\n        b\"Allen, Mr. William Henry\",\r\n        b\"Braund\",\r\n    ],\r\n    dtype=\"S\",\r\n)\r\nneedels = np.array(\r\n    [\r\n        b\"Braund, Mr. Owen Harris\",\r\n        b\"Cumings, Mrs. John Bradley (Florence Briggs Th\",\r\n        b\"Heikkinen, Miss. Lxxaina\",\r\n        b\"Futrelle, Mrs. Jacqxues Heath (Lily May Peel)\",\r\n        b\"Allen, Mxr. William Henry\",\r\n        b\"sdfsdd\",\r\n        b\"aaa\",\r\n        b\"bbbb()\",\r\n    ],\r\n    dtype=\"S\",\r\n)\r\nhaystack = np.ascontiguousarray(np.concatenate([haystack for _ in range(200000)]))\r\nneedels = np.ascontiguousarray(np.concatenate([needels for _ in range(10000)]))\r\n\r\ns = \"o = fast_isin(haystack, needels)\"\r\nt1 = timeit.timeit(s, globals=globals(), number=reps) / reps\r\ns1 = \"o = np.isin(haystack, needels)\"\r\nt2 = timeit.timeit(s1, globals=globals(), number=reps) / reps\r\nprint(f\"c++ {t1}\")\r\nprint(f\"numpy {t2}\")\r\no1 = fast_isin(haystack, needels)\r\no2 = np.isin(haystack, needels)\r\nprint(np.all(o1 == o2))\r\nneedels = needels.astype(\"U\")\r\nhaystack = haystack.astype(\"U\")\r\ns = \"o = fast_isin(haystack, needels)\"\r\nt1 = timeit.timeit(s, globals=globals(), number=reps) / reps\r\ns1 = \"o = np.isin(haystack, needels)\"\r\nt2 = timeit.timeit(s1, globals=globals(), number=reps) / reps\r\nprint(f\"c++ {t1}\")\r\nprint(f\"numpy {t2}\")\r\no1 = fast_isin(haystack, needels)\r\no2 = np.isin(haystack, needels)\r\nprint(np.all(o1 == o2))\r\n\r\n# c++ [136.03264   62.5741   156.39038  ...  78.545906 229.14676  186.44472 ]\r\n# 0.39614199999778066\r\n# numpy [136.03264   62.5741   156.39038  ...  
78.545906 229.14676  186.44472 ]\r\n# 2.1623376999996253\r\n# True\r\n# -----------------\r\n# c++ []\r\n# 0.4184691000045859\r\n# numpy []\r\n# 2.189824300003238\r\n# True\r\n# -----------------\r\n# c++ [126 128  31 ... 113 190 146]\r\n# 0.011114299995824695\r\n# numpy [126 128  31 ... 113 190 146]\r\n# 0.05381579999811947\r\n# True\r\n# -----------------\r\n# c++ [  23   35   52 ...   54   98 -125]\r\n# 0.010347299998102244\r\n# numpy [  23   35   52 ...   54   98 -125]\r\n# 0.8121466000011424\r\n# True\r\n# -----------------\r\n# c++ [144  29  89 ...  90  34 202]\r\n# 0.012101899999834131\r\n# numpy [144  29  89 ...  90  34 202]\r\n# 0.05841199999849778\r\n# True\r\n# -----------------\r\n# c++ [ 93  51 131 ... 231 147 140]\r\n# 0.013264799999888055\r\n# numpy [ 93  51 131 ... 231 147 140]\r\n# 0.07822610000584973\r\n# True\r\n# -----------------\r\n# c++ [138 158 233 ...  64  82 160]\r\n# 0.018734699995548\r\n# numpy [138 158 233 ...  64  82 160]\r\n# 0.09425780000310624\r\n# True\r\n# -----------------\r\n# c++ [158  17 126 ...  55   7 116]\r\n# 0.011595800002396572\r\n# numpy [158  17 126 ...  55   7 116]\r\n# 0.06014610000420362\r\n# True\r\n# -----------------\r\n# c++ [ 60  12 226 ... 152 190 155]\r\n# 0.013999900002090726\r\n# numpy [ 60  12 226 ... 152 190 155]\r\n# 0.07416449999436736\r\n# True\r\n# -----------------\r\n# c++ [239  84  81 ... 146  85  63]\r\n# 0.026196500002697576\r\n# numpy [239  84  81 ... 146  85  63]\r\n# 0.11476380000385689\r\n# True\r\n# -----------------\r\n# c++ 0.7991062000000966\r\n# numpy 2.1993997000026866\r\n# True\r\n# c++ 1.7051588000031188\r\n# numpy 3.0464809000040987\r\n# True\r\n\r\n```\r\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "fast isin() function using Cython (C++) - up to 80 times faster than NumPy/Pandas.",
    "version": "0.13",
    "project_urls": {
        "Homepage": "https://github.com/hansalemaos/isincython"
    },
    "split_keywords": [
        "isin",
        "cython"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "afdf7c4cd9343c5e54ca68903f47ec235253af9e4ca2f252ccb9cda72f572cb2",
                "md5": "52b41908598903cb1181263a884378ec",
                "sha256": "c52c3d5fb998e865a587704c8faeb6af111dc456bafca1c4499bea78a75bed75"
            },
            "downloads": -1,
            "filename": "isincython-0.13-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "52b41908598903cb1181263a884378ec",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 24186,
            "upload_time": "2023-11-30T21:11:15",
            "upload_time_iso_8601": "2023-11-30T21:11:15.867910Z",
            "url": "https://files.pythonhosted.org/packages/af/df/7c4cd9343c5e54ca68903f47ec235253af9e4ca2f252ccb9cda72f572cb2/isincython-0.13-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "c1d162bbeb2467423950753fc8796784290649ff0aecd2447182f13e66b50bef",
                "md5": "7465de3ecf11e60352b3c446a3fd5824",
                "sha256": "b6d9da45f3f4c1ed9b1f957cf6bab7c43bb915943fda88586c942e4aa9f2f9a6"
            },
            "downloads": -1,
            "filename": "isincython-0.13.tar.gz",
            "has_sig": false,
            "md5_digest": "7465de3ecf11e60352b3c446a3fd5824",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 24524,
            "upload_time": "2023-11-30T21:11:17",
            "upload_time_iso_8601": "2023-11-30T21:11:17.826191Z",
            "url": "https://files.pythonhosted.org/packages/c1/d1/62bbeb2467423950753fc8796784290649ff0aecd2447182f13e66b50bef/isincython-0.13.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-11-30 21:11:17",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "hansalemaos",
    "github_project": "isincython",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "requirements": [],
    "lcname": "isincython"
}
        
Elapsed time: 0.31147s