tesseractmultiprocessing

Name	tesseractmultiprocessing JSON
Version	0.10 JSON
	download
home_page	https://github.com/hansalemaos/tesseractmultiprocessing
Summary	Multiprocessing OCR with Tesseract
upload_time	2023-03-10 06:04:47
maintainer
docs_url	None
author	Johannes Fischer
requires_python
license	MIT
keywords	tesseract multiprocessing threads ocr
VCS
bugtrack_url
requirements	a_cv_imwrite_imread_plus callpyfile opencv_python pandas pathos
Travis-CI	No Travis.
coveralls test coverage	No coveralls.

            
# Multiprocessing OCR with Tesseract



## pip install tesseractmultiprocessing



Worth using if you:

1) have plenty of different files 

2) are using numpy



#### Multi (5 CPUs, 400 images): 23.9910116 seconds



#### One CPU (plain pytesseract, same 400 images): 100.61128 seconds





```python

from tesseractmultiprocessing import tesser2df

from a_cv_imwrite_imread_plus import open_image_in_cv

from time import perf_counter



# Four sample screenshots used as OCR input for the benchmark.
picslinks = [

    r"https://github.com/hansalemaos/screenshots/raw/main/pandsnesteddicthtml.png",

    r"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000000.png",

    r"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000008.png",

    r"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000017.png",

]

# Load every link once; presumably open_image_in_cv returns each picture as a
# numpy image array (see the sample output below) - confirm in its docs.
picsunique = [open_image_in_cv(x) for x in picslinks]

# Repeat the four loaded images 100 times to get a 400-item workload.
pics = []

for _ in range(100):

    pics.extend(picsunique)



# Time the multiprocessing run over the whole workload.
start = perf_counter()

output = tesser2df(

    pics,

    language="eng",

    # NOTE(review): presumably forwarded to pandas when the Tesseract output is
    # parsed - confirm against the tesser2df documentation.
    pandas_kwargs={"on_bad_lines": "warn"},

    tesser_args=(),

    cpus=5,

    # Windows path to the Tesseract executable; adjust for your system.
    tesser_path=r"C:\Program Files\Tesseract-OCR\tesseract.exe",

)

print(f"Multi: {perf_counter()-start}")





################################################################################



import pytesseract



# Tell pytesseract where the Tesseract executable lives (Windows path shown
# here; adjust for your system).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"





def st():
    """Run pytesseract.image_to_data on each image in the global ``pics``.

    The images are processed one after another in the calling process, which
    makes this the single-CPU baseline for the benchmark. Returns a list with
    one ``image_to_data`` result per image, in the same order as ``pics``.
    """
    return [pytesseract.image_to_data(image) for image in pics]





# Time the sequential pytesseract baseline over the same `pics` list so the
# result can be compared with the "Multi" timing above.
start = perf_counter()

output2 = st()

print(f"One CPU: {perf_counter()-start}")





# Multi: 23.9910116

# One CPU: 100.61128



# output[0]

# Out[4]:

# (    level  page_num  block_num  par_num  ...  start_x  start_y  end_x  end_y

#  0       1         1          0        0  ...        0        0   1465    654

#  1       2         1          1        0  ...      322       64    327    540

#  2       3         1          1        1  ...      322       64    327    540

#  3       4         1          1        1  ...      322       64    327    540

#  4       5         1          1        1  ...      322       64    327    540

#  ..    ...       ...        ...      ...  ...      ...      ...    ...    ...

#  60      5         1         11        1  ...       14      633   1448    644

#  61      2         1         12        0  ...     1445       15   1450    639

#  62      3         1         12        1  ...     1445       15   1450    639

#  63      4         1         12        1  ...     1445       15   1450    639

#  64      5         1         12        1  ...     1445       15   1450    639

#

#  [65 rows x 19 columns],

#  array([[[255, 255, 255],

#          [255, 255, 255],

#          [255, 255, 255],

#          ...,

#          [255, 255, 255],

#          [255, 255, 255],

#          [255, 255, 255]],

#

#         [[255, 255, 255],

#          [255, 255, 255],

#          [255, 255, 255],

#          ...,

#          [255, 255, 255],

#          [255, 255, 255],

#          [255, 255, 255]],



```

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/hansalemaos/tesseractmultiprocessing",
    "name": "tesseractmultiprocessing",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "tesseract,multiprocessing,threads,ocr",
    "author": "Johannes Fischer",
    "author_email": "<aulasparticularesdealemaosp@gmail.com>",
    "download_url": "https://files.pythonhosted.org/packages/8f/85/b5818df606ef38ffba3cfad84fc2db62f9d71c9e96fe78bba39ad901e32c/tesseractmultiprocessing-0.10.tar.gz",
    "platform": null,
    "description": "\n# Multiprocessing OCR with Tesseract\n\n\n\n## pip install tesseractmultiprocessing\n\n\n\nWorth using if you:\n\n1) have plenty of different files \n\n2) are using numpy\n\n\n\n#### Multi: 23.9910116\n\n\n\n#### One CPU: 100.61128 #pytesseract\n\n\n\n\n\n```python\n\nfrom tesseractmultiprocessing import tesser2df\n\nfrom a_cv_imwrite_imread_plus import open_image_in_cv\n\nfrom time import perf_counter\n\n\n\npicslinks = [\n\n    r\"https://github.com/hansalemaos/screenshots/raw/main/pandsnesteddicthtml.png\",\n\n    r\"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000000.png\",\n\n    r\"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000008.png\",\n\n    r\"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000017.png\",\n\n]\n\npicsunique = [open_image_in_cv(x) for x in picslinks]\n\npics = []\n\nfor _ in range(100):\n\n    pics.extend(picsunique)\n\n\n\nstart = perf_counter()\n\noutput = tesser2df(\n\n    pics,\n\n    language=\"eng\",\n\n    pandas_kwargs={\"on_bad_lines\": \"warn\"},\n\n    tesser_args=(),\n\n    cpus=5,\n\n    tesser_path=r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\",\n\n)\n\nprint(f\"Multi: {perf_counter()-start}\")\n\n\n\n\n\n################################################################################\n\n\n\nimport pytesseract\n\n\n\npytesseract.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n\n\n\n\n\ndef st():\n\n    alla = []\n\n    for p in pics:\n\n        alla.append(pytesseract.image_to_data(p))\n\n    return alla\n\n\n\n\n\nstart = perf_counter()\n\noutput2 = st()\n\nprint(f\"One CPU: {perf_counter()-start}\")\n\n\n\n\n\n# Multi: 23.9910116\n\n# One CPU: 100.61128\n\n\n\n# output[0]\n\n# Out[4]:\n\n# (    level  page_num  block_num  par_num  ...  start_x  start_y  end_x  end_y\n\n#  0       1         1          0        0  ...        
0        0   1465    654\n\n#  1       2         1          1        0  ...      322       64    327    540\n\n#  2       3         1          1        1  ...      322       64    327    540\n\n#  3       4         1          1        1  ...      322       64    327    540\n\n#  4       5         1          1        1  ...      322       64    327    540\n\n#  ..    ...       ...        ...      ...  ...      ...      ...    ...    ...\n\n#  60      5         1         11        1  ...       14      633   1448    644\n\n#  61      2         1         12        0  ...     1445       15   1450    639\n\n#  62      3         1         12        1  ...     1445       15   1450    639\n\n#  63      4         1         12        1  ...     1445       15   1450    639\n\n#  64      5         1         12        1  ...     1445       15   1450    639\n\n#\n\n#  [65 rows x 19 columns],\n\n#  array([[[255, 255, 255],\n\n#          [255, 255, 255],\n\n#          [255, 255, 255],\n\n#          ...,\n\n#          [255, 255, 255],\n\n#          [255, 255, 255],\n\n#          [255, 255, 255]],\n\n#\n\n#         [[255, 255, 255],\n\n#          [255, 255, 255],\n\n#          [255, 255, 255],\n\n#          ...,\n\n#          [255, 255, 255],\n\n#          [255, 255, 255],\n\n#          [255, 255, 255]],\n\n\n\n```\n\n\n\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "Multiprocessing OCR with Tesseract",
    "version": "0.10",
    "split_keywords": [
        "tesseract",
        "multiprocessing",
        "threads",
        "ocr"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "3c816e6e1c27ab48f33feb0d2a007544e9ed01d9c057959ac9a2eb3d8638a630",
                "md5": "4257237233fe24d6a2bd04861d14ad57",
                "sha256": "014a045bcc22b01414a7111ecebc3f11d1098ec2dd422431d07e139e35b58d5a"
            },
            "downloads": -1,
            "filename": "tesseractmultiprocessing-0.10-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "4257237233fe24d6a2bd04861d14ad57",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 7313,
            "upload_time": "2023-03-10T06:04:44",
            "upload_time_iso_8601": "2023-03-10T06:04:44.807690Z",
            "url": "https://files.pythonhosted.org/packages/3c/81/6e6e1c27ab48f33feb0d2a007544e9ed01d9c057959ac9a2eb3d8638a630/tesseractmultiprocessing-0.10-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "8f85b5818df606ef38ffba3cfad84fc2db62f9d71c9e96fe78bba39ad901e32c",
                "md5": "39048d2cf41e8f422279243c0d546e4e",
                "sha256": "7c8ba358a549d25f8439ba7159586d149cc6f1a5f34d7b63f721f51dc01d7d33"
            },
            "downloads": -1,
            "filename": "tesseractmultiprocessing-0.10.tar.gz",
            "has_sig": false,
            "md5_digest": "39048d2cf41e8f422279243c0d546e4e",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 5468,
            "upload_time": "2023-03-10T06:04:47",
            "upload_time_iso_8601": "2023-03-10T06:04:47.379757Z",
            "url": "https://files.pythonhosted.org/packages/8f/85/b5818df606ef38ffba3cfad84fc2db62f9d71c9e96fe78bba39ad901e32c/tesseractmultiprocessing-0.10.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-03-10 06:04:47",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "github_user": "hansalemaos",
    "github_project": "tesseractmultiprocessing",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "requirements": [
        {
            "name": "a_cv_imwrite_imread_plus",
            "specs": []
        },
        {
            "name": "callpyfile",
            "specs": []
        },
        {
            "name": "opencv_python",
            "specs": []
        },
        {
            "name": "pandas",
            "specs": []
        },
        {
            "name": "pathos",
            "specs": []
        }
    ],
    "lcname": "tesseractmultiprocessing"
}

Johannes Fischer