# Multiprocessing OCR with Tesseract
## pip install tesseractmultiprocessing
Worth using if you:
1) have a large number of images to OCR
2) are working with images as numpy arrays
#### Multi: 23.9910116 seconds (tesseractmultiprocessing, cpus=5)
#### One CPU: 100.61128 seconds (pytesseract)
```python
from tesseractmultiprocessing import tesser2df
from a_cv_imwrite_imread_plus import open_image_in_cv
from time import perf_counter
# Four sample screenshots used as OCR input.
picslinks = [
    r"https://github.com/hansalemaos/screenshots/raw/main/pandsnesteddicthtml.png",
    r"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000000.png",
    r"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000008.png",
    r"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000017.png",
]
# Load each screenshot once with open_image_in_cv ...
picsunique = list(map(open_image_in_cv, picslinks))
# ... then repeat the four loaded images 100 times to get a 400-image workload.
pics = picsunique * 100

# Time one tesser2df call over the whole workload.
start = perf_counter()
output = tesser2df(
    pics,
    language="eng",
    pandas_kwargs={"on_bad_lines": "warn"},
    tesser_args=(),
    cpus=5,
    tesser_path=r"C:\Program Files\Tesseract-OCR\tesseract.exe",
)
print(f"Multi: {perf_counter()-start}")
################################################################################
import pytesseract
# Point pytesseract at the same Tesseract executable that was passed to
# tesser2df via tesser_path, so both runs use the same binary.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
def st():
    """Run ``pytesseract.image_to_data`` on every image in ``pics``, one at a time.

    Returns:
        A list with one ``image_to_data`` result per image, in the same
        order as ``pics``.
    """
    return [pytesseract.image_to_data(img) for img in pics]
# Time the sequential pytesseract baseline (one image after another) over the
# same ``pics`` list, for comparison with the "Multi" figure.
start = perf_counter()
output2 = st()
print(f"One CPU: {perf_counter()-start}")
# Multi: 23.9910116
# One CPU: 100.61128
# output[0]
# Out[4]:
# ( level page_num block_num par_num ... start_x start_y end_x end_y
# 0 1 1 0 0 ... 0 0 1465 654
# 1 2 1 1 0 ... 322 64 327 540
# 2 3 1 1 1 ... 322 64 327 540
# 3 4 1 1 1 ... 322 64 327 540
# 4 5 1 1 1 ... 322 64 327 540
# .. ... ... ... ... ... ... ... ... ...
# 60 5 1 11 1 ... 14 633 1448 644
# 61 2 1 12 0 ... 1445 15 1450 639
# 62 3 1 12 1 ... 1445 15 1450 639
# 63 4 1 12 1 ... 1445 15 1450 639
# 64 5 1 12 1 ... 1445 15 1450 639
#
# [65 rows x 19 columns],
# array([[[255, 255, 255],
# [255, 255, 255],
# [255, 255, 255],
# ...,
# [255, 255, 255],
# [255, 255, 255],
# [255, 255, 255]],
#
# [[255, 255, 255],
# [255, 255, 255],
# [255, 255, 255],
# ...,
# [255, 255, 255],
# [255, 255, 255],
# [255, 255, 255]],
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/tesseractmultiprocessing",
"name": "tesseractmultiprocessing",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "tesseract,multiprocessing,threads,ocr",
"author": "Johannes Fischer",
"author_email": "<aulasparticularesdealemaosp@gmail.com>",
"download_url": "https://files.pythonhosted.org/packages/8f/85/b5818df606ef38ffba3cfad84fc2db62f9d71c9e96fe78bba39ad901e32c/tesseractmultiprocessing-0.10.tar.gz",
"platform": null,
"description": "\n# Multiprocessing OCR with Tesseract\n\n\n\n## pip install tesseractmultiprocessing\n\n\n\nWorth using if you:\n\n1) have plenty of different files \n\n2) are using numpy\n\n\n\n#### Multi: 23.9910116\n\n\n\n#### One CPU: 100.61128 #pytesseract\n\n\n\n\n\n```python\n\nfrom tesseractmultiprocessing import tesser2df\n\nfrom a_cv_imwrite_imread_plus import open_image_in_cv\n\nfrom time import perf_counter\n\n\n\npicslinks = [\n\n r\"https://github.com/hansalemaos/screenshots/raw/main/pandsnesteddicthtml.png\",\n\n r\"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000000.png\",\n\n r\"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000008.png\",\n\n r\"https://github.com/hansalemaos/screenshots/raw/main/cv2_putTrueTypeText_000017.png\",\n\n]\n\npicsunique = [open_image_in_cv(x) for x in picslinks]\n\npics = []\n\nfor _ in range(100):\n\n pics.extend(picsunique)\n\n\n\nstart = perf_counter()\n\noutput = tesser2df(\n\n pics,\n\n language=\"eng\",\n\n pandas_kwargs={\"on_bad_lines\": \"warn\"},\n\n tesser_args=(),\n\n cpus=5,\n\n tesser_path=r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\",\n\n)\n\nprint(f\"Multi: {perf_counter()-start}\")\n\n\n\n\n\n################################################################################\n\n\n\nimport pytesseract\n\n\n\npytesseract.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n\n\n\n\n\ndef st():\n\n alla = []\n\n for p in pics:\n\n alla.append(pytesseract.image_to_data(p))\n\n return alla\n\n\n\n\n\nstart = perf_counter()\n\noutput2 = st()\n\nprint(f\"One CPU: {perf_counter()-start}\")\n\n\n\n\n\n# Multi: 23.9910116\n\n# One CPU: 100.61128\n\n\n\n# output[0]\n\n# Out[4]:\n\n# ( level page_num block_num par_num ... start_x start_y end_x end_y\n\n# 0 1 1 0 0 ... 0 0 1465 654\n\n# 1 2 1 1 0 ... 322 64 327 540\n\n# 2 3 1 1 1 ... 322 64 327 540\n\n# 3 4 1 1 1 ... 322 64 327 540\n\n# 4 5 1 1 1 ... 322 64 327 540\n\n# .. ... ... 
... ... ... ... ... ... ...\n\n# 60 5 1 11 1 ... 14 633 1448 644\n\n# 61 2 1 12 0 ... 1445 15 1450 639\n\n# 62 3 1 12 1 ... 1445 15 1450 639\n\n# 63 4 1 12 1 ... 1445 15 1450 639\n\n# 64 5 1 12 1 ... 1445 15 1450 639\n\n#\n\n# [65 rows x 19 columns],\n\n# array([[[255, 255, 255],\n\n# [255, 255, 255],\n\n# [255, 255, 255],\n\n# ...,\n\n# [255, 255, 255],\n\n# [255, 255, 255],\n\n# [255, 255, 255]],\n\n#\n\n# [[255, 255, 255],\n\n# [255, 255, 255],\n\n# [255, 255, 255],\n\n# ...,\n\n# [255, 255, 255],\n\n# [255, 255, 255],\n\n# [255, 255, 255]],\n\n\n\n```\n\n\n\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Multiprocessing OCR with Tesseract",
"version": "0.10",
"split_keywords": [
"tesseract",
"multiprocessing",
"threads",
"ocr"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "3c816e6e1c27ab48f33feb0d2a007544e9ed01d9c057959ac9a2eb3d8638a630",
"md5": "4257237233fe24d6a2bd04861d14ad57",
"sha256": "014a045bcc22b01414a7111ecebc3f11d1098ec2dd422431d07e139e35b58d5a"
},
"downloads": -1,
"filename": "tesseractmultiprocessing-0.10-py3-none-any.whl",
"has_sig": false,
"md5_digest": "4257237233fe24d6a2bd04861d14ad57",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 7313,
"upload_time": "2023-03-10T06:04:44",
"upload_time_iso_8601": "2023-03-10T06:04:44.807690Z",
"url": "https://files.pythonhosted.org/packages/3c/81/6e6e1c27ab48f33feb0d2a007544e9ed01d9c057959ac9a2eb3d8638a630/tesseractmultiprocessing-0.10-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "8f85b5818df606ef38ffba3cfad84fc2db62f9d71c9e96fe78bba39ad901e32c",
"md5": "39048d2cf41e8f422279243c0d546e4e",
"sha256": "7c8ba358a549d25f8439ba7159586d149cc6f1a5f34d7b63f721f51dc01d7d33"
},
"downloads": -1,
"filename": "tesseractmultiprocessing-0.10.tar.gz",
"has_sig": false,
"md5_digest": "39048d2cf41e8f422279243c0d546e4e",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 5468,
"upload_time": "2023-03-10T06:04:47",
"upload_time_iso_8601": "2023-03-10T06:04:47.379757Z",
"url": "https://files.pythonhosted.org/packages/8f/85/b5818df606ef38ffba3cfad84fc2db62f9d71c9e96fe78bba39ad901e32c/tesseractmultiprocessing-0.10.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-03-10 06:04:47",
"github": true,
"gitlab": false,
"bitbucket": false,
"github_user": "hansalemaos",
"github_project": "tesseractmultiprocessing",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "a_cv_imwrite_imread_plus",
"specs": []
},
{
"name": "callpyfile",
"specs": []
},
{
"name": "opencv_python",
"specs": []
},
{
"name": "pandas",
"specs": []
},
{
"name": "pathos",
"specs": []
}
],
"lcname": "tesseractmultiprocessing"
}