# Performs OCR on a list of images using Tesseract and performs fuzzy string matching with a given list of strings.
## Tested against Windows 10 / Python 3.11 / Anaconda
### pip install tesseractrapidfuzz
```python
The `ocr_and_fuzzy_check` function takes a path to the Tesseract OCR executable, a list of image paths or URLs,
a list of strings to compare against the recognized text, and optional fuzzy matching settings.
It returns a pandas DataFrame with OCR results and fuzzy matching scores.
Args:
tesseract_path (str): Path to the Tesseract OCR executable.
allpics (Union[list, tuple]): List of image paths, URLs, or other image data sources.
strings_to_compare (Union[list, tuple, np.ndarray]): List of strings for fuzzy matching.
compare_single_words (bool, optional): Enable fuzzy matching on individual words.
Defaults to True.
compare_grouped_words (bool, optional): Enable fuzzy matching on grouped words.
Defaults to True.
scorer_single_words (valid_scorer, optional): Fuzzy matching scorer for single words.
Defaults to "WRatio".
scorer_grouped_words (valid_scorer, optional): Fuzzy matching scorer for grouped words.
Defaults to "WRatio".
add_after_tesseract_path (str, optional): Additional arguments for Tesseract after
the input image path. Defaults to an empty string.
add_at_the_end (str, optional): Additional arguments to append to the Tesseract command.
Defaults to "-l eng --psm 3".
**kwargs: Additional keyword arguments to control the fuzzy matching process (for example, `workers` and `processor`, as used in the example below).
Returns:
pd.DataFrame: A DataFrame with OCR results and fuzzy matching scores, including columns:
- 'id_img': Image ID
- 'id_word': Word ID within the image
- 'ocr_result': Recognized text
- 'start_x': Starting X-coordinate of the bounding box
- 'end_x': Ending X-coordinate of the bounding box
- 'start_y': Starting Y-coordinate of the bounding box
- 'end_y': Ending Y-coordinate of the bounding box
- 'conf': Confidence score
- 'grouped_text': Grouped text for fuzzy matching
- 'compared_grouped_words_similarity': Fuzzy matching score for grouped words
- 'compared_grouped_words_index': Index of the matched string for grouped words
- 'compared_grouped_words_value': Matched value for grouped words
- 'compared_single_words_similarity': Fuzzy matching score for single words
- 'compared_single_words_index': Index of the matched string for single words
- 'compared_single_words_value': Matched value for single words
Example:
import re
from tesseractrapidfuzz import ocr_and_fuzzy_check
df = ocr_and_fuzzy_check(
tesseract_path=r"C:\Program Files\Tesseract-OCR\tesseract.exe",
allpics=[
"https://m.media-amazon.com/images/I/711y6oE2JrL._SL1500_.jpg",
"https://m.media-amazon.com/images/I/61g+KBpG20L._SL1500_.jpg",
],
strings_to_compare=[
"nonviolent",
"communication",
"emotional",
"well-being",
"terrible",
"today.",
"discover",
"definitive",
"guides",
"transforming",
"converting",
"conflict",
"meaningful",
"connection,",
"unveiling",
"inspirational",
"strategies",
"engagement.",
"martha",
"williams",
"nonviolent communication",
"emotional well-being",
"I had a terrible day at work today.",
"wait till you",
"heared about",
"the art of nonviolent communication",
"martha a. williams",
],
compare_single_words=True,
compare_grouped_words=True,
scorer_single_words="QRatio",
scorer_grouped_words="WRatio",
add_after_tesseract_path="",
add_at_the_end="-l eng --psm 3",
workers=5,
processor=lambda x: re.sub(r"\W+", "", str(x).lower()),
)
print(df.to_string())
# ...
# 7 1 8 terrible 448 563 371 396 77 2875 505 383 115 25 2 | had a terrible 100.000000 4 terrible 90.0 4 terrible
# 8 1 9 day 363 418 415 448 96 1815 390 431 55 33 3 day at work 75.000000 5 today. 90.0 22 I had a terrible day at work today.
# 9 1 10 at 427 457 418 440 96 660 442 429 30 22 3 day at work 50.000000 18 martha 90.0 22 I had a terrible day at work today.
# 10 1 11 work 466 540 415 440 96 1850 503 427 74 25 3 day at work 33.333332 6 discover 90.0 22 I had a terrible day at work today.
# 11 1 12 today. 402 498 460 492 96 3072 450 476 96 32 4 today. 100.000000 5 today. 100.0 5 today.
# 12 1 13 Wait 551 635 525 556 95 2604 593 540 84 31 5 Wait till you 53.333332 23 wait till you 100.0 23 wait till you
# 13 1 14 till 645 695 525 556 96 1550 670 540 50 31 5 Wait till you 53.333332 23 wait till you 100.0 23 wait till you
# 14 1 15 you 705 773 533 565 96 2176 739 549 68 32 5 Wait till you 42.857143 23 wait till you 100.0 23 wait till you
# 15 1 16 hear 562 645 579 610 95 2573 603 594 83 31 6 hear about 53.333332 24 heared about 90.0 24 heared about
# 16 1 17 about 663 767 579 610 96 3224 715 594 104 31 6 hear about 62.500000 24 heared about 90.0 24 heared about
# 17 2 1 ART 94 246 125 207 95 12464 170 166 152 82 7 ART OF NONVIOLENT 66.666664 18 martha 90.0 0 nonviolent
# 18 2 2 OF 275 376 125 207 95 8282 325 166 101 82 7 ART OF NONVIOLENT 40.000000 11 conflict 90.0 0 nonviolent
# 19 2 3 NONVIOLENT 407 907 125 206 96 40500 657 165 500 81 7 ART OF NONVIOLENT 100.000000 0 nonviolent 90.0 0 nonviolent
# 20 2 4 COMMUNICATION 167 832 296 377 96 53865 499 336 665 81 8 COMMUNICATION 100.000000 1 communication 100.0 1 communication
# 21 2 5 TAR 319 379 428 444 31 960 349 436 60 16 9 TAR 50.000000 5 today. 72.0 9 transforming
# 22 2 6 DISCOVER 192 307 624 667 96 4945 249 645 115 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 6 discover 90.0 0 nonviolent
# 23 2 7 THE 320 360 624 667 96 1720 340 645 40 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 44.444443 18 martha 90.0 0 nonviolent
# 24 2 8 DEFINITIVE 374 507 624 667 96 5719 440 645 133 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 7 definitive 90.0 0 nonviolent
# 25 2 9 GUIDES 521 604 624 667 96 3569 562 645 83 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 8 guides 90.0 0 nonviolent
# 26 2 10 TO 618 645 628 654 96 702 631 641 27 26 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 57.142857 5 today. 90.0 0 nonviolent
# 27 2 11 NONVIOLENT 661 810 624 667 96 6407 735 645 149 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 0 nonviolent 90.0 0 nonviolent
# ...
Note:
- The function combines OCR results with fuzzy string matching, allowing for versatile text analysis.
- Valid scorer options (for `scorer_single_words` and `scorer_grouped_words`) are: "WRatio", "QRatio", "ratio", "partial_ratio".
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/tesseractrapidfuzz",
"name": "tesseractrapidfuzz",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "tesseract,ocr,fuzzy,rapidfuzz,fuzzywuzzy",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/18/4f/4a726a3d943e608351a7d3e9cbfdf6a154eaf81d5e9ba2ea5316c5415c74/tesseractrapidfuzz-0.10.tar.gz",
"platform": null,
"description": "\r\n# Performs OCR on a list of images using Tesseract and performs fuzzy string matching with a given list of strings.\r\n\r\n## Tested against Windows 10 / Python 3.11 / Anaconda\r\n\r\n### pip install tesseractrapidfuzz\r\n\r\n\r\n```python\r\n\r\n\r\nThis function takes a path to the Tesseract OCR executable, a list of image paths or URLs,\r\na list of strings to compare against the recognized text, and optional fuzzy matching settings.\r\nIt returns a pandas DataFrame with OCR results and fuzzy matching scores.\r\n\r\nArgs:\r\n\ttesseract_path (str): Path to the Tesseract OCR executable.\r\n\tallpics (Union[list, tuple]): List of image paths, URLs, or other image data sources.\r\n\tstrings_to_compare (Union[list, tuple, np.ndarray]): List of strings for fuzzy matching.\r\n\tcompare_single_words (bool, optional): Enable fuzzy matching on individual words.\r\n\t\tDefaults to True.\r\n\tcompare_grouped_words (bool, optional): Enable fuzzy matching on grouped words.\r\n\t\tDefaults to True.\r\n\tscorer_single_words (valid_scorer, optional): Fuzzy matching scorer for single words.\r\n\t\tDefaults to \"WRatio\".\r\n\tscorer_grouped_words (valid_scorer, optional): Fuzzy matching scorer for grouped words.\r\n\t\tDefaults to \"WRatio\".\r\n\tadd_after_tesseract_path (str, optional): Additional arguments for Tesseract after\r\n\t\tthe input image path. 
Defaults to an empty string.\r\n\tadd_at_the_end (str, optional): Additional arguments to append to the Tesseract command.\r\n\t\tDefaults to \"-l eng --psm 3\".\r\n\t**kwargs: Additional keyword arguments to control the fuzzy matching process.\r\n\r\nReturns:\r\n\tpd.DataFrame: A DataFrame with OCR results and fuzzy matching scores, including columns:\r\n\t\t- 'id_img': Image ID\r\n\t\t- 'id_word': Word ID within the image\r\n\t\t- 'ocr_result': Recognized text\r\n\t\t- 'start_x': Starting X-coordinate of the bounding box\r\n\t\t- 'end_x': Ending X-coordinate of the bounding box\r\n\t\t- 'start_y': Starting Y-coordinate of the bounding box\r\n\t\t- 'end_y': Ending Y-coordinate of the bounding box\r\n\t\t- 'conf': Confidence score\r\n\t\t- 'grouped_text': Grouped text for fuzzy matching\r\n\t\t- 'compared_grouped_words_similarity': Fuzzy matching score for grouped words\r\n\t\t- 'compared_grouped_words_index': Index of the matched string for grouped words\r\n\t\t- 'compared_grouped_words_value': Matched value for grouped words\r\n\t\t- 'compared_single_words_similarity': Fuzzy matching score for single words\r\n\t\t- 'compared_single_words_index': Index of the matched string for single words\r\n\t\t- 'compared_single_words_value': Matched value for single words\r\n\r\nExample:\r\n\timport re\r\n\tfrom tesseractrapidfuzz import ocr_and_fuzzy_check\r\n\tdf = ocr_and_fuzzy_check(\r\n\t\ttesseract_path=r\"C:\\Program 
Files\\Tesseract-OCR\\tesseract.exe\",\r\n\t\tallpics=[\r\n\t\t\t\"https://m.media-amazon.com/images/I/711y6oE2JrL._SL1500_.jpg\",\r\n\t\t\t\"https://m.media-amazon.com/images/I/61g+KBpG20L._SL1500_.jpg\",\r\n\t\t],\r\n\t\tstrings_to_compare=[\r\n\t\t\t\"nonviolent\",\r\n\t\t\t\"communication\",\r\n\t\t\t\"emotional\",\r\n\t\t\t\"well-being\",\r\n\t\t\t\"terrible\",\r\n\t\t\t\"today.\",\r\n\t\t\t\"discover\",\r\n\t\t\t\"definitive\",\r\n\t\t\t\"guides\",\r\n\t\t\t\"transforming\",\r\n\t\t\t\"converting\",\r\n\t\t\t\"conflict\",\r\n\t\t\t\"meaningful\",\r\n\t\t\t\"connection,\",\r\n\t\t\t\"unveiling\",\r\n\t\t\t\"inspirational\",\r\n\t\t\t\"strategies\",\r\n\t\t\t\"engagement.\",\r\n\t\t\t\"martha\",\r\n\t\t\t\"williams\",\r\n\t\t\t\"nonviolent communication\",\r\n\t\t\t\"emotional well-being\",\r\n\t\t\t\"I had a terrible day at work today.\",\r\n\t\t\t\"wait till you\",\r\n\t\t\t\"heared about\",\r\n\t\t\t\"the art of nonviolent communication\",\r\n\t\t\t\"martha a. williams\",\r\n\t\t],\r\n\t\tcompare_single_words=True,\r\n\t\tcompare_grouped_words=True,\r\n\t\tscorer_single_words=\"QRatio\",\r\n\t\tscorer_grouped_words=\"WRatio\",\r\n\t\tadd_after_tesseract_path=\"\",\r\n\t\tadd_at_the_end=\"-l eng --psm 3\",\r\n\t\tworkers=5,\r\n\t\tprocessor=lambda x: re.sub(r\"\\W+\", \"\", str(x).lower()),\r\n\t)\r\n\tprint(df.to_string())\r\n\t# ...\r\n\t# 7 1 8 terrible 448 563 371 396 77 2875 505 383 115 25 2 | had a terrible 100.000000 4 terrible 90.0 4 terrible\r\n\t# 8 1 9 day 363 418 415 448 96 1815 390 431 55 33 3 day at work 75.000000 5 today. 90.0 22 I had a terrible day at work today.\r\n\t# 9 1 10 at 427 457 418 440 96 660 442 429 30 22 3 day at work 50.000000 18 martha 90.0 22 I had a terrible day at work today.\r\n\t# 10 1 11 work 466 540 415 440 96 1850 503 427 74 25 3 day at work 33.333332 6 discover 90.0 22 I had a terrible day at work today.\r\n\t# 11 1 12 today. 402 498 460 492 96 3072 450 476 96 32 4 today. 100.000000 5 today. 
100.0 5 today.\r\n\t# 12 1 13 Wait 551 635 525 556 95 2604 593 540 84 31 5 Wait till you 53.333332 23 wait till you 100.0 23 wait till you\r\n\t# 13 1 14 till 645 695 525 556 96 1550 670 540 50 31 5 Wait till you 53.333332 23 wait till you 100.0 23 wait till you\r\n\t# 14 1 15 you 705 773 533 565 96 2176 739 549 68 32 5 Wait till you 42.857143 23 wait till you 100.0 23 wait till you\r\n\t# 15 1 16 hear 562 645 579 610 95 2573 603 594 83 31 6 hear about 53.333332 24 heared about 90.0 24 heared about\r\n\t# 16 1 17 about 663 767 579 610 96 3224 715 594 104 31 6 hear about 62.500000 24 heared about 90.0 24 heared about\r\n\t# 17 2 1 ART 94 246 125 207 95 12464 170 166 152 82 7 ART OF NONVIOLENT 66.666664 18 martha 90.0 0 nonviolent\r\n\t# 18 2 2 OF 275 376 125 207 95 8282 325 166 101 82 7 ART OF NONVIOLENT 40.000000 11 conflict 90.0 0 nonviolent\r\n\t# 19 2 3 NONVIOLENT 407 907 125 206 96 40500 657 165 500 81 7 ART OF NONVIOLENT 100.000000 0 nonviolent 90.0 0 nonviolent\r\n\t# 20 2 4 COMMUNICATION 167 832 296 377 96 53865 499 336 665 81 8 COMMUNICATION 100.000000 1 communication 100.0 1 communication\r\n\t# 21 2 5 TAR 319 379 428 444 31 960 349 436 60 16 9 TAR 50.000000 5 today. 72.0 9 transforming\r\n\t# 22 2 6 DISCOVER 192 307 624 667 96 4945 249 645 115 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 6 discover 90.0 0 nonviolent\r\n\t# 23 2 7 THE 320 360 624 667 96 1720 340 645 40 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 44.444443 18 martha 90.0 0 nonviolent\r\n\t# 24 2 8 DEFINITIVE 374 507 624 667 96 5719 440 645 133 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 7 definitive 90.0 0 nonviolent\r\n\t# 25 2 9 GUIDES 521 604 624 667 96 3569 562 645 83 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 8 guides 90.0 0 nonviolent\r\n\t# 26 2 10 TO 618 645 628 654 96 702 631 641 27 26 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 57.142857 5 today. 
90.0 0 nonviolent\r\n\t# 27 2 11 NONVIOLENT 661 810 624 667 96 6407 735 645 149 43 10 DISCOVER THE DEFINITIVE GUIDES TO NONVIOLENT 100.000000 0 nonviolent 90.0 0 nonviolent\r\n\t# ...\r\n\r\nNote:\r\n\t- The function combines OCR results with fuzzy string matching, allowing for versatile text analysis.\r\n\t- Valid_scoring options are: \"WRatio\", \"QRatio\", \"ratio\", \"partial_ratio\".\r\n```\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Performs OCR on a list of images using Tesseract and performs fuzzy string matching with a given list of strings.",
"version": "0.10",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/tesseractrapidfuzz"
},
"split_keywords": [
"tesseract",
"ocr",
"fuzzy",
"rapidfuzz",
"fuzzywuzzy"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "32c92156ab989edbbd8a2ccc54032afb6826827ba27412b36d26f344a1e7cac2",
"md5": "cbdfe926706ddc7992e2309f954ce5ce",
"sha256": "50201d9f7f9837dd060552c2e60cb2d593422900e2ebcc7b30148d1fbedc72e9"
},
"downloads": -1,
"filename": "tesseractrapidfuzz-0.10-py3-none-any.whl",
"has_sig": false,
"md5_digest": "cbdfe926706ddc7992e2309f954ce5ce",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 28780,
"upload_time": "2023-09-17T10:21:23",
"upload_time_iso_8601": "2023-09-17T10:21:23.787588Z",
"url": "https://files.pythonhosted.org/packages/32/c9/2156ab989edbbd8a2ccc54032afb6826827ba27412b36d26f344a1e7cac2/tesseractrapidfuzz-0.10-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "184f4a726a3d943e608351a7d3e9cbfdf6a154eaf81d5e9ba2ea5316c5415c74",
"md5": "9ba07db2ba503d44c2146aab165d4504",
"sha256": "494a5494477205b3a4f0c4b195fc34f182ab4d99a2ea74181cb276bb43df96dc"
},
"downloads": -1,
"filename": "tesseractrapidfuzz-0.10.tar.gz",
"has_sig": false,
"md5_digest": "9ba07db2ba503d44c2146aab165d4504",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 26885,
"upload_time": "2023-09-17T10:21:25",
"upload_time_iso_8601": "2023-09-17T10:21:25.624848Z",
"url": "https://files.pythonhosted.org/packages/18/4f/4a726a3d943e608351a7d3e9cbfdf6a154eaf81d5e9ba2ea5316c5415c74/tesseractrapidfuzz-0.10.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-09-17 10:21:25",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "tesseractrapidfuzz",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [],
"lcname": "tesseractrapidfuzz"
}