# Merges hard-to-merge data using rapidfuzz, cython, pandas and numpy
### Tested against Windows 10 / Python 3.11 / Anaconda
### pip install rafuzzpandas
### Cython and a C compiler must be installed!
## Real-world example - merging two lists that share some (but not all) of their entries
```
# Input data:
# Rolling Stone - 500 Greatest Songs of All Time - 2021
first_list = r"""500 Kanye West, 'Stronger' 2007
499 The Supremes, 'Baby Love' 1964
498 Townes Van Zandt, 'Pancho and Lefty' 1972
497 Lizzo, 'Truth Hurts' 2017
496 Harry Nilsson, 'Without You' 1971
495 Carly Simon, 'You're So Vain' 1972
494 Cyndi Lauper, 'Time After Time' 1983
493 The Pixies, 'Where Is My Mind?' 1988
....
....
9 Fleetwood Mac, 'Dreams' 1977
8 Missy Elliott, 'Get Ur Freak On' 2001
7 The Beatles, 'Strawberry Fields Forever' 1967
6 Marvin Gaye, 'What’s Going On' 1971
5 Nirvana, 'Smells Like Teen Spirit' 1991
4 Bob Dylan, 'Like a Rolling Stone' 1965
3 Sam Cooke, 'A Change Is Gonna Come' 1964
2 Public Enemy, 'Fight the Power' 1989
1 Aretha Franklin, 'Respect' 1967"""
# Rolling Stone - 500 Greatest Songs of All Time - 2004 (different format, no quotes)
second_list = """1. Bob Dylan - Like a Rolling Stone
2. The Rolling Stones - Satisfaction
3. John Lennon - Imagine
4. Marvin Gaye - What’s Going On
5. Aretha Franklin - Respect
....
....
495. Smokey Robinson and the Miracles - Shop Around
496. The Rolling Stones - Miss You
497. Weezer - Buddy Holly
498. Brook Benton - Rainy Night in Georgia
499. Thin Lizzy - The Boys Are Back in Town
500. Boston - More Than a Feeling"""
# Merged Output
-----------------------------------------result1
MAPS ONE TO ONE - NO DUPLICATES
Smokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'
Grandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'
The Velvet Underground - I’m Waiting for the Man-----The Velvet Underground, 'I’m Waiting for the Man'
Martha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'
Simon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'
Sly and the Family Stone - Everyday People-----Sly and the Family Stone, 'Everyday People'
Screamin’ Jay Hawkins - I Put a Spell on You-----Screamin’ Jay Hawkins, 'I Put a Spell on You'
Marvin Gaye - I Heard It Through the Grapevine-----Marvin Gaye, 'I Heard It Through the Grapevine'
U2 - I Still Haven’t Found What I’m Looking For-----U2, 'I Still Haven’t Found What I’m Looking For'
Gladys Knight and the Pips - Midnight Train to Georgia-----Gladys Knight and the Pips, 'Midnight Train to Georgia'
-----------------------------------------result2
VALUES MIGHT BE DUPLICATES SOMEWHERE IN THE RESULTS
Smokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'
Grandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'
The Velvet Underground - I’m Waiting for the Man-----The Velvet Underground, 'I’m Waiting for the Man'
Martha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'
Simon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'
Sly and the Family Stone - Everyday People-----Sly and the Family Stone, 'Everyday People'
Screamin’ Jay Hawkins - I Put a Spell on You-----Screamin’ Jay Hawkins, 'I Put a Spell on You'
Marvin Gaye - I Heard It Through the Grapevine-----Marvin Gaye, 'I Heard It Through the Grapevine'
U2 - I Still Haven’t Found What I’m Looking For-----U2, 'I Still Haven’t Found What I’m Looking For'
Gladys Knight and the Pips - Midnight Train to Georgia-----Gladys Knight and the Pips, 'Midnight Train to Georgia'
-----------------------------------------result3
BEST RESULTS - MAPS ONE TO ONE - NO DUPLICATES
Smokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'
Grandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'
The Velvet Underground - I’m Waiting for the Man-----The Velvet Underground, 'I’m Waiting for the Man'
Martha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'
Simon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
SECOND BEST RESULTS- MAPS ONE TO ONE - NO DUPLICATES
Bruce Springsteen - Thunder Road-----Bruce Springsteen, 'Jungleland'
David Bowie - Heroes-----David Bowie, 'Life on Mars?'
The Beatles - Let It Be-----The Strokes, 'Last Nite'
The Supremes - Baby Love-----The Supremes, 'Stop! In the Name of Love'
The Beatles - I Want to Hold Your Hand-----Tears for Fears, 'Everybody Wants to Rule the World'
-----------------------------------------result4
BEST RESULTS - VALUE MIGHT BE DUPLICATE
Smokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'
Grandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'
The Velvet Underground - I’m Waiting for the Man-----The Velvet Underground, 'I’m Waiting for the Man'
Martha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'
Simon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
SECOND BEST RESULTS- VALUE MIGHT BE DUPLICATE - BUT NOT THE SAME KEY-VALUE COMBINATION LIKE THE FIRST
The Beatles - In My Life-----The Beatles, 'A Day in the Life'
The Beatles - A Day in the Life-----The Beatles, 'In My Life'
Bruce Springsteen - Thunder Road-----Bruce Springsteen, 'Jungleland'
Marvin Gaye - Let’s Get It On-----Marvin Gaye, 'What’s Going On'
David Bowie - Heroes-----David Bowie, 'Changes'
```
## How to use it
`get_closest_matches` finds the closest matches to the query strings from the choices using fuzzy string matching.

Parameters:

- `query_strings`: The list (iterable) of query strings. DO NOT PUT EMPTY STRINGS IN THE LIST!
- `choices`: The list (iterable) of choices to match against. DO NOT PUT EMPTY STRINGS IN THE LIST!
- `clear_cache` (bool): Whether to clear the cache after processing.
- `max_results_each_query` (int): The maximum number of results for each query.
- `allow_repeating_matches` (bool): Whether to allow repeating matches.
- `first_limit` (int): The initial score limit (presearch) for matches.
- `chunksize` (int): The size of chunks to process.
- `cutoff` (float): The score cutoff for matches.
- `processor` (callable): The processor function to use.
- `score_cutoff` (float): The score cutoff for matches.
- `score_hint` (float): The score hint for matches.
- `score_multiplier` (int): The score multiplier for matches.
- `workers` (int): The number of workers to use.
- `scorer_kwargs` (dict): Additional arguments for the scoring functions.
- `first_scorers` (tuple): The tuple of scoring functions to use.

Returns:

- dict: A dictionary containing the closest matches for each query. In the example below, `result[0]` holds the best matches and `result[1]` the second-best matches.

```PY
import os
from itertools import islice

from rafuzzpandas import get_closest_matches

# Scorer names passed as ``first_scorers`` to every call below. Sharing one
# tuple keeps the four calls identical except for the options they demonstrate.
ALL_SCORERS = (
    "ratio",
    "partial_ratio",
    "token_sort_ratio",
    "token_set_ratio",
    "token_ratio",
    "partial_token_sort_ratio",
    "partial_token_set_ratio",
    "partial_token_ratio",
    "WRatio",
    "QRatio",
)


def print_head(matches, limit):
    """Print at most ``limit`` items of the mapping ``matches`` as ``key-----value``."""
    for key, value in islice(matches.items(), limit):
        print(f"{key}-----{value}")


# Both input files are looked up in the directory that contains this script.
this_path = os.path.dirname(os.path.abspath(__file__))
rollingstone2021 = os.path.join(this_path, "rollingstone2021.txt")
rollingstone2004 = os.path.join(this_path, "rollingstone2004.txt")
with open(rollingstone2021, "r", encoding="utf-8") as f:
    first_list = f.read()
with open(rollingstone2004, "r", encoding="utf-8") as f:
    second_list = f.read()

# Light pre-processing: drop the first whitespace-separated token (the rank)
# and, for the 2021 list, also the last token (the year). Lines that end up
# empty are skipped, because empty strings must not be passed to the matcher.
query_strings = [
    h
    for q in first_list.strip().splitlines()
    if (h := q.split(maxsplit=1)[-1].strip().rsplit(maxsplit=1)[0].strip())
]
choices = [
    h
    for q in second_list.strip().splitlines()
    if (h := q.split(maxsplit=1)[-1].strip())
]

# Preview the first ten cleaned entries of both lists side by side
# (fewer if either list is shorter than ten entries).
for query, choice in islice(zip(query_strings, choices), 10):
    print(f"{query} ------ {choice}")

# One result per query, repeating matches not allowed.
result1 = get_closest_matches(
    query_strings,
    choices,
    max_results_each_query=1,
    allow_repeating_matches=False,
    first_limit=70,
    chunksize=150,
    workers=1,
    scorer_kwargs=None,
    first_scorers=ALL_SCORERS,
)
print("\n\n-----------------------------------------result1")
print("MAPS ONE TO ONE - NO DUPLICATES\n")
print_head(result1[0], 10)

# One result per query, repeating matches allowed.
result2 = get_closest_matches(
    query_strings,
    choices,
    max_results_each_query=1,
    allow_repeating_matches=True,
    first_limit=70,
    chunksize=150,
    workers=1,
    scorer_kwargs=None,
    first_scorers=ALL_SCORERS,
)
print("\n\n-----------------------------------------result2")
print("VALUES MIGHT BE DUPLICATES SOMEWHERE IN THE RESULTS\n")
print_head(result2[0], 10)

# Up to three results per query, repeating matches not allowed.
# Index 0 is printed as the best results, index 1 as the second best.
result3 = get_closest_matches(
    query_strings,
    choices,
    max_results_each_query=3,
    allow_repeating_matches=False,
    first_limit=70,
    chunksize=150,
    workers=1,
    scorer_kwargs=None,
    first_scorers=ALL_SCORERS,
)
print("\n\n-----------------------------------------result3")
print("BEST RESULTS - MAPS ONE TO ONE - NO DUPLICATES\n")
print_head(result3[0], 5)
print("\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n")
print("SECOND BEST RESULTS- MAPS ONE TO ONE - NO DUPLICATES\n")
print_head(result3[1], 5)

# Up to three results per query, repeating matches allowed.
result4 = get_closest_matches(
    query_strings,
    choices,
    max_results_each_query=3,
    allow_repeating_matches=True,
    first_limit=70,
    chunksize=150,
    workers=1,
    scorer_kwargs=None,
    first_scorers=ALL_SCORERS,
)
print("\n\n-----------------------------------------result4")
print("BEST RESULTS - VALUE MIGHT BE DUPLICATE\n")
print_head(result4[0], 5)
print("\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n")
print(
    "SECOND BEST RESULTS- VALUE MIGHT BE DUPLICATE - BUT NOT THE SAME KEY-VALUE COMBINATION LIKE THE FIRST\n"
)
print_head(result4[1], 5)
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/rafuzzpandas",
"name": "rafuzzpandas",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": "data, fuzzy, merging, rapidfuzz, cython, pandas, numpy",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/a6/8c/b3464c007186fccc7f6a0f6139a92205b9e2e4636e9b7f15128093ba6c2e/rafuzzpandas-0.10.tar.gz",
"platform": null,
"description": "\r\n# Merges hard-to-merge data using rapidfuzz, cython, pandas and numpy\r\n\r\n### Tested against Windows 10 / Python 3.11 / Anaconda\r\n\r\n### pip install rafuzzpandas\r\n\r\n### Cython and a C compiler must be installed!\r\n\r\n\r\n## Real world example - merging a list with common data (but not all data is in both lists)\r\n\r\n```\r\n# Input data:\r\n# Rolling Stone Best Albums of All Time - 2021\r\nfirst_list = r\"\"\"500 Kanye West, 'Stronger' 2007\r\n499 The Supremes, 'Baby Love' 1964\r\n498 Townes Van Zandt, 'Pancho and Lefty' 1972\r\n497 Lizzo, 'Truth Hurts' 2017\r\n496 Harry Nilsson, 'Without You' 1971\r\n495 Carly Simon, 'You're So Vain' 1972\r\n494 Cyndi Lauper, 'Time After Time' 1983\r\n493 The Pixies, 'Where Is My Mind?' 1988\r\n....\r\n....\r\n9 Fleetwood Mac, 'Dreams' 1977\r\n8 Missy Elliott, 'Get Ur Freak On' 2001\r\n7 The Beatles, 'Strawberry Fields Forever' 1967\r\n6 Marvin Gaye, 'What\u2019s Going On' 1971\r\n5 Nirvana, 'Smells Like Teen Spirit' 1991\r\n4 Bob Dylan, 'Like a Rolling Stone' 1965\r\n3 Sam Cooke, 'A Change Is Gonna Come' 1964\r\n2 Public Enemy, 'Fight the Power' 1989\r\n1 Aretha Franklin, 'Respect' 1967\"\"\"\r\n\r\n\r\n\r\n# Rolling Stone - best albums of all time - 2004 (different format, no quotes)\r\nsecond_list = \"\"\"1. Bob Dylan - Like a Rolling Stone\r\n2. The Rolling Stones - Satisfaction\r\n3. John Lennon - Imagine\r\n4. Marvin Gaye - What\u2019s Going On\r\n5. Aretha Franklin - Respect\r\n....\r\n....\r\n495. Smokey Robinson and the Miracles - Shop Around\r\n496. The Rolling Stones - Miss You\r\n497. Weezer - Buddy Holly\r\n498. Brook Benton - Rainy Night in Georgia\r\n499. Thin Lizzy - The Boys Are Back in Town\r\n500. 
Boston - More Than a Feeling\"\"\"\r\n\r\n\r\n# Merged Output\r\n-----------------------------------------result1\r\nMAPS ONE TO ONE - NO DUPLICATES\r\n\r\nSmokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'\r\nGrandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'\r\nThe Velvet Underground - I\u2019m Waiting for the Man-----The Velvet Underground, 'I\u2019m Waiting for the Man'\r\nMartha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'\r\nSimon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'\r\nSly and the Family Stone - Everyday People-----Sly and the Family Stone, 'Everyday People'\r\nScreamin\u2019 Jay Hawkins - I Put a Spell on You-----Screamin\u2019 Jay Hawkins, 'I Put a Spell on You'\r\nMarvin Gaye - I Heard It Through the Grapevine-----Marvin Gaye, 'I Heard It Through the Grapevine'\r\nU2 - I Still Haven\u2019t Found What I\u2019m Looking For-----U2, 'I Still Haven\u2019t Found What I\u2019m Looking For'\r\nGladys Knight and the Pips - Midnight Train to Georgia-----Gladys Knight and the Pips, 'Midnight Train to Georgia'\r\n\r\n\r\n-----------------------------------------result2\r\nVALUES MIGHT BE DUPLICATES SOMEWHERE IN THE RESULTS\r\n\r\nSmokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'\r\nGrandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'\r\nThe Velvet Underground - I\u2019m Waiting for the Man-----The Velvet Underground, 'I\u2019m Waiting for the Man'\r\nMartha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'\r\nSimon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'\r\nSly and the Family Stone - Everyday 
People-----Sly and the Family Stone, 'Everyday People'\r\nScreamin\u2019 Jay Hawkins - I Put a Spell on You-----Screamin\u2019 Jay Hawkins, 'I Put a Spell on You'\r\nMarvin Gaye - I Heard It Through the Grapevine-----Marvin Gaye, 'I Heard It Through the Grapevine'\r\nU2 - I Still Haven\u2019t Found What I\u2019m Looking For-----U2, 'I Still Haven\u2019t Found What I\u2019m Looking For'\r\nGladys Knight and the Pips - Midnight Train to Georgia-----Gladys Knight and the Pips, 'Midnight Train to Georgia'\r\n\r\n\r\n-----------------------------------------result3\r\nBEST RESULTS - MAPS ONE TO ONE - NO DUPLICATES\r\n\r\nSmokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'\r\nGrandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'\r\nThe Velvet Underground - I\u2019m Waiting for the Man-----The Velvet Underground, 'I\u2019m Waiting for the Man'\r\nMartha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'\r\nSimon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'\r\n\r\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\r\n\r\nSECOND BEST RESULTS- MAPS ONE TO ONE - NO DUPLICATES\r\n\r\nBruce Springsteen - Thunder Road-----Bruce Springsteen, 'Jungleland'\r\nDavid Bowie - Heroes-----David Bowie, 'Life on Mars?'\r\nThe Beatles - Let It Be-----The Strokes, 'Last Nite'\r\nThe Supremes - Baby Love-----The Supremes, 'Stop! 
In the Name of Love'\r\nThe Beatles - I Want to Hold Your Hand-----Tears for Fears, 'Everybody Wants to Rule the World'\r\n\r\n\r\n-----------------------------------------result4\r\nBEST RESULTS - VALUE MIGHT BE DUPLICATE\r\n\r\nSmokey Robinson and the Miracles - The Tracks of My Tears-----Smokey Robinson and the Miracles, 'The Tracks of My Tears'\r\nGrandmaster Flash and the Furious Five - The Message-----Grandmaster Flash and the Furious Five, 'The Message'\r\nThe Velvet Underground - I\u2019m Waiting for the Man-----The Velvet Underground, 'I\u2019m Waiting for the Man'\r\nMartha and the Vandellas - Dancing in the Street-----Martha and the Vandellas, 'Dancing in the Street'\r\nSimon and Garfunkel - Bridge Over Troubled Water-----Simon and Garfunkel, 'Bridge Over Troubled Water'\r\n\r\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\r\n\r\nSECOND BEST RESULTS- VALUE MIGHT BE DUPLICATE - BUT NOT THE SAME KEY-VALUE COMBINATION LIKE THE FIRST\r\n\r\nThe Beatles - In My Life-----The Beatles, 'A Day in the Life'\r\nThe Beatles - A Day in the Life-----The Beatles, 'In My Life'\r\nBruce Springsteen - Thunder Road-----Bruce Springsteen, 'Jungleland'\r\nMarvin Gaye - Let\u2019s Get It On-----Marvin Gaye, 'What\u2019s Going On'\r\nDavid Bowie - Heroes-----David Bowie, 'Changes'\r\n```\r\n\r\n## How to use it\r\n\r\n```PY\r\n\r\nFinds the closest matches to the query strings from the choices using fuzzy string matching.\r\n\r\nParameters:\r\nquery_strings: The list (iterable) of query strings. DO NOT PUT EMPTY STRINGS IN THE LIST!\r\nchoices: The list (iterable) of choices to match against. 
DO NOT PUT EMPTY STRINGS IN THE LIST!\r\nclear_cache (bool): Whether to clear the cache after processing.\r\nmax_results_each_query (int): The maximum number of results for each query.\r\nallow_repeating_matches (bool): Whether to allow repeating matches.\r\nfirst_limit (int): The initial score limit (presearch) for matches.\r\nchunksize (int): The size of chunks to process.\r\ncutoff (float): The score cutoff for matches.\r\nprocessor (callable): The processor function to use.\r\nscore_cutoff (float): The score cutoff for matches.\r\nscore_hint (float): The score hint for matches.\r\nscore_multiplier (int): The score multiplier for matches.\r\nworkers (int): The number of workers to use.\r\nscorer_kwargs (dict): Additional arguments for the scoring functions.\r\nfirst_scorers (tuple): The tuple of scoring functions to use.\r\n\r\nReturns:\r\ndict: A dictionary containing the closest matches for each query.\r\n\r\n\r\nfrom rafuzzpandas import get_closest_matches\r\nimport os\r\n\r\nthis_path = os.path.dirname(os.path.abspath(__file__))\r\n\r\n\r\n\r\nrollingstone2021 = os.path.join(this_path, \"rollingstone2021.txt\")\r\nrollingstone2004 = os.path.join(this_path, \"rollingstone2004.txt\")\r\nwith open(rollingstone2021, \"r\", encoding=\"utf-8\") as f:\r\n first_list = f.read()\r\nwith open(rollingstone2004, \"r\", encoding=\"utf-8\") as f:\r\n second_list = f.read()\r\n\r\n# Little pre-processing to get rid of the numbers \r\nquery_strings = [\r\n h\r\n for q in first_list.strip().splitlines()\r\n if (h := q.split(maxsplit=1)[-1].strip().rsplit(maxsplit=1)[0].strip())\r\n]\r\nchoices = [\r\n h\r\n for q in second_list.strip().splitlines()\r\n if (h := q.split(maxsplit=1)[-1].strip())\r\n]\r\nfor indi in range(10):\r\n print(f\"{query_strings[indi]} ------ {choices[indi]}\")\r\n\r\nresult1 = get_closest_matches(\r\n query_strings,\r\n choices,\r\n max_results_each_query=1,\r\n allow_repeating_matches=False,\r\n first_limit=70,\r\n chunksize=150,\r\n workers=1,\r\n 
scorer_kwargs=None,\r\n first_scorers=(\r\n \"ratio\",\r\n \"partial_ratio\",\r\n \"token_sort_ratio\",\r\n \"token_set_ratio\",\r\n \"token_ratio\",\r\n \"partial_token_sort_ratio\",\r\n \"partial_token_set_ratio\",\r\n \"partial_token_ratio\",\r\n \"WRatio\",\r\n \"QRatio\",\r\n ),\r\n)\r\nprint(\"\\n\\n-----------------------------------------result1\")\r\nprint(\"MAPS ONE TO ONE - NO DUPLICATES\\n\")\r\ncounter = 0\r\nfor k, v in result1[0].items():\r\n print(f\"{k}-----{v}\")\r\n counter += 1\r\n if counter == 10:\r\n break\r\n\r\nresult2 = get_closest_matches(\r\n query_strings,\r\n choices,\r\n max_results_each_query=1,\r\n allow_repeating_matches=True,\r\n first_limit=70,\r\n chunksize=150,\r\n workers=1,\r\n scorer_kwargs=None,\r\n first_scorers=(\r\n \"ratio\",\r\n \"partial_ratio\",\r\n \"token_sort_ratio\",\r\n \"token_set_ratio\",\r\n \"token_ratio\",\r\n \"partial_token_sort_ratio\",\r\n \"partial_token_set_ratio\",\r\n \"partial_token_ratio\",\r\n \"WRatio\",\r\n \"QRatio\",\r\n ),\r\n)\r\n\r\nprint(\"\\n\\n-----------------------------------------result2\")\r\nprint(\"VALUES MIGHT BE DUPLICATES SOMEWHERE IN THE RESULTS\\n\")\r\ncounter = 0\r\nfor k, v in result2[0].items():\r\n print(f\"{k}-----{v}\")\r\n counter += 1\r\n if counter == 10:\r\n break\r\n\r\nresult3 = get_closest_matches(\r\n query_strings,\r\n choices,\r\n max_results_each_query=3,\r\n allow_repeating_matches=False,\r\n first_limit=70,\r\n chunksize=150,\r\n workers=1,\r\n scorer_kwargs=None,\r\n first_scorers=(\r\n \"ratio\",\r\n \"partial_ratio\",\r\n \"token_sort_ratio\",\r\n \"token_set_ratio\",\r\n \"token_ratio\",\r\n \"partial_token_sort_ratio\",\r\n \"partial_token_set_ratio\",\r\n \"partial_token_ratio\",\r\n \"WRatio\",\r\n \"QRatio\",\r\n ),\r\n)\r\nprint(\"\\n\\n-----------------------------------------result3\")\r\nprint(\"BEST RESULTS - MAPS ONE TO ONE - NO DUPLICATES\\n\")\r\ncounter = 0\r\nfor k, v in result3[0].items():\r\n print(f\"{k}-----{v}\")\r\n counter += 
1\r\n if counter == 5:\r\n break\r\nprint(\"\\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\")\r\nprint(\"SECOND BEST RESULTS- MAPS ONE TO ONE - NO DUPLICATES\\n\")\r\ncounter = 0\r\nfor k, v in result3[1].items():\r\n print(f\"{k}-----{v}\")\r\n counter += 1\r\n if counter == 5:\r\n break\r\n\r\n\r\nresult4 = get_closest_matches(\r\n query_strings,\r\n choices,\r\n max_results_each_query=3,\r\n allow_repeating_matches=True,\r\n first_limit=70,\r\n chunksize=150,\r\n workers=1,\r\n scorer_kwargs=None,\r\n first_scorers=(\r\n \"ratio\",\r\n \"partial_ratio\",\r\n \"token_sort_ratio\",\r\n \"token_set_ratio\",\r\n \"token_ratio\",\r\n \"partial_token_sort_ratio\",\r\n \"partial_token_set_ratio\",\r\n \"partial_token_ratio\",\r\n \"WRatio\",\r\n \"QRatio\",\r\n ),\r\n)\r\nprint(\"\\n\\n-----------------------------------------result4\")\r\nprint(\"BEST RESULTS - VALUE MIGHT BE DUPLICATE\\n\")\r\ncounter = 0\r\nfor k, v in result4[0].items():\r\n print(f\"{k}-----{v}\")\r\n counter += 1\r\n if counter == 5:\r\n break\r\nprint(\"\\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\n\")\r\n\r\nprint(\r\n \"SECOND BEST RESULTS- VALUE MIGHT BE DUPLICATE - BUT NOT THE SAME KEY-VALUE COMBINATION LIKE THE FIRST\\n\"\r\n)\r\ncounter = 0\r\nfor k, v in result4[1].items():\r\n print(f\"{k}-----{v}\")\r\n counter += 1\r\n if counter == 5:\r\n break\r\n\r\n\r\n```\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Merges hard-to-merge data using rapidfuzz, cython, pandas and numpy",
"version": "0.10",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/rafuzzpandas"
},
"split_keywords": [
"data",
" fuzzy",
" merging",
" rapidfuzz",
" cython",
" pandas",
" numpy"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "f6f6536a468f61ffbb348089407e7959498dc9168d670ae9c66c85fbdc2cba9f",
"md5": "1929446a89c617f58f5f8c2ba6e62cd9",
"sha256": "c9031b8089ad45268f624334c812b8b4bccbeacf1a377429102a33fa45a95343"
},
"downloads": -1,
"filename": "rafuzzpandas-0.10-py3-none-any.whl",
"has_sig": false,
"md5_digest": "1929446a89c617f58f5f8c2ba6e62cd9",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 32970,
"upload_time": "2024-07-06T05:21:05",
"upload_time_iso_8601": "2024-07-06T05:21:05.133772Z",
"url": "https://files.pythonhosted.org/packages/f6/f6/536a468f61ffbb348089407e7959498dc9168d670ae9c66c85fbdc2cba9f/rafuzzpandas-0.10-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "a68cb3464c007186fccc7f6a0f6139a92205b9e2e4636e9b7f15128093ba6c2e",
"md5": "a45342f9ce6884ad005e9dff401df569",
"sha256": "45ac101ab35f6d28779ff66701bb746e2451fa65cb7b47fb351115d949d5cbda"
},
"downloads": -1,
"filename": "rafuzzpandas-0.10.tar.gz",
"has_sig": false,
"md5_digest": "a45342f9ce6884ad005e9dff401df569",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 32542,
"upload_time": "2024-07-06T05:21:06",
"upload_time_iso_8601": "2024-07-06T05:21:06.690018Z",
"url": "https://files.pythonhosted.org/packages/a6/8c/b3464c007186fccc7f6a0f6139a92205b9e2e4636e9b7f15128093ba6c2e/rafuzzpandas-0.10.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-07-06 05:21:06",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "rafuzzpandas",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "Cython",
"specs": []
},
{
"name": "normaltext",
"specs": []
},
{
"name": "numpy",
"specs": []
},
{
"name": "pandas",
"specs": []
},
{
"name": "rapidfuzz",
"specs": []
},
{
"name": "setuptools",
"specs": []
}
],
"lcname": "rafuzzpandas"
}