df:
col1
['aa', 'bb', 'cc', 'dd']
['this', 'is', 'a', 'list', '2']
['this', 'list', '3']
col2
[['ee', 'ff', 'gg', 'hh'], ['qq', 'ww', 'ee', 'rr']]
[['list', 'a', 'not', '1'], ['not', 'is', 'this', '2']]
[['this', 'is', 'list', 'not'], ['a', 'not', 'list', '2']]
What I'm trying to do:
I am trying to run the code below for each element (word) in df col1
against the corresponding element in each of the sublists in col2
, and put the scores in a new column.
So for the first row in col1
, run the get_top_matches
function on this:
`col1` "aa" and `col2` "ee" and "qq"
`col1` "bb" and `col2` "ff" and "ww"
`col1` "cc" and `col2` "gg" and "ee"
`col1` "dd" and `col2` "hh" and "rr"
What the new column should look like:
I don't know for sure what the scores for rows 2 and 3 should be
score_col
[1.0, 1.0, 1.0, 1.0]
[.34, .33, .27, .24, .23] #not sure
[.23, .13, .26] #not sure
What I've tried before:
I've done this when col1
was just a string compared against each list element in col2
, like this, but I don't have the slightest idea how to run it for list elements against the corresponding sublist elements:
df.agg(lambda x: get_top_matches(*x), axis=1)
. . . .
The Function Code
Here's the get_top_matches
function - just run this whole thing; I'm only calling the last function for this question:
#jaro version
def sort_token_alphabetically(word):
    """Return ``word`` with its tokens rearranged in sorted order.

    The input is split on every comma, period, or space, the resulting
    pieces are sorted, and they are glued back together with single spaces.
    Adjacent delimiters yield empty pieces, which sort first and therefore
    show up as leading spaces in the result.
    """
    pieces = re.split('[,. ]', word)
    pieces.sort()
    return ' '.join(pieces)
def get_jaro_distance(first, second, winkler=True, winkler_ajustment=True,
                      scaling=0.1, sort_tokens=True):
    """
    Compute the Jaro score of two strings, optionally Winkler-adjusted.

    :param first: word to calculate distance for
    :param second: word to calculate distance with
    :param winkler: same as winkler_ajustment
    :param winkler_ajustment: add an adjustment factor to the Jaro of the distance
    :param scaling: scaling factor for the Winkler adjustment
    :param sort_tokens: when true, both inputs are passed through
        ``sort_token_alphabetically`` before being scored
    :return: Jaro distance adjusted (or not); the adjusted value is rounded
        to two decimals, the unadjusted value is returned as computed
    :raises JaroDistanceException: if either input is empty or None
    """
    # Validate before touching the inputs: sorting tokens runs a regex split,
    # which would fail with a TypeError on None instead of the intended
    # JaroDistanceException.
    if not first or not second:
        raise JaroDistanceException(
            "Cannot calculate distance from NoneType ({0}, {1})".format(
                first.__class__.__name__,
                second.__class__.__name__))
    if sort_tokens:
        first = sort_token_alphabetically(first)
        second = sort_token_alphabetically(second)
    jaro = _score(first, second)
    # Both flags must be truthy for the Winkler adjustment to apply.
    if not (winkler and winkler_ajustment):
        return jaro
    # The shared prefix only matters for the adjustment; it is capped at 4.
    cl = min(len(_get_prefix(first, second)), 4)
    return round((jaro + (scaling * cl * (1.0 - jaro))) * 100.0) / 100.0
def _score(first, second):
    """Return the plain (unadjusted) Jaro score of two strings.

    Both strings are lower-cased before comparison. The score is the mean of
    three ratios: matched characters over the shorter string's length,
    matched characters over the longer string's length, and the share of
    matched characters that are not transposed. Returns 0.0 when no
    characters match in either direction.
    """
    lowered_first, lowered_second = first.lower(), second.lower()
    # Which string counts as "shorter" is decided on the original lengths.
    if len(first) > len(second):
        shorter, longer = lowered_second, lowered_first
    else:
        shorter, longer = lowered_first, lowered_second

    m1 = _get_matching_characters(shorter, longer)
    m2 = _get_matching_characters(longer, shorter)
    if not m1 or not m2:
        return 0.0

    short_ratio = float(len(m1)) / len(shorter)
    long_ratio = float(len(m2)) / len(longer)
    in_order_ratio = float(len(m1) - _transpositions(m1, m2)) / len(m1)
    return (short_ratio + long_ratio + in_order_ratio) / 3.0
def _get_diff_index(first, second):
    """Return the index of the first position where the inputs differ.

    Returns 0 if either input is empty or None. If one input is a prefix of
    the other (including when they are equal), returns the length of the
    shorter input.
    """
    if not first or not second:
        return 0
    # Walk both sequences in lockstep; zip stops at the shorter one.
    for i, (a, b) in enumerate(zip(first, second)):
        if a != b:
            return i
    return min(len(first), len(second))
def _get_prefix(first, second):
    """Return the leading part of ``first`` that ``second`` also starts with.

    Returns an empty string if either input is empty or None, or if the
    inputs differ at their very first position.
    """
    if not first or not second:
        return ""
    # _get_diff_index never returns -1, and a diff index of 0 slices to an
    # empty prefix, so a single slice covers every case.
    return first[:_get_diff_index(first, second)]
def _get_matching_characters(first, second):
common = []
limit = math.floor(min(len(first), len(second)) / 2)
for i, l in enumerate(first):
left, right = int(max(0, i - limit)), int(
min(i + limit + 1, len(second)))
if l in second[left:right]:
common.append(l)
second = second[0:second.index(l)] + '*' + second[
second.index(l) + 1:]
return ''.join(common)
def _transpositions(first, second):
return math.floor(
len([(f, s) for f, s in zip(first, second) if not f == s]) / 2.0)
def get_top_matches(reference, value_list, max_results=None):
    """Rank the entries of ``value_list`` by similarity to ``reference``.

    Each entry is scored twice with ``get_jaro_distance`` -- once with token
    sorting and once without -- and the higher score is kept. The result is
    a list of ``(value, score)`` tuples ordered from highest to lowest score,
    cut to ``max_results`` entries; a falsy ``max_results`` means the cut-off
    is ``len(value_list)``.
    """
    limit = max_results if max_results else len(value_list)
    ranked = sorted(
        (
            (
                candidate,
                max(
                    get_jaro_distance(reference, candidate),
                    get_jaro_distance(reference, candidate, sort_tokens=False),
                ),
            )
            for candidate in value_list
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:limit]
class JaroDistanceException(Exception):
    """Raised when a Jaro distance cannot be computed for the given inputs."""

    def __init__(self, message):
        # super() must name this class, not Exception: naming Exception
        # starts the lookup after Exception in the MRO and skips it.
        super(JaroDistanceException, self).__init__(message)
parent_org_name_list children_org_name_sublists
0 [MYLLC] [[M'YALYK, OLEKSANDR, NYCHYPOROVYCH, PP], [MYL...
1 [YDEA, S.R.L] [[YD, CONFECCOES, LTDA], [YDA], [YDA, INSAAT, ...
2 [HYONIX] [[HYMAX, TALKING, SOLUTIONS], [HYNIX, SEMICOND...
3 [MJN, ENTERPRISES] [[MJM, INTERANTIONAL, INC], [MJN, ENTERPRISES]...
4 [LTD, YURIA-PHARM] [[LTD, YURIA-PHARM], [YURIYPRA, LAW, OFFICE, PC]]
... ... ...
9995 [UPNET] [[PAULOROBERTOREZENDE, UPNET], [UPCNET], [UPNE...
9996 [FDM, GROUP, INC.] [[FDM, DEVELOPMENT, PARTNERSHIP, LL], [FDM, GR...
9997 [VALVTECHNOLOGIES, INC.] [[JNTU, VAYALPADU], [VALUPTEC, CO., LTD.], [VA...
9998 [WEB4AFRICA-ZA] [[WEB4AFRICA], [WEB4AFRICA-GH], [WEB4AFRICA-KE...
9999 [JAARBEURS, B.V.] [[JAARBEURS, B.V.], [KPN, ZM, CS, IZ, JAARBEUR...
Code Results: I'm just trying to get this to compare each word in the lists rather than each letter:
[[[df1.agg(lambda x: get_top_matches(u,w), axis=1) for u,w in zip(x,v)]\ for v in y] for x,y in zip(df1['parent_org_name_list'], df1['children_org_name_sublists'])]
from Run a function for each element in two lists in Pandas Dataframe Columns
No comments:
Post a Comment