Implement distance comparison

This commit is contained in:
Gabriel Augendre 2016-11-20 00:29:33 +01:00
parent 5e2f6b7500
commit 3ca63e988e
No known key found for this signature in database
GPG key ID: D2B6A5B41FC438B1

View file

@ -3,15 +3,17 @@ from unidecode import unidecode
from xlrd import open_workbook from xlrd import open_workbook
from difflib import SequenceMatcher
class Person: class Person:
def __init__(self, fname, lname): def __init__(self, fname, lname):
self.fname = str(fname).lower() self.fname = str(fname.strip()).lower()
self.lname = str(lname).lower() self.lname = str(lname.strip()).lower()
@staticmethod @staticmethod
def normalize(name): def normalize(name):
return unidecode(name.replace('-', ' ')) return unidecode(name.replace('-', ' ').replace(' ', ''))
def __eq__(self, other): def __eq__(self, other):
return Person.normalize(self.fname) == Person.normalize(other.fname) \ return Person.normalize(self.fname) == Person.normalize(other.fname) \
@ -23,10 +25,19 @@ class Person:
def __hash__(self): def __hash__(self):
return hash(self.fname + self.lname) return hash(self.fname + self.lname)
def ratio_eq(self, other, threshold=0.9):
    """Return whether *other* is a near-duplicate of this person.

    The normalized first and last names of each person are concatenated
    and the two resulting strings are compared with
    ``difflib.SequenceMatcher``. The persons are considered similar when
    the similarity ratio is strictly greater than *threshold*.

    Args:
        other: The person to compare against; must expose ``fname`` and
            ``lname`` attributes.
        threshold: Similarity ratio that must be exceeded for a match.
            Defaults to 0.9, the cut-off that was previously hard-coded.

    Returns:
        True when the ratio exceeds *threshold*, False otherwise.
    """
    a = Person.normalize(self.fname) + Person.normalize(self.lname)
    b = Person.normalize(other.fname) + Person.normalize(other.lname)
    ratio = SequenceMatcher(a=a, b=b).ratio()
    if ratio > threshold:
        # Report the matching pair and its score so the user can review
        # fuzzy matches by hand.
        print(str(self) + ' vs ' + str(other) + ' : ' + str(ratio))
        return True
    # Explicit False instead of an implicit None keeps the return type
    # consistently boolean.
    return False
def main(files):
items = [] def main(files, strict_only):
dupes = set() items = set()
strict_dupes = set()
low_dupes = set()
for file in files: for file in files:
with open_workbook(file) as wb: with open_workbook(file) as wb:
@ -41,24 +52,43 @@ def main(files):
values.append(value) values.append(value)
item = Person(*values) item = Person(*values)
if item not in items: if strict_only:
items.append(item) if item in items:
strict_dupes.add(item)
items.add(item)
else: else:
dupes.add(item) for i in items:
if item == i:
strict_dupes.add(item)
break
elif item.ratio_eq(i):
low_dupes.add(item)
break
items.add(item)
print('Nombre de duplicatas : {0}'.format(len(dupes))) print('Nombre de duplicatas stricts : {0}'.format(len(strict_dupes)))
for dupe in dupes: for dupe in strict_dupes:
print(dupe) print(dupe)
if not strict_only:
print('\nNombre de duplicatas avec distance : {0}'.format(len(low_dupes)))
for dupe in low_dupes:
print(dupe)
if __name__ == '__main__': if __name__ == '__main__':
ap = argparse.ArgumentParser() ap = argparse.ArgumentParser()
ap.add_argument('-f', '--files', type=str, nargs='+', help='Files to compare.', required=True) ap.add_argument('-f', '--files', type=str, nargs='+', help='Files to compare.', required=True)
ap.add_argument('-d', '--distance', action='store_true', help='Also compare using distances')
args = ap.parse_args() args = ap.parse_args()
files = args.files files = args.files
strict_only = not args.distance
print('WARNING') print('WARNING')
print('All your sheets should have the same column order.\n') print('All your sheets should have the same column order.\n')
main(files) if not strict_only:
print('Also comparing with distances.\n')
main(files, strict_only)