From 3ca63e988eba5eac180c3cd1e603771698b188b6 Mon Sep 17 00:00:00 2001 From: Gabriel Augendre Date: Sun, 20 Nov 2016 00:29:33 +0100 Subject: [PATCH] Implement distance comparison --- src/main.py | 54 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/src/main.py b/src/main.py index 85c5c6a..a1957e4 100644 --- a/src/main.py +++ b/src/main.py @@ -3,15 +3,17 @@ from unidecode import unidecode from xlrd import open_workbook +from difflib import SequenceMatcher + class Person: def __init__(self, fname, lname): - self.fname = str(fname).lower() - self.lname = str(lname).lower() + self.fname = str(fname.strip()).lower() + self.lname = str(lname.strip()).lower() @staticmethod def normalize(name): - return unidecode(name.replace('-', ' ')) + return unidecode(name.replace('-', ' ').replace(' ', '')) def __eq__(self, other): return Person.normalize(self.fname) == Person.normalize(other.fname) \ @@ -23,10 +25,19 @@ class Person: def __hash__(self): return hash(self.fname + self.lname) + def ratio_eq(self, other): + a = Person.normalize(self.fname) + Person.normalize(self.lname) + b = Person.normalize(other.fname) + Person.normalize(other.lname) + ratio = SequenceMatcher(a=a, b=b).ratio() + if ratio > 0.9: + print(str(self) + ' vs ' + str(other) + ' : ' + str(ratio)) + return True -def main(files): - items = [] - dupes = set() + +def main(files, strict_only): + items = set() + strict_dupes = set() + low_dupes = set() for file in files: with open_workbook(file) as wb: @@ -41,24 +52,43 @@ def main(files): values.append(value) item = Person(*values) - if item not in items: - items.append(item) + if strict_only: + if item in items: + strict_dupes.add(item) + items.add(item) else: - dupes.add(item) + for i in items: + if item == i: + strict_dupes.add(item) + break + elif item.ratio_eq(i): + low_dupes.add(item) + break + items.add(item) - print('Nombre de duplicatas : {0}'.format(len(dupes))) - for dupe in dupes: + print('Nombre de duplicatas stricts : {0}'.format(len(strict_dupes))) + for dupe in strict_dupes: print(dupe) + if not strict_only: + print('\nNombre de duplicatas avec distance : {0}'.format(len(low_dupes))) + for dupe in low_dupes: + print(dupe) + if __name__ == '__main__': ap = argparse.ArgumentParser() ap.add_argument('-f', '--files', type=str, nargs='+', help='Files to compare.', required=True) + ap.add_argument('-d', '--distance', action='store_true', help='Also compare using distances') args = ap.parse_args() files = args.files + strict_only = not args.distance print('WARNING') print('All your sheets should have the same column order.\n') - main(files) + if not strict_only: + print('Also comparing with distances.\n') + + main(files, strict_only)