"""Report people that appear more than once across Excel workbooks.

Each row of every sheet is read as ``(first name, last name)``. Two kinds of
duplicates are reported: strict ones (names equal once normalized) and,
optionally, near-duplicates whose similarity ratio exceeds a threshold.
"""
import argparse
from difflib import SequenceMatcher

from unidecode import unidecode
from xlrd import open_workbook


class Person:
    """A first name / last name pair compared through a normalized form."""

    def __init__(self, fname, lname):
        # Convert to str *before* stripping: spreadsheet cells may hold
        # non-string values (e.g. numbers), which have no ``strip`` method.
        self.fname = str(fname).strip().lower()
        self.lname = str(lname).strip().lower()

    @staticmethod
    def normalize(name):
        """Return *name* without hyphens or spaces, passed through unidecode."""
        return unidecode(name.replace('-', '').replace(' ', ''))

    def _key(self):
        """Return the normalized (first, last) tuple used by == and hash()."""
        return Person.normalize(self.fname), Person.normalize(self.lname)

    def __eq__(self, other):
        if not isinstance(other, Person):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        # Must be derived from the same key as __eq__; otherwise two equal
        # persons can hash differently and set membership tests miss them.
        return hash(self._key())

    def __str__(self):
        return "{0} {1}".format(self.fname.title(), self.lname.title())

    def __repr__(self):
        return "Person({0!r}, {1!r})".format(self.fname, self.lname)

    def ratio_eq(self, other, threshold=0.9):
        """Tell whether *other* is a near match of this person.

        The normalized first and last names are concatenated and compared
        with ``SequenceMatcher``. When the ratio is strictly greater than
        *threshold* the pair and its ratio are printed and True is returned;
        otherwise False is returned.
        """
        a = ''.join(self._key())
        b = ''.join(other._key())
        ratio = SequenceMatcher(a=a, b=b).ratio()
        if ratio > threshold:
            print(str(self) + ' vs ' + str(other) + ' : ' + str(ratio))
            return True
        return False


def _iter_people(files):
    """Yield one Person per non-blank row of every sheet of every workbook."""
    for path in files:
        with open_workbook(path) as wb:
            for sheet in wb.sheets():
                # A person needs a first-name and a last-name column; sheets
                # with fewer than two columns are skipped.
                if sheet.ncols < 2:
                    continue
                for row in range(sheet.nrows):
                    # Only the first two columns are used, so additional
                    # columns no longer break Person construction.
                    person = Person(sheet.cell(row, 0).value,
                                    sheet.cell(row, 1).value)
                    # Skip rows where both name cells are blank.
                    if not (person.fname or person.lname):
                        continue
                    yield person


def main(files, strict_only):
    """Print the duplicated people found in *files*.

    files: iterable of workbook paths passed to ``open_workbook``.
    strict_only: when true, only report people whose normalized names are
        equal; when false, also report near-duplicates (see
        ``Person.ratio_eq``).
    """
    items = set()
    strict_dupes = set()
    low_dupes = set()

    for item in _iter_people(files):
        # Always look for an exact match first. Scanning the (unordered) set
        # and stopping at the first fuzzy hit could classify a true duplicate
        # as a near-duplicate depending on iteration order.
        if item in items:
            strict_dupes.add(item)
        elif not strict_only and any(item.ratio_eq(known) for known in items):
            low_dupes.add(item)
        items.add(item)

    # Sets have no stable order; sort so the report is reproducible.
    print('Nombre de duplicatas stricts : {0}'.format(len(strict_dupes)))
    for dupe in sorted(strict_dupes, key=str):
        print(dupe)

    if not strict_only:
        print('\nNombre de duplicatas avec distance : {0}'.format(len(low_dupes)))
        for dupe in sorted(low_dupes, key=str):
            print(dupe)


if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('-f', '--files', type=str, nargs='+', help='Files to compare.', required=True)
    ap.add_argument('-d', '--distance', action='store_true', help='Also compare using distances')
    args = ap.parse_args()

    files = args.files
    strict_only = not args.distance

    print('WARNING')
    print('All your sheets should have the same column order.\n')
    if not strict_only:
        print('Also comparing with distances.\n')

    main(files, strict_only)