excel-dupes-finder/src/main.py

94 lines
3 KiB
Python

import argparse
from difflib import SequenceMatcher
from unidecode import unidecode
from xlrd import open_workbook
class Person:
    """A first name / last name pair compared in a normalized form.

    Two persons are equal when their names match after ``normalize``
    (hyphens and spaces removed, then passed through ``unidecode``).
    The hash is derived from the same normalized form so that equal
    persons always land in the same set/dict bucket.
    """

    def __init__(self, fname, lname):
        """Store the stripped, lower-cased first and last name.

        Values are converted with ``str()`` *before* stripping so that
        non-string inputs (for instance numbers) do not raise
        ``AttributeError`` on ``.strip()``.
        """
        self.fname = str(fname).strip().lower()
        self.lname = str(lname).strip().lower()

    @staticmethod
    def normalize(name):
        """Return ``name`` without hyphens or spaces, passed through ``unidecode``."""
        return unidecode(name.replace('-', ' ').replace(' ', ''))

    def _key(self):
        """Return the normalized (first name, last name) pair used by ``==`` and ``hash``."""
        return Person.normalize(self.fname), Person.normalize(self.lname)

    def __eq__(self, other):
        """Return True when both normalized names match those of ``other``."""
        if not isinstance(other, Person):
            # Let Python try the reflected comparison instead of failing
            # with AttributeError on a missing ``fname``.
            return NotImplemented
        return self._key() == other._key()

    def __str__(self):
        """Return the names title-cased and separated by a space."""
        return "{0} {1}".format(self.fname.title(), self.lname.title())

    def __repr__(self):
        """Return a debugging representation showing the stored names."""
        return "Person({0!r}, {1!r})".format(self.fname, self.lname)

    def __hash__(self):
        """Hash the normalized names, keeping ``hash`` consistent with ``==``."""
        return hash(self._key())

    def ratio_eq(self, other, threshold=0.9):
        """Tell whether ``other`` is a near match of this person.

        The normalized first and last names are concatenated and compared
        with ``SequenceMatcher``. When the similarity ratio is strictly
        greater than ``threshold`` the pair and its ratio are printed and
        True is returned; otherwise False is returned and nothing is printed.
        """
        a = ''.join(self._key())
        b = ''.join(other._key())
        ratio = SequenceMatcher(a=a, b=b).ratio()
        if ratio > threshold:
            print('{0} vs {1} : {2}'.format(self, other, ratio))
            return True
        return False
def main(files, strict_only):
    """Print the people that appear more than once across the given workbooks.

    Every row of every sheet of every workbook is turned into a ``Person``
    from the row's cell values, taken in column order. ``Person`` takes
    exactly two values, so each row must have exactly two columns.

    Args:
        files: iterable of workbook paths handed to ``open_workbook``.
        strict_only: when true, only exact duplicates (set membership) are
            reported. When false, each row is compared against every row
            seen so far: exact matches (``==``) are reported as strict
            duplicates and near matches (``Person.ratio_eq``) as
            "distance" duplicates.
    """
    seen = set()
    strict_dupes = set()
    low_dupes = set()

    for path in files:
        with open_workbook(path) as workbook:
            for sheet in workbook.sheets():
                for row in range(sheet.nrows):
                    values = [sheet.cell(row, col).value
                              for col in range(sheet.ncols)]
                    item = Person(*values)

                    if strict_only:
                        if item in seen:
                            strict_dupes.add(item)
                    # Look for an exact match among *all* seen items before
                    # trying fuzzy matches: set iteration order is arbitrary,
                    # so testing both in a single pass could classify a true
                    # duplicate as a mere near match.
                    elif any(item == other for other in seen):
                        strict_dupes.add(item)
                    elif any(item.ratio_eq(other) for other in seen):
                        low_dupes.add(item)

                    seen.add(item)

    print('Nombre de duplicatas stricts : {0}'.format(len(strict_dupes)))
    for dupe in strict_dupes:
        print(dupe)

    if not strict_only:
        print('\nNombre de duplicatas avec distance : {0}'.format(len(low_dupes)))
        for dupe in low_dupes:
            print(dupe)
if __name__ == '__main__':
    # Command-line entry point: parse the options, print the usage
    # warning, then run the duplicate search.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--files',
        type=str,
        nargs='+',
        help='Files to compare.',
        required=True,
    )
    parser.add_argument(
        '-d', '--distance',
        action='store_true',
        help='Also compare using distances',
    )
    options = parser.parse_args()

    files = options.files
    # Distance comparison is opt-in; without -d only strict matching runs.
    strict_only = not options.distance

    print('WARNING')
    print('All your sheets should have the same column order.\n')
    if options.distance:
        print('Also comparing with distances.\n')

    main(files, strict_only)