Implement distance comparison
This commit is contained in:
parent
5e2f6b7500
commit
3ca63e988e
1 changed file with 42 additions and 12 deletions
54
src/main.py
54
src/main.py
|
@ -3,15 +3,17 @@ from unidecode import unidecode
|
||||||
|
|
||||||
from xlrd import open_workbook
|
from xlrd import open_workbook
|
||||||
|
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
|
|
||||||
class Person:
|
class Person:
|
||||||
def __init__(self, fname, lname):
    """Store the first and last name in trimmed, lower-case form.

    Each value is converted with ``str()`` *before* being stripped, so a
    non-string input (for example a number) is accepted instead of raising
    ``AttributeError`` on a missing ``strip`` method.

    Args:
        fname: First name; any object convertible with ``str()``.
        lname: Last name; any object convertible with ``str()``.
    """
    # Order matters: str() first, then strip(), then lower().
    self.fname = str(fname).strip().lower()
    self.lname = str(lname).strip().lower()
|
||||||
|
|
||||||
@staticmethod
def normalize(name):
    """Return *name* without hyphens or spaces, passed through ``unidecode``.

    Args:
        name: The string to normalize.

    Returns:
        The result of ``unidecode`` applied to *name* once every ``'-'`` and
        ``' '`` character has been removed.
    """
    # Dropping both separators at once is equivalent to turning hyphens into
    # spaces and then deleting all spaces.
    compact = ''.join(ch for ch in name if ch not in ('-', ' '))
    return unidecode(compact)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return Person.normalize(self.fname) == Person.normalize(other.fname) \
|
return Person.normalize(self.fname) == Person.normalize(other.fname) \
|
||||||
|
@ -23,10 +25,19 @@ class Person:
|
||||||
def __hash__(self):
    """Return a hash built from the normalized first and last names.

    ``__eq__`` compares names after ``Person.normalize``, so the hash must be
    computed from the same normalized values. Otherwise two persons that
    compare equal could hash differently and be missed by ``set`` / ``dict``
    membership tests.

    NOTE(review): only the first-name half of ``__eq__`` is visible here;
    this assumes the last name is compared in normalized form too — confirm.
    """
    return hash((Person.normalize(self.fname), Person.normalize(self.lname)))
|
||||||
|
|
||||||
|
def ratio_eq(self, other, threshold=0.9):
    """Tell whether two persons have nearly identical normalized full names.

    The normalized first and last names of each person are concatenated and
    compared with ``difflib.SequenceMatcher``. When the similarity ratio is
    strictly greater than *threshold*, the pair and its ratio are printed and
    ``True`` is returned.

    Args:
        other: The person to compare against; must expose ``fname`` and
            ``lname``.
        threshold: Exclusive lower bound on the similarity ratio
            (defaults to 0.9).

    Returns:
        ``True`` if the ratio exceeds *threshold*, ``False`` otherwise.
    """
    a = Person.normalize(self.fname) + Person.normalize(self.lname)
    b = Person.normalize(other.fname) + Person.normalize(other.lname)
    ratio = SequenceMatcher(a=a, b=b).ratio()
    if ratio > threshold:
        # Report the matching pair together with its similarity ratio.
        print(str(self) + ' vs ' + str(other) + ' : ' + str(ratio))
        return True
    # Always return a bool rather than falling through to an implicit None.
    return False
|
||||||
|
|
||||||
def main(files):
|
|
||||||
items = []
|
def main(files, strict_only):
|
||||||
dupes = set()
|
items = set()
|
||||||
|
strict_dupes = set()
|
||||||
|
low_dupes = set()
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
with open_workbook(file) as wb:
|
with open_workbook(file) as wb:
|
||||||
|
@ -41,24 +52,43 @@ def main(files):
|
||||||
values.append(value)
|
values.append(value)
|
||||||
item = Person(*values)
|
item = Person(*values)
|
||||||
|
|
||||||
if item not in items:
|
if strict_only:
|
||||||
items.append(item)
|
if item in items:
|
||||||
|
strict_dupes.add(item)
|
||||||
|
items.add(item)
|
||||||
else:
|
else:
|
||||||
dupes.add(item)
|
for i in items:
|
||||||
|
if item == i:
|
||||||
|
strict_dupes.add(item)
|
||||||
|
break
|
||||||
|
elif item.ratio_eq(i):
|
||||||
|
low_dupes.add(item)
|
||||||
|
break
|
||||||
|
items.add(item)
|
||||||
|
|
||||||
print('Nombre de duplicatas : {0}'.format(len(dupes)))
|
print('Nombre de duplicatas stricts : {0}'.format(len(strict_dupes)))
|
||||||
for dupe in dupes:
|
for dupe in strict_dupes:
|
||||||
print(dupe)
|
print(dupe)
|
||||||
|
|
||||||
|
if not strict_only:
|
||||||
|
print('\nNombre de duplicatas avec distance : {0}'.format(len(low_dupes)))
|
||||||
|
for dupe in low_dupes:
|
||||||
|
print(dupe)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Command-line interface: a required list of files plus an optional flag
    # enabling the distance-based comparison.
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--files', type=str, nargs='+', help='Files to compare.', required=True)
    parser.add_argument('-d', '--distance', action='store_true', help='Also compare using distances')

    options = parser.parse_args()
    files = options.files
    # Strict-only mode is the default; passing --distance turns it off.
    strict_only = not options.distance

    print('WARNING')
    print('All your sheets should have the same column order.\n')

    if options.distance:
        print('Also comparing with distances.\n')

    main(files, strict_only)
|
||||||
|
|
Loading…
Reference in a new issue