I am using Python to run near-duplicate checks on a huge list of files (over 20000 files), totaling about 300 MB.
The current approach does the near-dupe checking with difflib’s SequenceMatcher and gets the result using quick_ratio().
With 4 worker processes it takes 25 hours to get the job done, which is quite slow.
I also tried Levenshtein, which provides C-based near-dupe checking, but it’s even slower and less accurate than difflib.
The checking needs to be done in this manner:
There are 20000 files in a folder. Each file needs to be compared against all 20000 files in the folder on every iteration, so there will be 20000 * 20000 iterations.
My idea is to index all the files and compare the indexes, but I am new to indexing and I am not sure it would work. If that is the way to go, what are the best indexing options?
Thanks.
Below is the code :
import os,sys,chardet, csv,operator,time,subprocess
from difflib import SequenceMatcher
import threading
#from threading import Timer
import multiprocessing
from multiprocessing import Pool
# Text of the reference file for the current comparison round.
# MatcherPooling assigns it and Matcher reads it.
OrgFile = ""
# Minimum similarity score (integer percentage) a file must reach for
# Matcher to report it; taken from the second command-line argument.
mark = int(sys.argv[2])
def init_logger():
    """Announce on stdout the name of the process that is starting."""
    worker_name = multiprocessing.current_process().name
    print("Starting %s" % worker_name)
#----Get_Near_DupeStatus--------#
def Get_Near_DupeStatus(score):
    """Translate a similarity score into its near-dupe status label.

    Bands (lower bound exclusive, upper bound inclusive unless noted):
        score == 100      -> "Unique"
        85 < score < 100  -> "Near-Dupe"
        75 < score <= 85  -> "Most Inclusive"
        50 < score <= 75  -> "Inclusive"
        30 < score <= 50  -> "Low Inclusive"
        anything else     -> "No Inclusive"

    The top band is open-ended up to (but excluding) 100 so that a
    non-integer score such as 99.5 is labelled "Near-Dupe" instead of
    falling through to "No Inclusive". Integer scores behave exactly as
    before.
    """
    if score == 100:
        return "Unique"
    if score < 100:
        # Checked from the highest band down; the first lower bound the
        # score exceeds decides the label.
        bands = (
            (85, "Near-Dupe"),
            (75, "Most Inclusive"),
            (50, "Inclusive"),
            (30, "Low Inclusive"),
        )
        for lower_bound, label in bands:
            if score > lower_bound:
                return label
    return "No Inclusive"
#----Write_To_CSV --- ALL-------#
def Write_To_CSV_All(List):
    """Write the collected results to ./TableList.csv, ';'-delimited.

    List is a sequence of result dicts carrying the keys
    'Path/FileName(Source)', 'NearDupeID', 'Similarity Score',
    'Near_DupeStatus', 'NearDupeProcess(Y/N)' and 'Encoding'. Rows are
    written in ascending 'NearDupeID' order and each gets a running
    'ID00<n>' identifier starting at 1.

    Each value is handed to the csv writer as its own field. Joining a
    whole row into one string and using a space as the quote character
    made the writer wrap every line in spaces and double every space
    inside it (e.g. in paths), corrupting the output.
    """
    header = [
        'Path/FileName(Source)',
        'ID',
        'NearDupeID',
        'Similarity Score',
        'Near_DupeStatus',
        'NearDupeProcess(Y/N)',
        'Encoding',
    ]
    ordered = sorted(List, key=operator.itemgetter("NearDupeID"))
    # 'wb' is the mode the Python 2 csv module expects; the with-block
    # guarantees the file is flushed and closed before returning.
    with open('./TableList.csv', 'wb') as out:
        writer = csv.writer(out, delimiter=';', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)
        for i, li in enumerate(ordered):
            writer.writerow([
                li['Path/FileName(Source)'],
                'ID00' + str(i + 1),
                li['NearDupeID'],
                li['Similarity Score'],
                li['Near_DupeStatus'],
                li['NearDupeProcess(Y/N)'],
                li['Encoding'],
            ])
#Get Finish File List
def Finish_Files(List, count, id):
    """Pick the best-scoring results and tag them with a near-dupe group id.

    List is sorted by 'Similarity Score', highest first. When count is 0
    every entry is kept; otherwise only the first `count` entries are.
    Each kept dict gets its 'NearDupeID' set to `id` (the dicts are
    modified in place) and the kept entries are returned as a new list.
    """
    ranked = sorted(
        List, key=operator.itemgetter("Similarity Score"), reverse=True
    )
    if count == 0:
        chosen = ranked
    else:
        # A negative count selects nothing, as no index is below it.
        chosen = ranked[:max(count, 0)]
    for entry in chosen:
        entry['NearDupeID'] = id
    return chosen
#----Search Files in Dir--------#
def GetFileListFrom_Dir(dir):
    """Return the path of every file under `dir`, including subfolders.

    Paths are built by joining each folder visited by os.walk with the
    file names found in it, in the order os.walk yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(dir)
        for name in names
    ]
#----Matcher--------#
def Matcher(realpath):
    """Score one file against the current reference text.

    The reference text is the module-level OrgFile and the reporting
    threshold is the module-level mark.

    Returns a result dict with the keys 'Path/FileName(Source)',
    'Similarity Score', 'Near_DupeStatus', 'NearDupeProcess(Y/N)' and
    'Encoding' when realpath is a regular file whose score reaches the
    threshold. Returns None otherwise; the caller filters None out.
    """
    if not os.path.isfile(realpath):
        return None

    # Close the handle as soon as the content is read: this function runs
    # once per file per reference, so leaked handles add up quickly.
    with open(realpath) as handle:
        MatchFile = handle.read()

    # Tabs and line breaks are treated as junk by the sequence matcher.
    junk = ["\t", "\n", "\r"]
    matcher = SequenceMatcher(lambda x: x in junk, OrgFile, MatchFile)
    score = int(matcher.ratio() * 100)
    if score < mark:
        return None

    encoding = chardet.detect(MatchFile)['encoding']
    if encoding is None:
        # Keep the field a string so it can be written out later.
        encoding = 'None'
    dupe_Process = 'Y' if score > 85 else 'N'
    return {
        'Path/FileName(Source)': realpath,
        'Similarity Score': score,
        'Near_DupeStatus': Get_Near_DupeStatus(score),
        'NearDupeProcess(Y/N)': dupe_Process,
        'Encoding': encoding,
    }
#-------------Pooling--------------------#
def MatcherPooling(FileList, orgFile, process):
    """Compare every file in FileList against orgFile using a process pool.

    The content of orgFile is stored in the module-level OrgFile, then a
    pool of `process` workers maps Matcher over FileList.

    Returns a one-element list whose single item is the list of Matcher
    results, in FileList order.
    """
    global OrgFile
    # Read the reference file and release the handle right away.
    with open(orgFile) as handle:
        OrgFile = handle.read()

    # NOTE(review): the pool is created after OrgFile is assigned,
    # presumably so that forked workers inherit the value - confirm on
    # platforms that do not fork.
    pool_obj = Pool(processes=process)
    try:
        results = pool_obj.map(Matcher, FileList)
    finally:
        # Always shut the workers down, even when map() raises.
        pool_obj.close()
        pool_obj.join()
    return [results]
def Progress():
    """Draw one turn of the console spinner, then schedule the next turn."""
    for frame in "/-\\|":
        time.sleep(0.1)
        sys.stdout.write("%c" % frame)
        sys.stdout.flush()
        # Step the cursor back so the next frame overwrites this one.
        sys.stdout.write('\b')
    # Re-arm: a fresh timer calls Progress again after 0.1 seconds.
    threading.Timer(0.1, Progress).start()
#----Main--------#
def Main():
    """Run the near-dupe scan over a folder and write the results to CSV.

    Command-line arguments read here (sys.argv):
        1 - folder to scan
        3 - number of worker processes
        4 - Top_rec: how many best matches to take per reference file;
            0 keeps every match and removes nothing from the work list
    (argument 2 is read at module level as the score threshold.)

    The process always ends through os._exit(99).
    """
    BLINK = '\033[05m'
    NOBLINK = '\033[25m'
    folder = sys.argv[1]
    process = int(sys.argv[3])
    Top_rec = int(sys.argv[4])
    Mainls = GetFileListFrom_Dir(folder)
    finish_List = []
    bar = "*"

    sys.stdout.write("%s" % BLINK + "Processing...." + NOBLINK + "With " + str(process) + " Multi Process...")
    t = threading.Timer(0.1, Progress)
    t.start()

    # NOTE(review): the outer loop walks a snapshot of the initial file
    # list, so a file already removed from Mainls in an earlier round is
    # still used as a reference file later - confirm this is intended.
    for group_id, orgFile in enumerate(sorted(Mainls), 1):
        matches = [
            result
            for batch in MatcherPooling(sorted(Mainls), orgFile, process)
            for result in batch
            if result is not None
        ]
        # Collect this round's rows for the single CSV write at the end.
        fl = Finish_Files(matches, Top_rec, group_id)
        if Top_rec != 0:
            # Files assigned to this group are not compared again.
            for taken in fl:
                Mainls.remove(taken['Path/FileName(Source)'])
            Mainls.sort()
        finish_List.extend(fl)
        sys.stdout.write("%s" % bar)
        sys.stdout.flush()
        if not Mainls:
            break

    Write_To_CSV_All(finish_List)
    sys.stdout.write("%s" % " ")
    print("Finished!")
    # t.cancel() only affects the first timer; Progress re-arms itself
    # with new timers, so the hard exit below is what stops the spinner.
    t.cancel()
    # os._exit() does not flush stdio buffers, so flush explicitly or the
    # final output can be lost when stdout is redirected.
    sys.stdout.flush()
    os._exit(99)
# Run the scan only when this file is executed as a script.
if __name__ == '__main__':
    Main()
A partial answer, but one obvious optimization is to only compare files with approximately the same size. Also comparing file a and file b is the same as comparing b and a: 20000 files gives 20000 * (20000-1)/2 comparisons.
300 MB is not that big; you could try to read in all the files first.
On indexing, it’s just about describing each file with one or more numbers. Size is one. The number of non-whitespace, whitespace, or newline characters could be others. If the files all contain the same kind of data, you can interpret the data to create more useful numbers.
Also, completely identical files will have the same SHA-256 hash. This will only help if a significant fraction of the files are identical.
Unfortunately, I can think of no method for completely accurately and correctly factoring out (parts of) what is done by difflib.SequenceMatcher, since it dynamically compares all possible chunks of the input files.