Jump to content
Usr6

Duplicate file finder

Recommended Posts

Dupa cum sugereaza si titlul, cauta toate fisierele duplicate din calculator + anexe si le scrie in fisierul duplicate.txt sub forma:


md5
file size1 cale fisier
file size2 cale fisier
etc.

Pentru ca md5 nu mai este considerat 100% sigur, nici rezultatele oferite de acest script nu pot avea o acuratete de 100%; @fallen_angel a fost suficient de norocos incat sa dea peste un collision in timpul testelor.

ar trebui sa functioneze fara probleme pe orice sistem windows/*unix/+ubuntu ce are instalat python 2.7

#!/usr/bin/env python

import os

import sys

import hashlib

# Print the start-up banner (Python 2 print statement).
print """

#############################

# Rst Duplicate file finder #

# Usr6 #

#############################

"""

# Candidate drive letters: the script appends ":\\" to each letter and scans
# the resulting path when os.path.isdir() reports it as a directory.
alfabet =["a","b","c","d","e","f","g","h","i","j","k","l","m",

"n","o","p","q","r","s","t","u","v","w","x","y","z"]

# Maps an MD5 hex digest to the list of full paths of the files that produced
# it; filled in by fileparsing() and read back when the report is written.
biblioteca = {}

def hashfile(afile, blocksize=65536):
    """Return the MD5 hex digest of the file at path ``afile``.

    The file is opened in binary mode and consumed in chunks of
    ``blocksize`` bytes, so large files are never loaded into memory whole.

    Based on http://www.pythoncentral.io/hashing-files-with-python/

    Any error raised by ``open()`` or ``read()`` (unreadable or missing file)
    propagates to the caller.
    """
    hasher = hashlib.md5()
    # Use a distinct name for the handle instead of rebinding the ``afile``
    # parameter, so the path stays available for debugging.
    with open(afile, "rb") as handle:
        # iter(callable, sentinel) calls read() until it returns b"" (EOF).
        for chunk in iter(lambda: handle.read(blocksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def fileparsing(root):
    """Walk ``root`` recursively and group every regular file by MD5 digest.

    For each file found by ``os.walk`` the digest returned by ``hashfile`` is
    used as a key in the module-level ``biblioteca`` dict, whose value is the
    list of full paths sharing that digest.  Every path that was hashed
    successfully is echoed to stdout.

    This is a best-effort scan: any error raised while processing a single
    file is appended to ``bugs.txt`` and the walk continues.
    """
    for dirpath, _subfolders, filenames in os.walk(root):
        for filename in filenames:
            fullpath = os.path.join(dirpath, filename)
            # Only hash regular files.  Devices, FIFOs and sockets (for
            # example /dev/zero when scanning "/") may never reach EOF.
            # Broken symlinks are skipped here as well.
            if not os.path.isfile(fullpath):
                continue
            try:
                md5hash = hashfile(fullpath)
                # setdefault + append replaces the ``in biblioteca.keys()``
                # list scan and the list copy made on every insertion.
                biblioteca.setdefault(md5hash, []).append(fullpath)
                print(fullpath)
            except Exception as bug:
                # The with-block closes the file; no explicit close needed.
                with open("bugs.txt", "a") as handle:
                    handle.write(str(bug) + "\n")

# Choose the scan roots by platform.  On Windows every existing drive letter
# is scanned; everywhere else the filesystem root is scanned exactly once.
# (Scanning "/" from inside the drive-letter loop would walk the same tree
# repeatedly and report every file as a duplicate of itself.)
if os.name == "nt":
    for letter in alfabet:
        drive = letter + ":\\"
        if os.path.isdir(drive):
            fileparsing(drive)
else:
    fileparsing("/")

# Keep only the digests shared by more than one path.
duplicates = [(md5_, paths) for md5_, paths in biblioteca.items()
              if len(paths) > 1]

# duplicate.txt is created only when there is something to report, because
# the summary below uses the file's existence as the "found duplicates" flag.
# NOTE: the file is opened in append mode, so results of earlier runs remain.
if duplicates:
    with open("duplicate.txt", "a") as handle:
        for md5_, paths in duplicates:
            handle.write(str(md5_) + "\n")
            for f_path in paths:
                try:
                    size = str(os.path.getsize(f_path))
                except OSError:
                    # The file vanished or became unreadable after hashing;
                    # keep the entry rather than aborting the whole report.
                    size = "?"
                handle.write("\t\t" + size + "\t" + str(f_path) + "\n")

# Single-argument print(...) produces the same output on Python 2.7 as the
# original print statements (including the double space after "Duplicate:").
print("\nRezultate:")
print("Buguri: " + ("bugs.txt" if os.path.isfile("bugs.txt") else "N-am gasit"))
print("Duplicate:  " + ("duplicate.txt" if os.path.isfile("duplicate.txt") else "N-am gasit"))

  • Upvote 1
Link to comment
Share on other sites

Fă-l bre să introduci path-ul de unde să înceapă să scaneze :D

Daca tii tu neaparat...

#!/usr/bin/env python

import os

import sys

import hashlib

# Print the start-up banner (Python 2 print statement).
print """

#############################

# Rst Duplicate file finder #

# Usr6 #

#############################

#fallen edition

"""

# Maps an MD5 hex digest to the list of full paths of the files that produced
# it; filled in by fileparsing() and read back when the report is written.
biblioteca = {}

def hashfile(afile, blocksize=65536):
    """Return the MD5 hex digest of the file at path ``afile``.

    The file is opened in binary mode and consumed in chunks of
    ``blocksize`` bytes, so large files are never loaded into memory whole.

    Based on http://www.pythoncentral.io/hashing-files-with-python/

    Any error raised by ``open()`` or ``read()`` (unreadable or missing file)
    propagates to the caller.
    """
    hasher = hashlib.md5()
    # Use a distinct name for the handle instead of rebinding the ``afile``
    # parameter, so the path stays available for debugging.
    with open(afile, "rb") as handle:
        # iter(callable, sentinel) calls read() until it returns b"" (EOF).
        for chunk in iter(lambda: handle.read(blocksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def fileparsing(root):
    """Walk ``root`` recursively and group every regular file by MD5 digest.

    For each file found by ``os.walk`` the digest returned by ``hashfile`` is
    used as a key in the module-level ``biblioteca`` dict, whose value is the
    list of full paths sharing that digest.  Every path that was hashed
    successfully is echoed to stdout.

    This is a best-effort scan: any error raised while processing a single
    file is appended to ``bugs.txt`` and the walk continues.
    """
    for dirpath, _subfolders, filenames in os.walk(root):
        for filename in filenames:
            fullpath = os.path.join(dirpath, filename)
            # Only hash regular files.  Devices, FIFOs and sockets may never
            # reach EOF when read.  Broken symlinks are skipped here as well.
            if not os.path.isfile(fullpath):
                continue
            try:
                md5hash = hashfile(fullpath)
                # setdefault + append replaces the ``in biblioteca.keys()``
                # list scan and the list copy made on every insertion.
                biblioteca.setdefault(md5hash, []).append(fullpath)
                print(fullpath)
            except Exception as bug:
                # The with-block closes the file; no explicit close needed.
                with open("bugs.txt", "a") as handle:
                    handle.write(str(bug) + "\n")

# Ask for the start directory until an existing directory is given.
# Typing "exit" (when no such directory exists) aborts the program.
root = raw_input("Enter start dir: ")
while not os.path.isdir(root):
    if root == "exit":
        sys.exit("Out!")
    print("Mai incearca o data sau 'exit' + enter to Exit")
    root = raw_input("Enter start dir: ")
fileparsing(root)

# Keep only the digests shared by more than one path.
duplicates = [(md5_, paths) for md5_, paths in biblioteca.items()
              if len(paths) > 1]

# duplicate.txt is created only when there is something to report, because
# the summary below uses the file's existence as the "found duplicates" flag.
# NOTE: the file is opened in append mode, so results of earlier runs remain.
if duplicates:
    with open("duplicate.txt", "a") as handle:
        for md5_, paths in duplicates:
            handle.write(str(md5_) + "\n")
            for f_path in paths:
                try:
                    size = str(os.path.getsize(f_path))
                except OSError:
                    # The file vanished or became unreadable after hashing;
                    # keep the entry rather than aborting the whole report.
                    size = "?"
                handle.write("\t\t" + size + "\t" + str(f_path) + "\n")

# Single-argument print(...) produces the same output on Python 2.7 as the
# original print statements (including the double space after "Duplicate:").
print("\nRezultate:")
print("Buguri: " + ("bugs.txt" if os.path.isfile("bugs.txt") else "N-am gasit"))
print("Duplicate:  " + ("duplicate.txt" if os.path.isfile("duplicate.txt") else "N-am gasit"))

# sys.exit("Out!") would report exit status 1 even though the run succeeded;
# keep the farewell on stderr but finish with status 0.
sys.stderr.write("Out!\n")
sys.exit(0)

Edited by Usr6
Link to comment
Share on other sites

Super, si eu voiam sa fac asa ceva.

Cred ca ar fi mai optim sa gasesti dimensiunile fisierelor si sa calculezi hash-ul doar pentru cele cu aceeasi dimensiune.

indeed, script updated

#!/usr/bin/env python

import os

import sys

import hashlib

# Print the start-up banner (Python 2 print statement).
print """

#############################

# Rst Duplicate file finder #

# Usr6 #

#############################

#nytro edition*

#*are la baza fallen edition

"""

# Maps a file size (stored as a decimal string) to the list of full paths of
# the files having that size; filled in by fileparsing().
biblioteca = {}

# Maps an MD5 hex digest to the list of full paths that produced it; filled in
# only for files whose size is shared with at least one other file.
librarie = {}

def hashfile(afile, blocksize=65536):
    """Return the MD5 hex digest of the file at path ``afile``.

    The file is opened in binary mode and consumed in chunks of
    ``blocksize`` bytes, so large files are never loaded into memory whole.

    Based on http://www.pythoncentral.io/hashing-files-with-python/

    Any error raised by ``open()`` or ``read()`` (unreadable or missing file)
    propagates to the caller.
    """
    hasher = hashlib.md5()
    # Use a distinct name for the handle instead of rebinding the ``afile``
    # parameter, so the path stays available for debugging.
    with open(afile, "rb") as handle:
        # iter(callable, sentinel) calls read() until it returns b"" (EOF).
        for chunk in iter(lambda: handle.read(blocksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def fileparsing(root):
    """Walk ``root`` recursively and group every regular file by size.

    For each file found by ``os.walk`` its size in bytes, converted to a
    string, is used as a key in the module-level ``biblioteca`` dict, whose
    value is the list of full paths having that size.  Every path that was
    recorded is echoed to stdout.

    This is a best-effort scan: any error raised while processing a single
    file is appended to ``bugs.txt`` and the walk continues.
    """
    for dirpath, _subfolders, filenames in os.walk(root):
        for filename in filenames:
            fullpath = os.path.join(dirpath, filename)
            # Only record regular files.  Devices, FIFOs and sockets would be
            # hashed later by the caller and may never reach EOF when read.
            # Broken symlinks are skipped here as well.
            if not os.path.isfile(fullpath):
                continue
            try:
                file_size = str(os.path.getsize(fullpath))
                # setdefault + append replaces the ``in biblioteca.keys()``
                # list scan and the list copy made on every insertion.
                biblioteca.setdefault(file_size, []).append(fullpath)
                print(fullpath)
            except Exception as bug:
                # The with-block closes the file; no explicit close needed.
                with open("bugs.txt", "a") as handle:
                    handle.write(str(bug) + "\n")

# Ask for the start directory until an existing directory is given.
# Typing "exit" (when no such directory exists) aborts the program.
root = raw_input("Enter start dir: ")
while not os.path.isdir(root):
    if root == "exit":
        sys.exit("Out!")
    print("Mai incearca o data sau 'exit' + enter to Exit")
    root = raw_input("Enter start dir: ")
fileparsing(root)

# Second pass: only files that share their size with at least one other file
# can be duplicates, so only those are hashed.
print("Filtram rezultatele:")
for same_size_paths in biblioteca.values():
    if len(same_size_paths) < 2:
        continue
    for f_path in same_size_paths:
        try:
            md5hash = hashfile(f_path)
            librarie.setdefault(md5hash, []).append(f_path)
            print(f_path)
        except Exception as bug:
            # Best effort: log the failure and continue with the next file.
            with open("bugs.txt", "a") as handle:
                handle.write(str(bug) + "\n")

print("Scriu rezultatele finale in fisierul 'duplicate.txt'")

# Keep only the digests shared by more than one path.
duplicates = [(md5_, paths) for md5_, paths in librarie.items()
              if len(paths) > 1]

# duplicate.txt is created only when there is something to report, because
# the summary below uses the file's existence as the "found duplicates" flag.
# NOTE: the file is opened in append mode, so results of earlier runs remain.
if duplicates:
    with open("duplicate.txt", "a") as handle:
        for md5_, paths in duplicates:
            handle.write(str(md5_) + "\n")
            for f_path in paths:
                try:
                    size = str(os.path.getsize(f_path))
                except OSError:
                    # The file vanished or became unreadable after hashing;
                    # keep the entry rather than aborting the whole report.
                    size = "?"
                handle.write("\t\t" + size + "\t" + str(f_path) + "\n")

# Single-argument print(...) produces the same output on Python 2.7 as the
# original print statements (including the double space after "Duplicate:").
print("\nRezultate:")
print("Buguri: " + ("bugs.txt" if os.path.isfile("bugs.txt") else "N-am gasit"))
print("Duplicate:  " + ("duplicate.txt" if os.path.isfile("duplicate.txt") else "N-am gasit"))

# sys.exit("Out!") would report exit status 1 even though the run succeeded;
# keep the farewell on stderr but finish with status 0.
sys.stderr.write("Out!\n")
sys.exit(0)

  • Upvote 1
Link to comment
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.



×
×
  • Create New...