Jump to content
Usr6

Duplicate file finder

Recommended Posts

Posted

Dupa cum sugereaza si titlul, cauta toate fisierele duplicate din calculator + anexe si le scrie in fisierul duplicate.txt sub forma:


md5
file size1 cale fisier
file size2 cale fisier
etc.

pentru ca md5 nu mai este considerat 100% sigur, nici rezultatele oferite de acest script nu pot avea o acuratete 100%, @fallen_angel a fost suficient de norocos incat sa dea peste un collision in timpul testelor

ar trebui sa functioneze fara probleme pe orice sistem windows/*unix/+ubuntu ce are instalat python 2.7

#!/usr/bin/env python

import os

import sys

import hashlib

# Startup banner. The parenthesised single-argument form produces the same
# output whether ``print`` is the Python 2 statement or the Python 3 function.
print("""

#############################

# Rst Duplicate file finder #

# Usr6 #

#############################

""")

# Lower-case letters a-z; the driver code turns each one into a candidate
# Windows drive root of the form "<letter>:\".
alfabet = list("abcdefghijklmnopqrstuvwxyz")

# Scan results: MD5 hex digest -> list of file paths with that digest.
biblioteca = {}

def hashfile(afile, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at path ``afile``.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so the whole file is never held in memory at once.

    Based on http://www.pythoncentral.io/hashing-files-with-python/

    Args:
        afile: Path of the file to hash.
        blocksize: Number of bytes requested per read (default 65536).

    Returns:
        The MD5 digest as a hexadecimal string. An empty file yields the
        digest of zero bytes.

    Raises:
        IOError / OSError: Propagated unchanged when the file cannot be
            opened or read.
    """
    hasher = hashlib.md5()
    # Keep the handle under its own name rather than rebinding the path
    # argument, so the path stays available for debugging.
    with open(afile, "rb") as stream:
        # iter() with a sentinel stops on the first empty read (end of file).
        for chunk in iter(lambda: stream.read(blocksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def fileparsing(root):
    """Walk ``root`` recursively and group every file path by MD5 digest.

    Results accumulate in the module-level ``biblioteca`` dict, which maps a
    digest to the list of paths that produced it. Each recorded path is
    printed to stdout as progress output.

    Any exception raised while handling a single file is appended to
    ``bugs.txt`` (one message per line) and the walk continues with the next
    file.

    Args:
        root: Directory at which the recursive walk starts.
    """
    # Distinct loop names: the walk must not overwrite the ``root`` argument
    # or shadow the ``file`` builtin.
    for dirpath, _subdirs, filenames in os.walk(root):
        for filename in filenames:
            fullpath = os.path.join(dirpath, filename)
            try:
                md5hash = hashfile(fullpath)
                # setdefault + append: one dict lookup, no key-list scan and
                # no copy of the path list per file.
                biblioteca.setdefault(md5hash, []).append(fullpath)
                print(fullpath)
            except Exception as bug:
                # Deliberate best effort: an unreadable file is logged and
                # skipped so that it cannot abort the whole scan.
                with open("bugs.txt", "a") as handle:
                    handle.write(str(bug) + "\n")

# Pick the scan roots. On Windows every existing "<letter>:\" drive is
# scanned; when none exists (e.g. on a Unix-like system) the filesystem root
# "/" is scanned exactly once. Scanning "/" once per missing drive letter, or
# in addition to the drives, would register the same files repeatedly and
# report them as duplicates of themselves.
drive_roots = [letter + ":\\" for letter in alfabet
               if os.path.isdir(letter + ":\\")]
for scan_root in drive_roots or ["/"]:
    fileparsing(scan_root)

# Only digests shared by more than one path are duplicates.
duplicate_groups = [(md5_, paths) for md5_, paths in biblioteca.items()
                    if len(paths) > 1]

# Open the report once, and only when there is something to write, so that an
# empty run does not create "duplicate.txt".
if duplicate_groups:
    with open("duplicate.txt", "a") as handle:
        for md5_, paths in duplicate_groups:
            handle.write(str(md5_) + "\n")
            for f_path in paths:
                try:
                    size_text = str(os.path.getsize(f_path))
                except OSError:
                    # The file vanished or became unreadable after the scan;
                    # keep the entry instead of aborting the report.
                    size_text = "?"
                handle.write("\t\t" + size_text + "\t" + str(f_path) + "\n")

# Summary. Strings are joined by hand so the output is the same under the
# Python 2 print statement and the Python 3 print function.
bugs_status = "bugs.txt" if os.path.isfile("bugs.txt") else "N-am gasit"
dups_status = "duplicate.txt" if os.path.isfile("duplicate.txt") else "N-am gasit"
print("\nRezultate:")
print("Buguri:" + " " + bugs_status)
print("Duplicate: " + " " + dups_status)

  • Upvote 1
Posted (edited)
Fă-l bre să introduci path-ul de unde să înceapă să scaneze :D

Daca tii tu neaparat...

#!/usr/bin/env python

import os

import sys

import hashlib

# Startup banner. The parenthesised single-argument form produces the same
# output whether ``print`` is the Python 2 statement or the Python 3 function.
print("""

#############################

# Rst Duplicate file finder #

# Usr6 #

#############################

#fallen edition

""")

# Scan results: MD5 hex digest -> list of file paths with that digest.
biblioteca = {}

def hashfile(afile, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at path ``afile``.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so the whole file is never held in memory at once.

    Based on http://www.pythoncentral.io/hashing-files-with-python/

    Args:
        afile: Path of the file to hash.
        blocksize: Number of bytes requested per read (default 65536).

    Returns:
        The MD5 digest as a hexadecimal string. An empty file yields the
        digest of zero bytes.

    Raises:
        IOError / OSError: Propagated unchanged when the file cannot be
            opened or read.
    """
    hasher = hashlib.md5()
    # Keep the handle under its own name rather than rebinding the path
    # argument, so the path stays available for debugging.
    with open(afile, "rb") as stream:
        # iter() with a sentinel stops on the first empty read (end of file).
        for chunk in iter(lambda: stream.read(blocksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def fileparsing(root):
    """Walk ``root`` recursively and group every file path by MD5 digest.

    Results accumulate in the module-level ``biblioteca`` dict, which maps a
    digest to the list of paths that produced it. Each recorded path is
    printed to stdout as progress output.

    Any exception raised while handling a single file is appended to
    ``bugs.txt`` (one message per line) and the walk continues with the next
    file.

    Args:
        root: Directory at which the recursive walk starts.
    """
    # Distinct loop names: the walk must not overwrite the ``root`` argument
    # or shadow the ``file`` builtin.
    for dirpath, _subdirs, filenames in os.walk(root):
        for filename in filenames:
            fullpath = os.path.join(dirpath, filename)
            try:
                md5hash = hashfile(fullpath)
                # setdefault + append: one dict lookup, no key-list scan and
                # no copy of the path list per file.
                biblioteca.setdefault(md5hash, []).append(fullpath)
                print(fullpath)
            except Exception as bug:
                # Deliberate best effort: an unreadable file is logged and
                # skipped so that it cannot abort the whole scan.
                with open("bugs.txt", "a") as handle:
                    handle.write(str(bug) + "\n")

# ``raw_input`` exists only on Python 2; on Python 3 the equivalent is
# ``input``. Resolve the right one once.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Ask for the start directory; keep asking until an existing directory or
# the literal word "exit" is entered.
root = read_line("Enter start dir: ")
if not os.path.isdir(root):
    while not os.path.isdir(root) and root != "exit":
        print("Mai incearca o data sau 'exit' + enter to Exit")
        root = read_line("Enter start dir: ")
    if root == "exit":
        sys.exit("Out!")
fileparsing(root)

# Only digests shared by more than one path are duplicates.
duplicate_groups = [(md5_, paths) for md5_, paths in biblioteca.items()
                    if len(paths) > 1]

# Open the report once, and only when there is something to write, so that an
# empty run does not create "duplicate.txt".
if duplicate_groups:
    with open("duplicate.txt", "a") as handle:
        for md5_, paths in duplicate_groups:
            handle.write(str(md5_) + "\n")
            for f_path in paths:
                try:
                    size_text = str(os.path.getsize(f_path))
                except OSError:
                    # The file vanished or became unreadable after the scan;
                    # keep the entry instead of aborting the report.
                    size_text = "?"
                handle.write("\t\t" + size_text + "\t" + str(f_path) + "\n")

# Summary. Strings are joined by hand so the output is the same under the
# Python 2 print statement and the Python 3 print function.
bugs_status = "bugs.txt" if os.path.isfile("bugs.txt") else "N-am gasit"
dups_status = "duplicate.txt" if os.path.isfile("duplicate.txt") else "N-am gasit"
print("\nRezultate:")
print("Buguri:" + " " + bugs_status)
print("Duplicate: " + " " + dups_status)

# NOTE: sys.exit with a string argument writes it to stderr and exits with
# status 1; kept as in the original script.
sys.exit("Out!")

Edited by Usr6
Posted
Super, si eu voiam sa fac asa ceva.

Cred ca ar fi mai optim sa gasesti dimensiunile fisierelor si sa calculezi hash-ul doar pentru cele cu aceeasi dimensiune.

indeed, script updated

#!/usr/bin/env python

import os

import sys

import hashlib

# Startup banner. The parenthesised single-argument form produces the same
# output whether ``print`` is the Python 2 statement or the Python 3 function.
print("""

#############################

# Rst Duplicate file finder #

# Usr6 #

#############################

#nytro edition*

#*are la baza fallen edition

""")

# First pass: file size (as a decimal string) -> list of paths of that size.
biblioteca = {}

# Second pass: MD5 hex digest -> list of paths with that digest.
librarie = {}

def hashfile(afile, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at path ``afile``.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so the whole file is never held in memory at once.

    Based on http://www.pythoncentral.io/hashing-files-with-python/

    Args:
        afile: Path of the file to hash.
        blocksize: Number of bytes requested per read (default 65536).

    Returns:
        The MD5 digest as a hexadecimal string. An empty file yields the
        digest of zero bytes.

    Raises:
        IOError / OSError: Propagated unchanged when the file cannot be
            opened or read.
    """
    hasher = hashlib.md5()
    # Keep the handle under its own name rather than rebinding the path
    # argument, so the path stays available for debugging.
    with open(afile, "rb") as stream:
        # iter() with a sentinel stops on the first empty read (end of file).
        for chunk in iter(lambda: stream.read(blocksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def fileparsing(root):
    """Walk ``root`` recursively and group every file path by file size.

    Results accumulate in the module-level ``biblioteca`` dict, which maps a
    size (the byte count rendered as a string) to the list of paths of that
    size. Each recorded path is printed to stdout as progress output.

    Any exception raised while handling a single file is appended to
    ``bugs.txt`` (one message per line) and the walk continues with the next
    file.

    Args:
        root: Directory at which the recursive walk starts.
    """
    # Distinct loop names: the walk must not overwrite the ``root`` argument
    # or shadow the ``file`` builtin.
    for dirpath, _subdirs, filenames in os.walk(root):
        for filename in filenames:
            fullpath = os.path.join(dirpath, filename)
            try:
                file_size = str(os.path.getsize(fullpath))
                # setdefault + append: one dict lookup, no key-list scan and
                # no copy of the path list per file.
                biblioteca.setdefault(file_size, []).append(fullpath)
                print(fullpath)
            except Exception as bug:
                # Deliberate best effort: an inaccessible file is logged and
                # skipped so that it cannot abort the whole scan.
                with open("bugs.txt", "a") as handle:
                    handle.write(str(bug) + "\n")

# ``raw_input`` exists only on Python 2; on Python 3 the equivalent is
# ``input``. Resolve the right one once.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Ask for the start directory; keep asking until an existing directory or
# the literal word "exit" is entered.
root = read_line("Enter start dir: ")
if not os.path.isdir(root):
    while not os.path.isdir(root) and root != "exit":
        print("Mai incearca o data sau 'exit' + enter to Exit")
        root = read_line("Enter start dir: ")
    if root == "exit":
        sys.exit("Out!")
fileparsing(root)

# Second pass: only files that share their size with at least one other file
# can be duplicates, so only those are hashed.
print("Filtram rezultatele:")
for same_size_paths in biblioteca.values():
    if len(same_size_paths) > 1:
        for f_path in same_size_paths:
            try:
                md5hash = hashfile(f_path)
                # setdefault + append: one dict lookup, no key-list scan and
                # no copy of the path list per file.
                librarie.setdefault(md5hash, []).append(f_path)
                print(f_path)
            except Exception as bug:
                # Deliberate best effort: an unreadable file is logged and
                # skipped so that it cannot abort the pass.
                with open("bugs.txt", "a") as handle:
                    handle.write(str(bug) + "\n")

print("Scriu rezultatele finale in fisierul 'duplicate.txt'")

# Only digests shared by more than one path are duplicates.
duplicate_groups = [(md5_, paths) for md5_, paths in librarie.items()
                    if len(paths) > 1]

# Open the report once, and only when there is something to write, so that an
# empty run does not create "duplicate.txt".
if duplicate_groups:
    with open("duplicate.txt", "a") as handle:
        for md5_, paths in duplicate_groups:
            handle.write(str(md5_) + "\n")
            for f_path in paths:
                try:
                    size_text = str(os.path.getsize(f_path))
                except OSError:
                    # The file vanished or became unreadable after the scan;
                    # keep the entry instead of aborting the report.
                    size_text = "?"
                handle.write("\t\t" + size_text + "\t" + str(f_path) + "\n")

# Summary. Strings are joined by hand so the output is the same under the
# Python 2 print statement and the Python 3 print function.
bugs_status = "bugs.txt" if os.path.isfile("bugs.txt") else "N-am gasit"
dups_status = "duplicate.txt" if os.path.isfile("duplicate.txt") else "N-am gasit"
print("\nRezultate:")
print("Buguri:" + " " + bugs_status)
print("Duplicate: " + " " + dups_status)

# NOTE: sys.exit with a string argument writes it to stderr and exits with
# status 1; kept as in the original script.
sys.exit("Out!")

  • Upvote 1

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.



×
×
  • Create New...