
Commit 90a7398 ("rmdup")

1 parent afd17c2

2 files changed (+105, -0)


6.rmdup/README.md

Lines changed: 5 additions & 0 deletions

# REMOVE DUPLICATED FILES

This script removes all duplicated files under the current working directory, recursively. Run `python3 rmdup.py` from the directory you want to clean.

6.rmdup/rmdup.py

Lines changed: 100 additions & 0 deletions

#!/usr/bin/python3

# Merrick Zhang<[email protected]>
# Licensed under MIT.

# Find duplicated files
# Step 1: record every file's size.
# Step 2: compute MD5 only for files that share a size; if the hashes
#         match, append the file to the to-delete list.
# Step 3: remove the duplicated files.

import hashlib
import os
from functools import partial
from os.path import getsize

#################
# Get File Hash #
#################


def md5sum(filename):
    # Hash the file in 128-byte chunks so large files never have to
    # be held in memory all at once.
    with open(filename, mode='rb') as f:
        d = hashlib.md5()
        for buf in iter(partial(f.read, 128), b''):
            d.update(buf)
    return d.hexdigest()


####################
# File Node Struct #
####################


class node():
    # A file entry: path, size, and a lazily computed MD5 hash.

    def __init__(self, fpath):
        self.filepath = fpath
        self.filesize = getsize(fpath)
        self.filehash = None

    def gethash(self):
        # Compute the MD5 only on first use; the size comparison
        # filters out most files without any hashing.
        if not self.filehash:
            self.filehash = md5sum(self.filepath)
        return self.filehash

    def __hash__(self):
        return self.filesize


# starts from current directory
current_directory = os.getcwd()
print("Finding duplicate files in", current_directory)

huge_hash_table = {}  # file size (as str) -> nodes of distinct files seen
to_delete_list = []

for root, dirs, files in os.walk(current_directory):
    for file in files:
        p = node(os.path.join(root, file))
        fz = str(p.filesize)
        if fz in huge_hash_table:
            # Same size as an earlier file: compare MD5 hashes.
            hash_of_p = p.gethash()
            dup_flag = any(hash_of_p == i.gethash()
                           for i in huge_hash_table[fz])
            if dup_flag:
                # mark for delete
                to_delete_list.append(p)
            else:
                # Same size but different content: remember it too.
                huge_hash_table[fz].append(p)
        else:
            huge_hash_table[fz] = [p]

print("Files to be deleted:")
with open("rmdup.files", "w") as log:
    for i in to_delete_list:
        print(i.filepath)
        print(i.filepath, file=log)

print("Check out rmdup.files for details.")

s = input("Just delete ALL (y/n/f)? F to load entries from rmdup.files >")
answer = s.strip().lower()

if answer == "y":
    for i in to_delete_list:
        print("Deleting", i.filepath)
        os.remove(i.filepath)

if answer == "f":
    # Re-read rmdup.files, so the list can be hand-edited before confirming.
    with open("rmdup.files", "r") as f:
        for line in f:
            path = line.rstrip("\n")
            print("Deleting", path)
            os.remove(path)
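
The 128-byte read buffer in md5sum() is correct but small. A hedged variant with a larger buffer (the 1 MiB size and the name md5sum_buffered are my choices, not part of this commit) does the same chunked hashing with far fewer read() calls on big files:

import hashlib
from functools import partial


def md5sum_buffered(filename, bufsize=1 << 20):
    # Same chunked-MD5 idea as md5sum(), just with a 1 MiB buffer.
    d = hashlib.md5()
    with open(filename, mode='rb') as f:
        for buf in iter(partial(f.read, bufsize), b''):
            d.update(buf)
    return d.hexdigest()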
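One hedged way to try the script without risking real files: stage a duplicate pair in a throwaway directory and run rmdup.py there. This sketch assumes it is launched from the repository root (the temp-dir layout and the subprocess call are illustrative, not part of the repo):

import os
import shutil
import subprocess
import tempfile

workdir = tempfile.mkdtemp(prefix="rmdup-test-")
with open(os.path.join(workdir, "a.txt"), "w") as f:
    f.write("hello\n")
shutil.copy(os.path.join(workdir, "a.txt"),
            os.path.join(workdir, "b.txt"))  # b.txt duplicates a.txt

# The script scans its current working directory, so run it with
# cwd=workdir; feeding "y" on stdin confirms the deletion prompt.
subprocess.run(["python3", os.path.abspath("6.rmdup/rmdup.py")],
               cwd=workdir, input=b"y\n")

Afterwards workdir should hold only one of the two files, plus the rmdup.files log.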
