-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhash_dedup.py
More file actions
executable file
·165 lines (137 loc) · 5.65 KB
/
hash_dedup.py
File metadata and controls
executable file
·165 lines (137 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/python3
# this file is used to perform the deduplication of our backup system.
# given backup directory as an argument, this file inspects each file and
# subdirectory and confirms which are new or updated since previous backups.
# each run of this program, an upload.txt file is created, which contains a list
# of the new or modified files that need to be uploaded. a metadata.txt file is
# also created, which lists every file in the backup.
# USAGE: python3 hash_dedup.py BACKUP_DIR
import sys
import os
import hashlib
from datetime import datetime
import boto3
import gzip
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
BUF_SIZE = 4096
BUCKET_NAME = "duke2025-esa03-backup-storage"
BASE_DIR = "/mnt/btrfs_project_drive/snapshots/"
UPLOAD_PATH = "/mnt/btrfs_project_drive/upload_files.txt"
DATE_FORMAT = '%d%b%Y-%H-%M'
# decrypt_aes_cbc(ciphertext, key)
# parameters: ciphertext the encrypted data, with the 16-byte IV prepended
#             key the AES key
# description: splits off the leading 16-byte IV, decrypts the remainder in
#              CBC mode, and strips the PKCS#7 padding
# returns: the decrypted plaintext bytes
def decrypt_aes_cbc(ciphertext, key):
    iv, body = ciphertext[:16], ciphertext[16:]
    aes = AES.new(key, AES.MODE_CBC, iv)
    return unpad(aes.decrypt(body), AES.block_size)
# get_file_metadata(lines, filehash)
# parameters: lines a list of the lines from the metadata file
#             filehash the specific hash we are looking for
# description: scans the metadata lines and hands back the first one that
#              mentions the requested file hash
# returns: the matching metadata line, or None when no line matches
def get_file_metadata(lines, filehash):
    return next((entry for entry in lines if filehash in entry), None)
# hash_file(filename, buf_size)
# parameters: filename the path of the file to hash
#             buf_size optional read-chunk size in bytes; defaults to the
#             module-level BUF_SIZE when not given
# description: given the file path, this function reads the file in chunks
#              and feeds each chunk into a SHA-256 hash. the file is opened
#              with a context manager so the descriptor is always closed
#              (the previous version leaked the open file handle).
# returns: the hex digest of the entire file's contents
def hash_file(filename, buf_size=None):
    if buf_size is None:
        buf_size = BUF_SIZE
    hash_function = hashlib.new('sha256')
    with open(filename, 'rb') as fp:
        data = fp.read(buf_size)
        while data:
            hash_function.update(data)
            data = fp.read(buf_size)
    return hash_function.hexdigest()
# hash_dir(directory)
# parameters: directory the directory to traverse
# description: recursively walks the backup directory, hashing every regular
#              file and recording each one in the per-backup metadata file.
#              files whose hash is not already present in the combined S3
#              metadata are also appended to the upload list. relies on
#              module globals prepared by main(): backup_metadata (this
#              backup's metadata file), upload_list (the upload list file),
#              meta_name (path of the metadata file, skipped during the
#              walk), and lines (metadata lines fetched from S3)
def hash_dir(directory):
    global backup_metadata
    global upload_list
    global meta_name
    global lines
    # traverse directory to inspect each file
    for entry in os.scandir(directory):
        # path relative to the snapshot root; used in every metadata record
        path = entry.path.replace(BASE_DIR, '')
        # if current entry is a subdirectory, record it and recursively explore
        if entry.is_dir():
            print("subdir:\t\t" + path)
            backup_metadata.write("directory$" + path + "\n")
            hash_dir(entry.path)
        # skip this backup's own metadata file so it never hashes itself
        elif entry.is_file() and entry.path != meta_name:
            print("file:\t\t" + path)
            hash_result = hash_file(entry.path)
            # record format: <sha256>$<relative path>
            file_output = hash_result + "$" + path + "\n"
            file_metadata = get_file_metadata(lines, hash_result)
            # if file is not known on S3, add to list to be uploaded
            if file_metadata is None:
                upload_list.write(file_output)
                # remember the new hash so an identical file seen later in
                # this same run is not queued for upload a second time
                lines.append(file_output)
                print("uploading: " + path)
            # every file is listed in the metadata, uploaded or not
            backup_metadata.write(file_output)
# get_metadata()
# description: this function retrieves all metadata files from s3 and combines
#              them into a list of all the files on the server. each metadata
#              file is gzip-compressed then AES-CBC encrypted, so each is
#              decrypted and decompressed before its lines are collected.
# returns: the lines of the metadata files, or an empty list if s3 is empty
def get_metadata():
    s3 = boto3.client('s3')
    files = []
    lines = []
    # retrieve encryption key
    with open('aes_key.bin', 'rb') as key_file:
        key = key_file.read()
    # collect metadata file keys from s3. the paginator follows continuation
    # tokens, so listings beyond the 1000-key page limit are not missed
    # (a single list_objects_v2 call silently truncates at 1000 keys)
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix='snapshots/'):
        # pages with no keys simply contribute nothing
        for obj in page.get('Contents', []):
            if "metadata.txt" in obj['Key']:
                files.append(obj['Key'])
    # decrypt metadata files and add to list
    for file in files:
        obj_response = s3.get_object(Bucket=BUCKET_NAME, Key=file)
        encrypted_data = obj_response['Body'].read()
        decrypted_data = decrypt_aes_cbc(encrypted_data, key)
        content = gzip.decompress(decrypted_data).decode('utf-8')
        lines += content.splitlines()
    return lines
# main(args)
# parameters: args the command line arguments; args[1] is the directory name
#             of the target backup to dedup (relative to BASE_DIR)
# description: this function drives the program, opening metadata files and
#              initiating the deduplication process. exits with status 1
#              when the backup directory argument is missing.
def main(args):
    if len(args) != 2:
        sys.exit(1)
    global backup_metadata
    global upload_list
    global meta_name
    global lines
    target_directory = BASE_DIR + args[1]
    meta_name = target_directory + "/metadata.txt"
    lines = get_metadata()
    # mode "w+" already truncates, so no explicit truncate(0) is needed;
    # the context managers guarantee both files are closed even on error
    with open(meta_name, "w+") as backup_metadata, \
            open(UPLOAD_PATH, "w+") as upload_list:
        hash_dir(target_directory)
        # flush the metadata file before hashing it, otherwise hash_file
        # can read a partially-buffered file and record the wrong hash
        backup_metadata.flush()
        # add metadata file to upload list
        hash_result = hash_file(meta_name)
        file_output = hash_result + "$" + meta_name.replace(BASE_DIR, '') + "\n"
        upload_list.write(file_output)
if __name__ == '__main__':
    main(sys.argv)