-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhash_dedup.py
More file actions
executable file
·165 lines (137 loc) · 5.65 KB
/
hash_dedup.py
File metadata and controls
executable file
·165 lines (137 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/python3
# this file is used to perform the deduplication of our backup system.
# given backup directory as an argument, this file inspects each file and
# subdirectory and confirms which are new or updated since previous backups.
# each run of this program, an upload.txt file is created, which contains a list
# of the new or modified files that need to be uploaded. a metadata.txt file is
# also created, which lists every file in the backup.
# USAGE: python3 hash_dedup.py BACKUP_DIR
import sys
import os
import hashlib
from datetime import datetime
import boto3
import gzip
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
BUF_SIZE = 4096
BUCKET_NAME = "duke2025-esa03-backup-storage"
BASE_DIR = "/mnt/btrfs_project_drive/snapshots/"
UPLOAD_PATH = "/mnt/btrfs_project_drive/upload_files.txt"
DATE_FORMAT = '%d%b%Y-%H-%M'
# decrypt_aes_cbc(ciphertext, key)
# parameters: ciphertext the encrypted data, with the 16-byte IV prepended
#             key the AES key
# description: splits off the leading 16-byte IV, decrypts the remainder in
#              CBC mode, and strips the PKCS#7 padding
# returns: the decrypted plaintext bytes
def decrypt_aes_cbc(ciphertext, key):
    iv, body = ciphertext[:16], ciphertext[16:]
    aes = AES.new(key, AES.MODE_CBC, iv)
    return unpad(aes.decrypt(body), AES.block_size)
# get_file_metadata(lines, filehash)
# parameters: lines a list of the lines from the metadata file
#             filehash the specific hash we are looking for
# description: scans the metadata lines and hands back the first one that
#              mentions the requested file hash
# returns: the matching metadata line, or None when no line matches
def get_file_metadata(lines, filehash):
    return next((entry for entry in lines if filehash in entry), None)
# hash_file(filename, buf_size)
# parameters: filename the path of the file to hash
#             buf_size optional read-chunk size in bytes; defaults to the
#             module-level BUF_SIZE when not given
# description: given the file path, this function reads the file in chunks
#              and feeds each chunk into a SHA-256 hash. the file is opened
#              with a context manager so the descriptor is always closed
#              (the previous version leaked the open file handle).
# returns: the hex digest of the entire file's contents
def hash_file(filename, buf_size=None):
    if buf_size is None:
        buf_size = BUF_SIZE
    hash_function = hashlib.new('sha256')
    with open(filename, 'rb') as fp:
        data = fp.read(buf_size)
        while data:
            hash_function.update(data)
            data = fp.read(buf_size)
    return hash_function.hexdigest()
# hash_dir(directory)
# parameters: directory the directory to traverse
# description: recursively walks the backup directory, hashing every regular
#              file and recording each one in the per-backup metadata file.
#              files whose hash is not already present in the combined S3
#              metadata are also appended to the upload list. relies on
#              module globals prepared by main(): backup_metadata (this
#              backup's metadata file), upload_list (the upload list file),
#              meta_name (path of the metadata file, skipped during the
#              walk), and lines (metadata lines fetched from S3)
def hash_dir(directory):
    global backup_metadata
    global upload_list
    global meta_name
    global lines
    # traverse directory to inspect each file
    for entry in os.scandir(directory):
        # path relative to the snapshot root; used in every metadata record
        path = entry.path.replace(BASE_DIR, '')
        # if current entry is a subdirectory, record it and recursively explore
        if entry.is_dir():
            print("subdir:\t\t" + path)
            backup_metadata.write("directory$" + path + "\n")
            hash_dir(entry.path)
        # skip this backup's own metadata file so it never hashes itself
        elif entry.is_file() and entry.path != meta_name:
            print("file:\t\t" + path)
            hash_result = hash_file(entry.path)
            # record format: <sha256>$<relative path>
            file_output = hash_result + "$" + path + "\n"
            file_metadata = get_file_metadata(lines, hash_result)
            # if file is not known on S3, add to list to be uploaded
            if file_metadata is None:
                upload_list.write(file_output)
                # remember the new hash so an identical file seen later in
                # this same run is not queued for upload a second time
                lines.append(file_output)
                print("uploading: " + path)
            # every file is listed in the metadata, uploaded or not
            backup_metadata.write(file_output)
# get_metadata()
# description: this function retrieves all metadata files from s3 and combines
#              them into a list of all the files on the server. each metadata
#              file is gzip-compressed then AES-CBC encrypted, so each is
#              decrypted and decompressed before its lines are collected.
# returns: the lines of the metadata files, or an empty list if s3 is empty
def get_metadata():
    s3 = boto3.client('s3')
    files = []
    lines = []
    # retrieve encryption key
    with open('aes_key.bin', 'rb') as key_file:
        key = key_file.read()
    # collect metadata file keys from s3. the paginator follows continuation
    # tokens, so listings beyond the 1000-key page limit are not missed
    # (a single list_objects_v2 call silently truncates at 1000 keys)
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix='snapshots/'):
        # pages with no keys simply contribute nothing
        for obj in page.get('Contents', []):
            if "metadata.txt" in obj['Key']:
                files.append(obj['Key'])
    # decrypt metadata files and add to list
    for file in files:
        obj_response = s3.get_object(Bucket=BUCKET_NAME, Key=file)
        encrypted_data = obj_response['Body'].read()
        decrypted_data = decrypt_aes_cbc(encrypted_data, key)
        content = gzip.decompress(decrypted_data).decode('utf-8')
        lines += content.splitlines()
    return lines
# main(args)
# parameters: args the command line arguments; args[1] is the directory name
#             of the target backup to dedup (relative to BASE_DIR)
# description: this function drives the program, opening metadata files and
#              initiating the deduplication process. exits with status 1
#              when the backup directory argument is missing.
def main(args):
    if len(args) != 2:
        sys.exit(1)
    global backup_metadata
    global upload_list
    global meta_name
    global lines
    target_directory = BASE_DIR + args[1]
    meta_name = target_directory + "/metadata.txt"
    lines = get_metadata()
    # mode "w+" already truncates, so no explicit truncate(0) is needed;
    # the context managers guarantee both files are closed even on error
    with open(meta_name, "w+") as backup_metadata, \
            open(UPLOAD_PATH, "w+") as upload_list:
        hash_dir(target_directory)
        # flush the metadata file before hashing it, otherwise hash_file
        # can read a partially-buffered file and record the wrong hash
        backup_metadata.flush()
        # add metadata file to upload list
        hash_result = hash_file(meta_name)
        file_output = hash_result + "$" + meta_name.replace(BASE_DIR, '') + "\n"
        upload_list.write(file_output)
if __name__ == '__main__':
    main(sys.argv)