Skip to content

Commit 31c5662

Browse files
authored
Merge pull request #27 from brootware/refactor-regexes
Refactor regexes
2 parents c5d0f8d + a9591c7 commit 31c5662

File tree

10 files changed

+234
-49
lines changed

10 files changed

+234
-49
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ redacted_*
66
ip_test.txt
77
multiredact
88
manhours*
9+
hashshadow*
10+
.hashshadow*
11+
*.txt
12+
.vscode
913
# Ignoring sensitive files and directories.
1014

1115
secret*.*

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pyredactkit"
3-
version = "0.1.5"
3+
version = "0.1.6"
44
description = "Python cli tool to redact sensitive data"
55
authors = ["brootware <[email protected]>"]
66
license = "GNU"

pyredactkit/pyredactkit.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
"""
55

66
import argparse
7+
from ast import arg
78
from pyredactkit.redact import Redactor
9+
from pyredactkit.unredact import Unredactor
810
import os
911
import glob
10-
# lintertest
1112

1213
banner = """
1314
______ ______ _ _ _ ___ _
@@ -35,8 +36,19 @@ def main():
3536
formatter_class=argparse.ArgumentDefaultsHelpFormatter
3637
)
3738
parser.add_argument(
38-
"file", nargs="+",
39-
help="Path of a file or a directory of files"
39+
"file",
40+
nargs="+",
41+
help="""
42+
Path of a file or a directory of files.
43+
Usage: pyredactkit [file/filestoredact]"""
44+
)
45+
parser.add_argument(
46+
"-u",
47+
"--unredact",
48+
help="""
49+
Option to unredact masked data.
50+
Usage: pyredactkit [redactedfile] -u [.hashshadow.json]
51+
"""
4052
)
4153
parser.add_argument(
4254
"-t", "--redactiontype",
@@ -47,12 +59,16 @@ def main():
4759
emails,
4860
ipv4,
4961
ipv6,
50-
base64"""
62+
base64.
63+
Usage: pyredactkit [file/filestoredact] -t ip"""
5164
)
5265
parser.add_argument(
5366
"-d",
5467
"--dirout",
55-
help="Output directory of the file"
68+
help="""
69+
Output directory of the file.
70+
Usage: pyredactkit [file/filestoredact] -d [redacted_dir]
71+
"""
5672
)
5773
parser.add_argument(
5874
'-r',
@@ -82,13 +98,16 @@ def main():
8298

8399
# redact file
84100
redact_obj = Redactor()
101+
unredact_obj = Unredactor()
85102

86103
for file in files:
87104
if args.redactiontype:
88105
redact_obj.process_file(file, args.redactiontype)
89106
elif args.dirout:
90107
redact_obj.process_file(file, args.redactiontype, args.dirout)
91108
redact_obj.process_report(file, args.dirout)
109+
elif args.unredact:
110+
unredact_obj.unredact(file, args.unredact)
92111
else:
93112
redact_obj.process_file(file)
94113
redact_obj.process_report(file)

pyredactkit/redact.py

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import sys
66
import re
77
import math
8+
import json
9+
import uuid
810

911
from pyredactkit.identifiers import Identifier
1012

@@ -79,6 +81,20 @@ def allowed_file(self, file):
7981
return False
8082
return mimetypes.guess_type(file)[0] in self.get_allowed_files()
8183

84+
def write_hashmap(self, hash_map=dict, filename=str):
85+
"""Function that writes a .hashshadow_<filename>.json lookup file to the current working directory.
86+
Args:
87+
hash_map (dictionary): dictionary object to be written to file.
88+
filename (str): name of supplied file
89+
90+
Returns:
91+
Writes .hashshadow_<filename>.json to the current working directory
92+
"""
93+
with open(f".hashshadow_{os.path.basename(filename)}.json", "w", encoding="utf-8") as file:
94+
json.dump(hash_map, file)
95+
print(
96+
f"[ + ].hashshadow_{os.path.basename(filename)}.json file generated. Keep this safe if you need to undo the redaction.")
97+
8298
def valid_options(self):
8399
"""Function to read in valid options from Identifier.regexes
84100
Args:
@@ -92,24 +108,32 @@ def valid_options(self):
92108
option_tuple += id['type']
93109
return option_tuple
94110

95-
def redact_specific(self, line=str, option=str):
111+
def redact_specific(self, line=str, option=str, filename=str):
96112
"""Function to redact specific option
97113
Args:
98114
line (str) : line to be supplied to redact
99115
option (str): (optional) choice for redaction
116+
filename (str): name of supplied file
100117
101118
Returns:
102-
redacted_line (str): redacted line
119+
line (str): redacted line
103120
"""
104-
redacted_line = ''
121+
hash_map = {}
105122

106123
for id in id_object.regexes:
107124
redact_pattern = id['pattern']
108-
if option in id['type']:
109-
redacted_line = re.sub(
110-
redact_pattern, self.block, line, flags=re.IGNORECASE)
111-
112-
return redacted_line
125+
if option in id['type'] and re.search(
126+
redact_pattern, line, flags=re.IGNORECASE):
127+
pattern_string = re.search(
128+
redact_pattern, line, flags=re.IGNORECASE)
129+
pattern_string = pattern_string.group(0)
130+
masked_data = str(uuid.uuid4())
131+
hash_map.update({masked_data: pattern_string})
132+
line = re.sub(
133+
redact_pattern, masked_data, line, flags=re.IGNORECASE)
134+
135+
self.write_hashmap(hash_map, filename)
136+
return line
113137

114138
def redact_name(self, data=str):
115139
"""Main function to redact
@@ -136,6 +160,7 @@ def process_file(self, filename, option=str, savedir="./"):
136160
Creates redacted file.
137161
"""
138162
count = 0
163+
hash_map = {}
139164
options_list = self.valid_options()
140165
try:
141166
# Open a file read pointer as target_file
@@ -170,11 +195,19 @@ def process_file(self, filename, option=str, savedir="./"):
170195
f"[ + ] No option supplied, will be redacting all the sensitive data supported")
171196
for line in target_file:
172197
for p in id_object.regexes:
173-
if re.search(p['pattern'], line, flags=re.IGNORECASE):
198+
redact_pattern = p['pattern']
199+
if re.search(redact_pattern, line, flags=re.IGNORECASE):
174200
count += 1
175-
line = re.sub(p['pattern'], self.block, line,
201+
pattern_string = re.search(
202+
redact_pattern, line, flags=re.IGNORECASE)
203+
pattern_string = pattern_string.group(0)
204+
masked_data = str(uuid.uuid4())
205+
hash_map.update(
206+
{masked_data: pattern_string})
207+
line = re.sub(redact_pattern, masked_data, line,
176208
flags=re.IGNORECASE)
177209
result.write(line)
210+
self.write_hashmap(hash_map, filename)
178211
# Separate option to redact names
179212
elif option in ("name", "names"):
180213
content = target_file.read()
@@ -193,7 +226,7 @@ def process_file(self, filename, option=str, savedir="./"):
193226
for id in id_object.regexes:
194227
if option in id['type'] and re.search(id['pattern'], line, flags=re.IGNORECASE):
195228
count += 1
196-
line = self.redact_specific(line, option)
229+
line = self.redact_specific(line, option, filename)
197230
result.write(line)
198231

199232
print(f"[ + ] Redacted {count} targets...")

pyredactkit/unredact.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import json
2+
import sys
3+
import os
4+
5+
6+
class Unredactor:
7+
"""Unredactor class
8+
Class containing all methods to support un-redaction
9+
of masked data
10+
11+
"""
12+
13+
def __init__(self):
14+
"""
15+
Class Initialization
16+
Args:
17+
None
18+
19+
Returns:
20+
None
21+
"""
22+
23+
def replace_all(self, text, dictionary):
24+
"""Function to replace every masked value found in a string with its original data
25+
Args:
26+
text (str): A line of text in string format
27+
dictionary (dict): A key value pair of masked data and original data
28+
29+
Returns:
30+
text (str): A line of text after replacing masked data with original data
31+
"""
32+
for k, v in dictionary.items():
33+
text = text.replace(k, v)
34+
return text
35+
36+
def unredact(self, redacted_file=str, lookup_file=str):
37+
"""Function to unredact masked data and produce the original unredacted data.
38+
Args:
39+
redacted_file (str): Name of the redacted file
40+
lookup_file (str): Name of the file to look up key value map of masked data and original data.
41+
42+
43+
Returns:
44+
Writes unredacted_file.txt with original unmasked data.
45+
"""
46+
with open(redacted_file, encoding="utf-8") as redacted_target:
47+
try:
48+
with open(lookup_file, encoding="utf-8") as lookup_target:
49+
content = json.load(lookup_target)
50+
with open(f"unredacted_{os.path.basename(redacted_file)}", "w", encoding="utf-8") as write_file:
51+
for line in redacted_target:
52+
line = self.replace_all(line, content)
53+
write_file.write(line)
54+
print(
55+
f"[ + ] Unredacted results saved to unredacted_{os.path.basename(redacted_file)}")
56+
except FileNotFoundError:
57+
sys.exit(f"[ - ] {lookup_file} file was not found")
58+
except json.JSONDecodeError:
59+
sys.exit(f"[ - ] Issue decoding {lookup_file} file")

test/file_handling.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import os
2+
3+
4+
def read_file(savedir="./"):
5+
with open("test.txt", encoding="utf-8") as target_file:
6+
if savedir != "./" and savedir[-1] != "/":
7+
savedir = savedir + "/"
8+
9+
# create the directory if not present
10+
if not os.path.exists(os.path.dirname(savedir)):
11+
print(
12+
"[ + ] "
13+
+ os.path.dirname(savedir)
14+
+ " directory does not exist, creating it."
15+
)
16+
os.makedirs(os.path.dirname(savedir))
17+
18+
content = target_file.read()
19+
20+
return content
21+
22+
23+
# def process_report(savedir="./"):
24+
# content = read_file()
25+
# with open(
26+
# f"{savedir}redacted_{os.path.basename('test.txt')}",
27+
# "w",
28+
# encoding="utf-8",
29+
# ) as result:
30+
# for line in content:
31+
# print(line)
32+
33+
34+
# process_report()

test/hash_test.py

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import re
22
import hashlib
33
import json
4+
import os
5+
import sys
6+
import uuid
47

58
block = "\u2588" * 15
69
data = """
@@ -48,38 +51,40 @@
4851
]
4952

5053

51-
def write_hashmap(hashed_string, pattern_string, count):
52-
hash_map = []
53-
for i in range(0, count, 1):
54-
hash_map.append({hashed_string: pattern_string})
55-
with open("hashmap.txt", "w", encoding="utf-8") as file:
54+
def write_hashmap(hash_map):
55+
with open(".hashshadow.json", "w", encoding="utf-8") as file:
5656
json.dump(hash_map, file)
5757

5858

59-
count = 0
60-
pattern_string_list = []
61-
with open("hashtest.txt", encoding="utf-8") as target_file:
62-
with open(
63-
f"redacted_test.txt",
64-
"w",
65-
encoding="utf-8",
66-
) as result:
67-
for line in target_file:
68-
for id in regexes:
69-
redact_pattern = id['pattern']
70-
if re.search(redact_pattern, line, flags=re.IGNORECASE):
71-
count += 1
72-
pattern_string = re.search(
73-
redact_pattern, line, flags=re.IGNORECASE)
74-
pattern_string = pattern_string.group(0)
75-
hashed_string = hashlib.sha256(
76-
pattern_string.encode()).hexdigest()
77-
# print(pattern_string.group(0))
78-
# with open("hashmap.txt", "w", encoding="utf-8") as file:
79-
# file.writelines(f"{hashed_string}:{pattern_string}\n")
80-
write_hashmap(hashed_string, pattern_string,
81-
count)
82-
line = re.sub(redact_pattern, hashed_string, line,
83-
flags=re.IGNORECASE)
59+
def salt_hash(to_hash):
60+
salt = os.urandom(32) # A new salt to be appended to string
61+
masked_data = hashlib.sha256(to_hash.encode() + salt).hexdigest()
62+
return masked_data
8463

85-
result.write(line)
64+
65+
def process_redact():
66+
hash_map = {}
67+
with open("test.txt", encoding="utf-8") as target_file:
68+
with open(
69+
f"redacted_test.txt",
70+
"w",
71+
encoding="utf-8",
72+
) as result:
73+
for line in target_file:
74+
for id in regexes:
75+
redact_pattern = id['pattern']
76+
if re.search(redact_pattern, line, flags=re.IGNORECASE):
77+
pattern_string = re.search(
78+
redact_pattern, line, flags=re.IGNORECASE)
79+
pattern_string = pattern_string.group(0)
80+
masked_data = str(uuid.uuid4())
81+
hash_map.update({masked_data: pattern_string})
82+
line = re.sub(redact_pattern, masked_data, line,
83+
flags=re.IGNORECASE)
84+
85+
result.write(line)
86+
87+
write_hashmap(hash_map)
88+
89+
90+
process_redact()

0 commit comments

Comments
 (0)