55import sys
66import re
77import math
8+ import json
9+ import uuid
810
911from pyredactkit .identifiers import Identifier
1012
@@ -79,6 +81,20 @@ def allowed_file(self, file):
7981 return False
8082 return mimetypes .guess_type (file )[0 ] in self .get_allowed_files ()
8183
def write_hashmap(self, hash_map=dict, filename=str):
    """Persist the redaction map to ``.hashshadow_<basename>.json``.

    The file is created in the current working directory and is named
    after the base name of ``filename``.  If a shadow file with that name
    already exists and holds a JSON object, its entries are kept and the
    new ones are merged on top.  This method can be called several times
    for the same input file, and truncating the file on each call would
    discard the mappings recorded by earlier calls.

    Args:
        hash_map (dictionary): masked value -> original value pairs to be
            written to file.
        filename (str): name of supplied file; only its base name is used.

    Returns:
        None.  Writes the JSON file and prints a confirmation message.
    """
    shadow_name = f".hashshadow_{os.path.basename(filename)}.json"

    merged = {}
    try:
        with open(shadow_name, "r", encoding="utf-8") as existing:
            previous = json.load(existing)
        if isinstance(previous, dict):
            merged.update(previous)
    except (FileNotFoundError, ValueError):
        # No earlier shadow file, or one that is not valid JSON/UTF-8:
        # start from an empty map instead of failing the redaction run.
        pass
    merged.update(hash_map)

    with open(shadow_name, "w", encoding="utf-8") as file:
        json.dump(merged, file)
    print(
        f"[ + ]{shadow_name} file generated. Keep this safe if you need to undo the redaction.")
97+
8298 def valid_options (self ):
8399 """Function to read in valid options from Identifier.regexes
84100 Args:
@@ -92,24 +108,32 @@ def valid_options(self):
92108 option_tuple += id ['type' ]
93109 return option_tuple
94110
def redact_specific(self, line=str, option=str, filename=str):
    """Redact every occurrence of the chosen identifier type in one line.

    Each match is replaced by its own freshly generated UUID4 string.  The
    UUID -> original text pairs are passed to ``self.write_hashmap`` so
    that the redaction can be undone later.

    Args:
        line (str) : line to be supplied to redact
        option (str): choice for redaction; tested against each
            identifier's ``type`` entry with ``in``
        filename (str): name of supplied file, forwarded to
            ``write_hashmap``

    Returns:
        line (str): redacted line
    """
    hash_map = {}

    def mask(match):
        # One UUID per match: reusing a single UUID for every occurrence on
        # the line would make all but one original value unrecoverable.
        masked_data = str(uuid.uuid4())
        hash_map[masked_data] = match.group(0)
        return masked_data

    for identifier in id_object.regexes:
        if option in identifier['type']:
            # re.sub leaves the line untouched when the pattern does not
            # match, so no separate re.search pre-check is needed.
            line = re.sub(
                identifier['pattern'], mask, line, flags=re.IGNORECASE)

    # Nothing matched: do not rewrite the shadow file (or print the
    # confirmation) when there is no mapping to record.
    if hash_map:
        self.write_hashmap(hash_map, filename)
    return line
113137
114138 def redact_name (self , data = str ):
115139 """Main function to redact
@@ -136,6 +160,7 @@ def process_file(self, filename, option=str, savedir="./"):
136160 Creates redacted file.
137161 """
138162 count = 0
163+ hash_map = {}
139164 options_list = self .valid_options ()
140165 try :
141166 # Open a file read pointer as target_file
@@ -170,11 +195,19 @@ def process_file(self, filename, option=str, savedir="./"):
170195 f"[ + ] No option supplied, will be redacting all the sensitive data supported" )
171196 for line in target_file :
172197 for p in id_object .regexes :
173- if re .search (p ['pattern' ], line , flags = re .IGNORECASE ):
198+ redact_pattern = p ['pattern' ]
199+ if re .search (redact_pattern , line , flags = re .IGNORECASE ):
174200 count += 1
175- line = re .sub (p ['pattern' ], self .block , line ,
201+ pattern_string = re .search (
202+ redact_pattern , line , flags = re .IGNORECASE )
203+ pattern_string = pattern_string .group (0 )
204+ masked_data = str (uuid .uuid4 ())
205+ hash_map .update (
206+ {masked_data : pattern_string })
207+ line = re .sub (redact_pattern , masked_data , line ,
176208 flags = re .IGNORECASE )
177209 result .write (line )
210+ self .write_hashmap (hash_map , filename )
178211 # Separate option to redact names
179212 elif option in ("name" , "names" ):
180213 content = target_file .read ()
@@ -193,7 +226,7 @@ def process_file(self, filename, option=str, savedir="./"):
193226 for id in id_object .regexes :
194227 if option in id ['type' ] and re .search (id ['pattern' ], line , flags = re .IGNORECASE ):
195228 count += 1
196- line = self .redact_specific (line , option )
229+ line = self .redact_specific (line , option , filename )
197230 result .write (line )
198231
199232 print (f"[ + ] Redacted { count } targets..." )
0 commit comments