Merge pull request #27 from brootware/refactor-regexes

brootware · web-flow · commit 31c5662101ff · 2022-05-17T20:24:06.000+08:00
Refactor regexes
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,10 @@ redacted_*
 ip_test.txt
 multiredact
 manhours*
+hashshadow*
+.hashshadow*
+*.txt
+.vscode
 # Ignoring sensitive files and directories.
 
 secret*.*
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyredactkit"
-version = "0.1.5"
+version = "0.1.6"
 description = "Python cli tool to redact sensitive data"
 authors = ["brootware <brootware@outlook.com>"]
 license = "GNU"
diff --git a/pyredactkit/pyredactkit.py b/pyredactkit/pyredactkit.py
@@ -4,10 +4,11 @@
 """
 
 import argparse
+from ast import arg
 from pyredactkit.redact import Redactor
+from pyredactkit.unredact import Unredactor
 import os
 import glob
-# lintertest
 
 banner = """
     ______       ______         _            _     _   ___ _   
@@ -35,8 +36,19 @@ def main():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument(
-        "file", nargs="+",
-        help="Path of a file or a directory of files"
+        "file",
+        nargs="+",
+        help="""
+        Path of a file or a directory of files.
+        Usage: pyredactkit [file/filestoredact]"""
+    )
+    parser.add_argument(
+        "-u",
+        "--unredact",
+        help="""
+        Option to unredact masked data.
+        Usage: pyredactkit [redactedfile] -u [.hashshadow.json]
+        """
     )
     parser.add_argument(
         "-t", "--redactiontype",
@@ -47,12 +59,16 @@ def main():
         emails,
         ipv4,
         ipv6,
-        base64"""
+        base64.
+        Usage: pyredactkit [file/filestoredact] -t ip"""
     )
     parser.add_argument(
         "-d",
         "--dirout",
-        help="Output directory of the file"
+        help="""
+        Output directory of the file.
+        Usage: pyredactkit [file/filestoredact] -d [redacted_dir]
+        """
     )
     parser.add_argument(
         '-r',
@@ -82,13 +98,16 @@ def main():
 
     # redact file
     redact_obj = Redactor()
+    unredact_obj = Unredactor()
 
     for file in files:
         if args.redactiontype:
             redact_obj.process_file(file, args.redactiontype)
         elif args.dirout:
             redact_obj.process_file(file, args.redactiontype, args.dirout)
             redact_obj.process_report(file, args.dirout)
+        elif args.unredact:
+            unredact_obj.unredact(file, args.unredact)
         else:
             redact_obj.process_file(file)
             redact_obj.process_report(file)
diff --git a/pyredactkit/redact.py b/pyredactkit/redact.py
@@ -5,6 +5,8 @@
 import sys
 import re
 import math
+import json
+import uuid
 
 from pyredactkit.identifiers import Identifier
 
@@ -79,6 +81,20 @@ def allowed_file(self, file):
             return False
         return mimetypes.guess_type(file)[0] in self.get_allowed_files()
 
+    def write_hashmap(self, hash_map=dict, filename=str):
+        """Function that writes a .hashshadow_file.txt.json to os directory.
+        Args:
+            hash_map (dictionary): dictionary object to be written to file.
+            filename (str): name of supplied file
+
+        Returns:
+            None. Writes .hashshadow_<filename>.json relative to the current working directory.
+        """
+        with open(f".hashshadow_{os.path.basename(filename)}.json", "w", encoding="utf-8") as file:
+            json.dump(hash_map, file)
+            print(
+                f"[ + ].hashshadow_{os.path.basename(filename)}.json file generated. Keep this safe if you need to undo the redaction.")
+
     def valid_options(self):
         """Function to read in valid options from Identifier.regexes
         Args:
@@ -92,24 +108,32 @@ def valid_options(self):
             option_tuple += id['type']
         return option_tuple
 
-    def redact_specific(self, line=str, option=str):
+    def redact_specific(self, line=str, option=str, filename=str):
         """Function to redact specific option
         Args:
             line (str) : line to be supplied to redact
             option (str): (optional) choice for redaction
+            filename (str): name of supplied file
 
         Returns:
-            redacted_line (str): redacted line
+            line (str): redacted line
         """
-        redacted_line = ''
+        hash_map = {}
 
         for id in id_object.regexes:
             redact_pattern = id['pattern']
-            if option in id['type']:
-                redacted_line = re.sub(
-                    redact_pattern, self.block, line, flags=re.IGNORECASE)
-
-        return redacted_line
+            if option in id['type'] and re.search(
+                    redact_pattern, line, flags=re.IGNORECASE):
+                pattern_string = re.search(
+                    redact_pattern, line, flags=re.IGNORECASE)
+                pattern_string = pattern_string.group(0)
+                masked_data = str(uuid.uuid4())
+                hash_map.update({masked_data: pattern_string})
+                line = re.sub(
+                    redact_pattern, masked_data, line, flags=re.IGNORECASE)
+
+        self.write_hashmap(hash_map, filename)
+        return line
 
     def redact_name(self, data=str):
         """Main function to redact
@@ -136,6 +160,7 @@ def process_file(self, filename, option=str, savedir="./"):
             Creates redacted file.
         """
         count = 0
+        hash_map = {}
         options_list = self.valid_options()
         try:
             # Open a file read pointer as target_file
@@ -170,11 +195,19 @@ def process_file(self, filename, option=str, savedir="./"):
                             f"[ + ] No option supplied, will be redacting all the sensitive data supported")
                         for line in target_file:
                             for p in id_object.regexes:
-                                if re.search(p['pattern'], line, flags=re.IGNORECASE):
+                                redact_pattern = p['pattern']
+                                if re.search(redact_pattern, line, flags=re.IGNORECASE):
                                     count += 1
-                                    line = re.sub(p['pattern'], self.block, line,
+                                    pattern_string = re.search(
+                                        redact_pattern, line, flags=re.IGNORECASE)
+                                    pattern_string = pattern_string.group(0)
+                                    masked_data = str(uuid.uuid4())
+                                    hash_map.update(
+                                        {masked_data: pattern_string})
+                                    line = re.sub(redact_pattern, masked_data, line,
                                                   flags=re.IGNORECASE)
                             result.write(line)
+                        self.write_hashmap(hash_map, filename)
                     # Separate option to redact names
                     elif option in ("name", "names"):
                         content = target_file.read()
@@ -193,7 +226,7 @@ def process_file(self, filename, option=str, savedir="./"):
                             for id in id_object.regexes:
                                 if option in id['type'] and re.search(id['pattern'], line, flags=re.IGNORECASE):
                                     count += 1
-                            line = self.redact_specific(line, option)
+                            line = self.redact_specific(line, option, filename)
                             result.write(line)
 
                     print(f"[ + ] Redacted {count} targets...")
diff --git a/pyredactkit/unredact.py b/pyredactkit/unredact.py
@@ -0,0 +1,59 @@
+import json
+import sys
+import os
+
+
+class Unredactor:
+    """Unredactor class
+    Class containing all methods to support un-redaction
+    of masked data
+
+    """
+
+    def __init__(self):
+        """
+        Class Initialization
+        Args:
+            None
+
+        Returns:
+            None
+        """
+
+    def replace_all(self, text, dictionary):
+        """Function to replace all the text from string
+        Args:
+            text (str): A line of text in string format
+            dictionary (dict): A key value pair of masked data and original data
+
+        Returns:
+            text (str): A line of text after replacing masked data with original data
+        """
+        for k, v in dictionary.items():
+            text = text.replace(k, v)
+        return text
+
+    def unredact(self, redacted_file=str, lookup_file=str):
+        """Function to unredact masked data and produce the original unredacted data.
+        Args:
+            redacted_file (str): Name of the redacted file
+            lookup_file (str): Name of the file to look up key value map of masked data and original data.
+
+
+        Returns:
+            Writes unredacted_file.txt with original unmasked data.
+        """
+        with open(redacted_file, encoding="utf-8") as redacted_target:
+            try:
+                with open(lookup_file, encoding="utf-8") as lookup_target:
+                    content = json.load(lookup_target)
+                    with open(f"unredacted_{os.path.basename(redacted_file)}", "w", encoding="utf-8") as write_file:
+                        for line in redacted_target:
+                            line = self.replace_all(line, content)
+                            write_file.write(line)
+                        print(
+                            f"[ + ] Unredacted results saved to unredacted_{os.path.basename(redacted_file)}")
+            except FileNotFoundError:
+                sys.exit(f"[ - ] {lookup_file} file was not found")
+            except json.JSONDecodeError:
+                sys.exit(f"[ - ] Issue decoding {lookup_file} file")
diff --git a/test/file_handling.py b/test/file_handling.py
@@ -0,0 +1,34 @@
+import os
+
+
+def read_file(savedir="./"):
+    with open("test.txt", encoding="utf-8") as target_file:
+        if savedir != "./" and savedir[-1] != "/":
+            savedir = savedir + "/"
+
+        # create the directory if it is not present
+        if not os.path.exists(os.path.dirname(savedir)):
+            print(
+                "[ + ] "
+                + os.path.dirname(savedir)
+                + " directory does not exist, creating it."
+            )
+            os.makedirs(os.path.dirname(savedir))
+
+        content = target_file.read()
+
+    return content
+
+
+# def process_report(savedir="./"):
+#     content = read_file()
+#     with open(
+#         f"{savedir}redacted_{os.path.basename('test.txt')}",
+#         "w",
+#         encoding="utf-8",
+#     ) as result:
+#         for line in content:
+#             print(line)
+
+
+# process_report()
diff --git a/test/hash_test.py b/test/hash_test.py
@@ -1,6 +1,9 @@
 import re
 import hashlib
 import json
+import os
+import sys
+import uuid
 
 block = "\u2588" * 15
 data = """
@@ -48,38 +51,40 @@
 ]
 
 
-def write_hashmap(hashed_string, pattern_string, count):
-    hash_map = []
-    for i in range(0, count, 1):
-        hash_map.append({hashed_string: pattern_string})
-    with open("hashmap.txt", "w", encoding="utf-8") as file:
+def write_hashmap(hash_map):
+    with open(".hashshadow.json", "w", encoding="utf-8") as file:
         json.dump(hash_map, file)
 
 
-count = 0
-pattern_string_list = []
-with open("hashtest.txt", encoding="utf-8") as target_file:
-    with open(
-        f"redacted_test.txt",
-        "w",
-        encoding="utf-8",
-    ) as result:
-        for line in target_file:
-            for id in regexes:
-                redact_pattern = id['pattern']
-                if re.search(redact_pattern, line, flags=re.IGNORECASE):
-                    count += 1
-                    pattern_string = re.search(
-                        redact_pattern, line, flags=re.IGNORECASE)
-                    pattern_string = pattern_string.group(0)
-                    hashed_string = hashlib.sha256(
-                        pattern_string.encode()).hexdigest()
-                    # print(pattern_string.group(0))
-                    # with open("hashmap.txt", "w", encoding="utf-8") as file:
-                    #     file.writelines(f"{hashed_string}:{pattern_string}\n")
-                    write_hashmap(hashed_string, pattern_string,
-                                  count)
-                    line = re.sub(redact_pattern, hashed_string, line,
-                                  flags=re.IGNORECASE)
+def salt_hash(to_hash):
+    salt = os.urandom(32)  # A new salt to be appended to string
+    masked_data = hashlib.sha256(to_hash.encode() + salt).hexdigest()
+    return masked_data
 
-            result.write(line)
+
+def process_redact():
+    hash_map = {}
+    with open("test.txt", encoding="utf-8") as target_file:
+        with open(
+            f"redacted_test.txt",
+            "w",
+            encoding="utf-8",
+        ) as result:
+            for line in target_file:
+                for id in regexes:
+                    redact_pattern = id['pattern']
+                    if re.search(redact_pattern, line, flags=re.IGNORECASE):
+                        pattern_string = re.search(
+                            redact_pattern, line, flags=re.IGNORECASE)
+                        pattern_string = pattern_string.group(0)
+                        masked_data = str(uuid.uuid4())
+                        hash_map.update({masked_data: pattern_string})
+                        line = re.sub(redact_pattern, masked_data, line,
+                                      flags=re.IGNORECASE)
+
+                result.write(line)
+
+    write_hashmap(hash_map)
+
+
+process_redact()
diff --git a/test/hashmap.txt b/test/hashmap.txt
diff --git a/test/hashtest.txt b/test/hashtest.txt
diff --git a/test/unredact.py b/test/unredact.py