@@ -397,7 +397,7 @@ def gen_random_value(value):
397397 return obfuscated_value
398398
399399
400- def replace_rows (data : List [MetaRow ], lines : List [str ]):
400+ def replace_rows (data : List [MetaRow ], lines : List [str ], noise : int ):
401401 # Change data in already copied files
402402 for row in data :
403403 # PEM keys and other multiple-line credentials are processed in another function
@@ -418,7 +418,7 @@ def replace_rows(data: List[MetaRow], lines: List[str]):
418418 old_line = lines [row .LineStart - 1 ]
419419 value = old_line [row .ValueStart :row .ValueEnd ]
420420 # CredSweeper may scan huge lines since v1.6
421- random .seed ((row .ValueStart | (row .LineStart << 16 )) ^ int (row .FileID , 16 ))
421+ random .seed ((row .ValueStart | (row .LineStart << 16 )) ^ int (row .FileID , 16 ) ^ noise )
422422 obfuscated_value = get_obfuscated_value (value , row )
423423 new_line = old_line [:row .ValueStart ] + obfuscated_value + old_line [row .ValueEnd :]
424424
@@ -532,7 +532,7 @@ def create_new_multiline(lines: List[str], starting_position: int):
532532 return new_lines
533533
534534
535- def process_pem_key (row : MetaRow , lines :List [str ]):
535+ def process_pem_key (row : MetaRow , lines : List [str ], noise : int ):
536536 # Change data in already copied files (only keys)
537537 try :
538538 # Skip credentials that are not PEM or multiline
@@ -543,7 +543,7 @@ def process_pem_key(row: MetaRow, lines:List[str]):
543543 # skip double obfuscation for the categories
544544 return
545545
546- random .seed (row .LineStart ^ int (row .FileID , 16 ))
546+ random .seed (row .LineStart ^ int (row .FileID , 16 ) ^ noise )
547547
548548 if '' != row .CryptographyKey :
549549 new_lines = create_new_key (lines [row .LineStart - 1 :row .LineEnd ])
@@ -557,13 +557,14 @@ def process_pem_key(row: MetaRow, lines:List[str]):
557557 logger .critical (exc )
558558 raise
559559
def process_pem_keys(data: List[MetaRow], lines: List[str], noise: int):
    """Forward every confirmed private-key markup row to process_pem_key.

    A row is forwarded only when its GroundTruth is 'T' and its Category is
    "Private Key"; every other row is skipped.

    :param data: markup rows to inspect
    :param lines: list of text lines handed unchanged to process_pem_key
    :param noise: integer handed unchanged to process_pem_key, which XORs it
        into the seed of the random generator
    """
    for meta_row in data:
        # Guard clause: anything that is not a true "Private Key" row is ignored
        if 'T' != meta_row.GroundTruth or "Private Key" != meta_row.Category:
            continue
        process_pem_key(meta_row, lines, noise)
565566
566- def obfuscate_creds (meta_dir : str , dataset_dir : str ):
567+ def obfuscate_creds (meta_dir : str , dataset_dir : str , noise : int = 0 ):
567568 dataset_files = {}
568569 for meta_row in read_meta (meta_dir ):
569570 meta_row .FilePath = meta_row .FilePath .replace ("data" , dataset_dir , 1 )
@@ -581,25 +582,28 @@ def obfuscate_creds(meta_dir: str, dataset_dir: str):
581582 logger .critical (exc )
582583 raise
583584 meta_rows .sort (key = lambda x : (x .LineStart , x .LineEnd , x .ValueStart , x .ValueEnd ))
584- replace_rows (meta_rows , lines )
585- process_pem_keys (meta_rows , lines )
585+ replace_rows (meta_rows , lines , noise )
586+ process_pem_keys (meta_rows , lines , noise )
586587
587588 with open (dataset_file , "w" , encoding = "utf8" ) as f :
588589 f .write ('\n ' .join (lines ))
589590
590591
def obfuscate(arguments: Namespace) -> int:
    """Run the credential obfuscation with already parsed command-line options.

    :param arguments: namespace providing ``meta_dir``, ``data_dir`` and ``noise``
    :return: 0 once obfuscate_creds has returned
    """
    obfuscate_creds(arguments.meta_dir, arguments.data_dir, arguments.noise)
    # Plain literal: the message has no placeholders, so an f-string is not needed
    logger.info("Obfuscation was done")
    return 0
596597
def main(argv) -> int:
    """Parse the command line and start the obfuscation.

    :param argv: full argument vector; element 0 (the program name) is skipped
    :return: the value returned by obfuscate
    """
    # Options are registered in this order, which is also the order shown by --help
    option_table = (
        ("--meta_dir", {"dest": "meta_dir", "default": "meta", "help": "Dataset markup"}),
        ("--data_dir", {"dest": "data_dir", "default": "data", "help": "Dataset location after download"}),
        ("--jobs", {"dest": "jobs", "help": "Jobs for multiprocessing"}),
        ("--noise", {"help": "Seed perturbation", "type": int, "default": 0}),
    )
    cli = ArgumentParser(prog="python obfuscate_creds.py")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    parsed = cli.parse_args(argv[1:])
    return obfuscate(parsed)
604607
if __name__ == "__main__":
    # Hand the raw argument vector to main() and use its result as the exit status
    exit_status = main(sys.argv)
    sys.exit(exit_status)
0 commit comments