Skip to content

Commit 1368700

Browse files
authored
Add noise argument for obfuscation (#278)
* add noise for obfuscation * fix
1 parent ff9f12f commit 1368700

File tree

2 files changed

+36
-29
lines changed

2 files changed

+36
-29
lines changed

download_data.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -239,12 +239,12 @@ def check_snapshot_meta(snapshot: dict) -> int:
239239
return result
240240

241241

242-
def main(args: Namespace):
243-
if os.path.exists(args.data_dir):
244-
if not args.clean_data:
245-
raise FileExistsError(f"{args.data_dir} directory already exists. "
242+
def process(arguments: Namespace) -> int:
243+
if os.path.exists(arguments.data_dir):
244+
if not arguments.clean_data:
245+
raise FileExistsError(f"{arguments.data_dir} directory already exists. "
246246
f"Please remove it or select other directory.")
247-
shutil.rmtree(args.data_dir)
247+
shutil.rmtree(arguments.data_dir)
248248

249249
with open("snapshot.json", encoding="utf_8") as f:
250250
snapshot = json.load(f)
@@ -253,31 +253,34 @@ def main(args: Namespace):
253253
logger.critical(f"Check logs, fix and restart if necessary: {new_meta_count}")
254254
return 1
255255

256-
jobs = 1 if not args.jobs else max(1, int(args.jobs))
257-
if not args.skip_download:
256+
jobs = 1 if not arguments.jobs else max(1, int(arguments.jobs))
257+
if not arguments.skip_download:
258258
logger.info("Start download")
259259
os.makedirs(TMP_DIR, exist_ok=True)
260260
download(snapshot, jobs)
261261
logger.info("Download finished. Now processing the files...")
262262
else:
263263
logger.info("Download skipped. Now processing the files...")
264264

265-
removed_meta = move_files(snapshot, args.data_dir)
265+
removed_meta = move_files(snapshot, arguments.data_dir)
266266
# check whether there were issues with downloading
267267
assert 0 == len(removed_meta), removed_meta
268268
logger.info("Finalizing dataset. Please wait a moment...")
269-
obfuscate_creds("meta", args.data_dir)
270-
logger.info(f"Done! All files saved to {args.data_dir}")
269+
obfuscate_creds("meta", arguments.data_dir, arguments.noise)
270+
logger.info(f"Done! All files saved to {arguments.data_dir}")
271271
return 0
272272

273273

274-
if __name__ == "__main__":
274+
def main(argv) -> int:
275275
parser = ArgumentParser(prog="python download_data.py")
276-
277276
parser.add_argument("--data_dir", dest="data_dir", default="data", help="Dataset location after download")
278277
parser.add_argument("--jobs", dest="jobs", help="Jobs for multiprocessing")
279278
parser.add_argument("--skip_download", help="Skip download", action="store_const", const=True)
280279
parser.add_argument("--clean_data", help="Recreate data dir", action="store_const", const=True)
281-
_args = parser.parse_args()
280+
parser.add_argument("--noise", help="Seed perturbation", type=int, default=0)
281+
arguments = parser.parse_args(argv[1:])
282+
return process(arguments)
283+
282284

283-
sys.exit(main(_args))
285+
if __name__ == """__main__""":
286+
sys.exit(main(sys.argv))

obfuscate_creds.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ def gen_random_value(value):
397397
return obfuscated_value
398398

399399

400-
def replace_rows(data: List[MetaRow], lines: List[str]):
400+
def replace_rows(data: List[MetaRow], lines: List[str], noise: int):
401401
# Change data in already copied files
402402
for row in data:
403403
# PEM keys and other multi-line credentials are processed in another function
@@ -418,7 +418,7 @@ def replace_rows(data: List[MetaRow], lines: List[str]):
418418
old_line = lines[row.LineStart - 1]
419419
value = old_line[row.ValueStart:row.ValueEnd]
420420
# CredSweeper may scan huge lines since v1.6
421-
random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16))
421+
random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise)
422422
obfuscated_value = get_obfuscated_value(value, row)
423423
new_line = old_line[:row.ValueStart] + obfuscated_value + old_line[row.ValueEnd:]
424424

@@ -532,7 +532,7 @@ def create_new_multiline(lines: List[str], starting_position: int):
532532
return new_lines
533533

534534

535-
def process_pem_key(row: MetaRow, lines:List[str]):
535+
def process_pem_key(row: MetaRow, lines: List[str], noise: int):
536536
# Change data in already copied files (only keys)
537537
try:
538538
# Skip credentials that are not PEM or multiline
@@ -543,7 +543,7 @@ def process_pem_key(row: MetaRow, lines:List[str]):
543543
# skip double obfuscation for the categories
544544
return
545545

546-
random.seed(row.LineStart ^ int(row.FileID, 16))
546+
random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise)
547547

548548
if '' != row.CryptographyKey:
549549
new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd])
@@ -557,13 +557,14 @@ def process_pem_key(row: MetaRow, lines:List[str]):
557557
logger.critical(exc)
558558
raise
559559

560-
def process_pem_keys(data: List[MetaRow], lines:List[str]):
560+
561+
def process_pem_keys(data: List[MetaRow], lines: List[str], noise: int):
561562
for row in data:
562563
if 'T' == row.GroundTruth and "Private Key" == row.Category:
563-
process_pem_key(row, lines)
564+
process_pem_key(row, lines, noise)
564565

565566

566-
def obfuscate_creds(meta_dir: str, dataset_dir: str):
567+
def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0):
567568
dataset_files = {}
568569
for meta_row in read_meta(meta_dir):
569570
meta_row.FilePath = meta_row.FilePath.replace("data", dataset_dir, 1)
@@ -581,25 +582,28 @@ def obfuscate_creds(meta_dir: str, dataset_dir: str):
581582
logger.critical(exc)
582583
raise
583584
meta_rows.sort(key=lambda x: (x.LineStart, x.LineEnd, x.ValueStart, x.ValueEnd))
584-
replace_rows(meta_rows, lines)
585-
process_pem_keys(meta_rows, lines)
585+
replace_rows(meta_rows, lines, noise)
586+
process_pem_keys(meta_rows, lines, noise)
586587

587588
with open(dataset_file, "w", encoding="utf8") as f:
588589
f.write('\n'.join(lines))
589590

590591

591-
def main(args: Namespace):
592-
obfuscate_creds(args.meta_dir, args.data_dir)
592+
def obfuscate(arguments: Namespace) -> int:
593+
obfuscate_creds(arguments.meta_dir, arguments.data_dir, arguments.noise)
593594
logger.info(f"Obfuscation was done")
594595
return 0
595596

596597

597-
if __name__ == "__main__":
598+
def main(argv) -> int:
598599
parser = ArgumentParser(prog="python obfuscate_creds.py")
599-
600600
parser.add_argument("--meta_dir", dest="meta_dir", default="meta", help="Dataset markup")
601601
parser.add_argument("--data_dir", dest="data_dir", default="data", help="Dataset location after download")
602602
parser.add_argument("--jobs", dest="jobs", help="Jobs for multiprocessing")
603-
_args = parser.parse_args()
603+
parser.add_argument("--noise", help="Seed perturbation", type=int, default=0)
604+
arguments = parser.parse_args(argv[1:])
605+
return obfuscate(arguments)
606+
604607

605-
sys.exit(main(_args))
608+
if __name__ == """__main__""":
609+
sys.exit(main(sys.argv))

0 commit comments

Comments
 (0)