Skip to content

Commit 3410d73

Browse files
authored
PEM obfuscation (#289)
1 parent d408ab3 commit 3410d73

File tree

8 files changed

+110
-114
lines changed

8 files changed

+110
-114
lines changed

.ci/benchmark.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
META MD5 0f056a20273ef7291f1c4fb70600972e
2-
DATA MD5 2eed0cbe31bb629ffc744b916d064882
1+
META MD5 8fac9e2b7c95044650e74fff448ddf83
2+
DATA MD5 d4c0a42111715e6f26fa918cb41b290a
33
DATA: 16995334 interested lines. MARKUP: 63711 items
44
FileType FileNumber ValidLines Positives Negatives
55
--------------- ------------ ------------ ----------- -----------
@@ -94,7 +94,7 @@ FileType FileNumber ValidLines Positives Negatives
9494
.jwt 1 1 2
9595
.key 115 3067 105 11
9696
.ks 1 25 1
97-
.kt 120 19864 65 381
97+
.kt 120 19864 69 377
9898
.l 1 982 1
9999
.las 1 6656 36
100100
.lasso 1 230 7
@@ -225,7 +225,7 @@ FileType FileNumber ValidLines Positives Negatives
225225
.yml 560 56585 1896 1387
226226
.zsh 6 872 11
227227
.zsh-theme 1 97 1
228-
TOTAL: 11361 16995334 17155 53614
228+
TOTAL: 11361 16995334 17159 53610
229229
credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0
230230
Rules Positives Negatives Reported TP FP TN FN FPR FNR ACC PRC RCL F1
231231
------------------------------ ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ----
@@ -268,7 +268,7 @@ NTLM Token 4 0 0 0
268268
Nonce 131 109 0 0 109 131 0.000000 1.000000 0.454167 0.000000
269269
OTP / 2FA Secret 64 3 0 0 3 64 0.000000 1.000000 0.044776 0.000000
270270
Other 0 20 0 0 20 0 0.000000 1.000000
271-
PEM Private Key 1150 76 0 0 76 1150 0.000000 1.000000 0.061990 0.000000
271+
PEM Private Key 1154 72 0 0 72 1154 0.000000 1.000000 0.058728 0.000000
272272
Password 2597 11365 0 0 11365 2597 0.000000 1.000000 0.813995 0.000000
273273
Perplexity API Key 2 0 0 0 0 2 1.000000 0.000000 0.000000
274274
Postman Credentials 2 0 0 0 0 2 1.000000 0.000000 0.000000
@@ -283,4 +283,4 @@ Token 1140 5268 0 0
283283
Twilio Credentials 30 39 0 0 39 30 0.000000 1.000000 0.565217 0.000000
284284
URL Credentials 225 401 0 0 401 225 0.000000 1.000000 0.640575 0.000000
285285
UUID 2517 3716 0 0 3716 2517 0.000000 1.000000 0.596182 0.000000
286-
17155 53614 0 0 0 53614 17155 0.000000 1.000000 0.757592 0.000000
286+
17159 53610 0 0 0 53610 17159 0.000000 1.000000 0.757535 0.000000

benchmark/scanner/scanner.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from benchmark.common import GitService, LineStatus, Result, ScannerType
1515
from benchmark.scanner.file_type_stat import FileTypeStat
1616
from benchmark.scanner.true_false_counter import TrueFalseCounter
17+
from constants import LABEL_FALSE, LABEL_TRUE
1718
from meta_key import MetaKey
1819
from meta_row import _get_source_gen, MetaRow
1920

@@ -79,7 +80,7 @@ def _prepare_meta(self):
7980
rules = meta_row.Category.split(':')
8081
for rule in rules:
8182
true_cnt, false_cnt = self.rules_markup_counters.get(rule, (0, 0))
82-
if 'T' == meta_row.GroundTruth:
83+
if LABEL_TRUE == meta_row.GroundTruth:
8384
true_cnt += 1
8485
self.total_true_cnt += 1
8586
type_stat.true_markup += 1
@@ -285,7 +286,7 @@ def check_line_from_meta(self,
285286
approximate = f"{self.meta_next_id},{file_id}" \
286287
f",GitHub,{repo_name},{data_path}" \
287288
f",{line_start},{line_end}" \
288-
f",F,{value_start},{value_end}" \
289+
f",{LABEL_FALSE},{value_start},{value_end}" \
289290
f",,,{rule}"
290291
lost_meta = MetaRow({
291292
"Id": self.meta_next_id,
@@ -319,7 +320,7 @@ def check_line_from_meta(self,
319320
# it means, all markups are the same file with line start-end
320321
if 0 > row.ValueStart and 0 > row.ValueEnd:
321322
# the markup is for whole line - any value_start, value_end match
322-
if 'T' == row.GroundTruth and row.LineStart == row.LineEnd:
323+
if LABEL_TRUE == row.GroundTruth and row.LineStart == row.LineEnd:
323324
# True markup has to be marked at least start value in single line
324325
print(f"WARNING True markup for whole line: {row}", flush=True)
325326
pass
@@ -355,7 +356,7 @@ def check_line_from_meta(self,
355356
code = (data_path, row.LineStart, row.LineEnd, row.ValueStart, row.ValueEnd, rule)
356357
if code in self.line_checker:
357358
self.result_cnt -= 1
358-
if 'T' == row.GroundTruth:
359+
if LABEL_TRUE == row.GroundTruth:
359360
print(f"WARNING: Already checked True! Duplicate? {code}", flush=True)
360361
return LineStatus.CHECKED, repo_name, file_name
361362
else:
@@ -364,12 +365,12 @@ def check_line_from_meta(self,
364365
for meta_rule in row.Category.split(':'):
365366
# increase the counter only for corresponded rule mentioned in markup
366367
if meta_rule == rule:
367-
if 'T' == row.GroundTruth:
368+
if LABEL_TRUE == row.GroundTruth:
368369
self._increase_result_dict_cnt(meta_rule, True)
369370
self.true_cnt += 1
370371
return LineStatus.FALSE, repo_name, file_id
371372
else:
372-
# MetaRow class checks the correctness of row.GroundTruth = ['T', 'F']
373+
# MetaRow class checks the correctness of row.GroundTruth
373374
self._increase_result_dict_cnt(meta_rule, False)
374375
self.false_cnt += 1
375376
return LineStatus.TRUE, repo_name, file_id
@@ -484,7 +485,7 @@ def _get_total_true_cnt(self, rule: str) -> int:
484485
total_true_cnt = 0
485486
for rows in self.meta.values():
486487
for row in rows:
487-
if row and 'T' == row.GroundTruth and rule in row.Category.split(':'):
488+
if row and LABEL_TRUE == row.GroundTruth and rule in row.Category.split(':'):
488489
total_true_cnt += 1
489490
return total_true_cnt
490491

constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
LABEL_TRUE = 'T'
2+
LABEL_FALSE = 'F'
3+
LABEL_OTHER = 'X'
4+
ALLOWED_LABELS = (LABEL_TRUE, LABEL_FALSE, LABEL_OTHER)
5+
PRIVATE_KEY_CATEGORY = "PEM Private Key"

meta/48fd3902.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,ValueStart,Valu
2121
86317,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,68,68,T,45,1105,,,Azure Access Token:Token:JSON Web Token
2222
86318,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,70,70,T,43,1123,,,Azure Access Token:Token:JSON Web Token
2323
86319,87e253cc,GitHub,48fd3902,data/48fd3902/test/src/87e253cc.java,81,131,T,11,38,,,PEM Private Key
24-
86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,F,13,46,,,PEM Private Key
25-
86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,F,13,46,,,PEM Private Key
24+
86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,T,13,46,,fake-for-test-but-valid,PEM Private Key
25+
86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,T,13,46,,fake-for-test-but-valid,PEM Private Key
2626
86419,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,496,522,T,13,46,,,PEM Private Key
2727
86493,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,403,429,T,13,46,,,PEM Private Key
2828
86494,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,206,232,T,13,46,,,PEM Private Key
2929
86519,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,236,263,T,13,42,,,PEM Private Key
3030
86546,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,266,270,T,49,41,,,PEM Private Key
31-
86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,F,52,42,,,PEM Private Key
32-
86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,F,58,38,,,PEM Private Key
31+
86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,T,52,42,,fake-for-test-but-valid,PEM Private Key
32+
86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,T,58,38,,fake-for-test-but-valid,PEM Private Key
3333
86628,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,458,508,T,58,42,,,PEM Private Key
3434
86678,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,53,53,F,,,,,Password
3535
86679,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,62,62,F,,,,,Password

meta_row.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from pathlib import Path
55
from typing import Union, List, Generator
66

7+
from constants import ALLOWED_LABELS, LABEL_TRUE, PRIVATE_KEY_CATEGORY
8+
79

810
# dataclass is required for csv writer
911
@dataclasses.dataclass
@@ -26,7 +28,7 @@ class MetaRow:
2628

2729
def __init__(self, row: dict):
2830
if not isinstance(row, dict) or self.__annotations__.keys() != row.keys():
29-
raise RuntimeError(f"ERROR: wrong row {row}")
31+
raise ValueError(f"ERROR: wrong row {row}")
3032
for key, typ in self.__annotations__.items():
3133
if key.startswith("__"):
3234
continue
@@ -45,27 +47,26 @@ def __init__(self, row: dict):
4547
elif typ is str and isinstance(row_val, str):
4648
val = row_val
4749
else:
48-
raise RuntimeError(f"ERROR: Unsupported {typ}")
50+
raise ValueError(f"ERROR: Unsupported {typ}")
4951
self.__setattr__(key, val)
5052
if not hasattr(self, "Category") or not self.Category:
51-
raise RuntimeError(f"ERROR: Category must be set {row}")
53+
raise ValueError(f"ERROR: Category must be set {row}")
5254
if ':' in self.Category:
5355
rules = self.Category.split(':')
5456
rule_set=set(rules)
5557
if len(rules) != len(rule_set):
56-
raise RuntimeError(f"ERROR: Each rule must be once in Category {row}")
58+
raise ValueError(f"ERROR: Each rule must be once in Category {row}")
5759
if "Other" in rule_set:
58-
raise RuntimeError(f"ERROR: 'Other' Category must be single rule in markup {row}")
59-
allowed_labels = ['T', 'F', 'X']
60-
if self.GroundTruth not in allowed_labels:
61-
raise RuntimeError(f"ERROR: GroundTruth must be in {allowed_labels} {row}")
60+
raise ValueError(f"ERROR: 'Other' Category must be single rule in markup {row}")
61+
if self.GroundTruth not in ALLOWED_LABELS:
62+
raise ValueError(f"ERROR: GroundTruth must be in {ALLOWED_LABELS} {row}")
6263
if 0 > self.LineStart or 0 > self.LineEnd:
63-
raise RuntimeError(f"ERROR: LineStart and LineEnd must be positive {row}")
64+
raise ValueError(f"ERROR: LineStart and LineEnd must be positive {row}")
6465
elif self.LineStart > self.LineEnd:
65-
raise RuntimeError(f"ERROR: LineStart must be lower than LineEnd {row}")
66+
raise ValueError(f"ERROR: LineStart must be lower than LineEnd {row}")
6667
elif self.LineStart == self.LineEnd and 0 <= self.ValueStart and 0 <= self.ValueEnd < self.ValueStart:
6768
# multiline value positions are independent
68-
raise RuntimeError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}")
69+
raise ValueError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}")
6970

7071
def __str__(self) -> str:
7172
dict_values = self.__dict__.values()
@@ -85,7 +86,7 @@ def _meta_from_file(meta_path: Path) -> Generator[dict, None, None]:
8586
reader = csv.DictReader(f)
8687
for row in reader:
8788
if not isinstance(row, dict):
88-
raise RuntimeError(f"ERROR: wrong row '{row}' in {meta_path}")
89+
raise ValueError(f"ERROR: wrong row '{row}' in {meta_path}")
8990
yield row
9091

9192

@@ -100,17 +101,17 @@ def _meta_from_dir(meta_path: Path) -> Generator[dict, None, None]:
100101

101102
def _get_source_gen(meta_path: Union[Path]) -> Generator[dict, None, None]:
102103
if not isinstance(meta_path, Path):
103-
raise RuntimeError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}")
104+
raise ValueError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}")
104105

105106
if not meta_path.exists():
106-
raise RuntimeError(f"ERROR: {meta_path} does not exist")
107+
raise ValueError(f"ERROR: {meta_path} does not exist")
107108

108109
if meta_path.is_dir():
109110
source_gen = _meta_from_dir
110111
elif meta_path.is_file():
111112
source_gen = _meta_from_file
112113
else:
113-
raise RuntimeError(f"ERROR: unsupported {meta_path} file type")
114+
raise ValueError(f"ERROR: unsupported {meta_path} file type")
114115
yield from source_gen(meta_path)
115116

116117

@@ -122,7 +123,7 @@ def read_meta(meta_dir: Union[str, Path]) -> List[MetaRow]:
122123
for row in _get_source_gen(Path(meta_dir)):
123124
meta_row = MetaRow(row)
124125
if meta_row.Id in meta_ids:
125-
raise RuntimeError(f"ERROR: duplicate Id row {row}")
126+
raise ValueError(f"ERROR: duplicate Id row {row}")
126127
meta_ids.add(meta_row.Id)
127128

128129
meta.append(meta_row)

obfuscate_creds.py

Lines changed: 25 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
import string
77
import sys
88
from argparse import Namespace, ArgumentParser
9-
from multiprocessing.managers import Value
109
from typing import List
1110

11+
from constants import PRIVATE_KEY_CATEGORY, LABEL_TRUE
1212
from meta_row import read_meta, MetaRow
1313

1414
logging.basicConfig(
@@ -409,32 +409,6 @@ def gen_random_value(value):
409409
return obfuscated_value
410410

411411

412-
def replace_rows(data: List[MetaRow], lines: List[str], noise: int):
413-
# Change data in already copied files
414-
for row in data:
415-
# PEM keys and other multiple-line credentials is processed in other function
416-
if "" != row.CryptographyKey or row.LineEnd != row.LineStart:
417-
continue
418-
419-
if 'T' != row.GroundTruth:
420-
# false cases do not require an obfuscation
421-
continue
422-
423-
if not (0 <= row.ValueStart and 0 <= row.ValueEnd):
424-
continue
425-
426-
if row.Category in ["AWS Multi", "Google Multi"]:
427-
# skip obfuscation for the categories which are multi pattern
428-
continue
429-
430-
old_line = lines[row.LineStart - 1]
431-
value = old_line[row.ValueStart:row.ValueEnd]
432-
# CredSweeper may scan huge lines since v1.6
433-
random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise)
434-
obfuscated_value = get_obfuscated_value(value, row)
435-
new_line = old_line[:row.ValueStart] + obfuscated_value + old_line[row.ValueEnd:]
436-
437-
lines[row.LineStart - 1] = new_line
438412

439413

440414
def split_in_bounds(i: int, lines_len: int, old_line: str):
@@ -524,56 +498,21 @@ def create_new_key(lines: List[str]):
524498
return new_lines
525499

526500

527-
def create_new_multiline(lines: List[str], starting_position: int):
528-
# Create new lines with similar formatting as old one
529-
new_lines = []
530-
531-
first_line = lines[0]
532-
533-
new_lines.append(first_line[:starting_position] + obfuscate_segment(first_line[starting_position:]))
534-
535-
# Do not replace ssh-rsa substring if present
536-
if "ssh-rsa" in first_line:
537-
s = first_line.find("ssh-rsa")
538-
new_lines[0] = new_lines[0][:s] + "ssh-rsa" + new_lines[0][s + 7:]
539-
540-
for i, old_l in enumerate(lines[1:]):
541-
new_line = obfuscate_segment(old_l)
542-
new_lines.append(new_line)
543-
544-
return new_lines
545-
546-
547501
def process_pem_key(row: MetaRow, lines: List[str], noise: int):
548502
# Change data in already copied files (only keys)
549-
try:
550-
# Skip credentials that are not PEM or multiline
551-
if row.CryptographyKey == "" and row.LineStart == row.LineEnd:
552-
return
553-
554-
if row.Category in ["AWS Multi", "Google Multi"]:
555-
# skip double obfuscation for the categories
556-
return
557-
558-
random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise)
559-
560-
if '' != row.CryptographyKey:
561-
new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd])
562-
else:
563-
new_lines = create_new_multiline(lines[row.LineStart - 1:row.LineEnd], row.ValueStart)
564-
565-
lines[row.LineStart - 1:row.LineEnd] = new_lines
566-
567-
except Exception as exc:
568-
logger.error(f"FAILURE: {row}")
569-
logger.critical(exc)
570-
raise
503+
random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise)
504+
new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd])
505+
lines[row.LineStart - 1:row.LineEnd] = new_lines
571506

572507

573-
def process_pem_keys(data: List[MetaRow], lines: List[str], noise: int):
574-
for row in data:
575-
if 'T' == row.GroundTruth and "Private Key" == row.Category:
576-
process_pem_key(row, lines, noise)
508+
def process_single_value(row: MetaRow, lines: List[str], noise: int):
509+
random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise)
510+
old_line = lines[row.LineStart - 1]
511+
value = old_line[row.ValueStart:row.ValueEnd]
512+
# CredSweeper may scan huge lines since v1.6
513+
obfuscated_value = get_obfuscated_value(value, row)
514+
new_line = old_line[:row.ValueStart] + obfuscated_value + old_line[row.ValueEnd:]
515+
lines[row.LineStart - 1] = new_line
577516

578517

579518
def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0):
@@ -594,9 +533,19 @@ def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0):
594533
logger.critical(exc)
595534
raise
596535
meta_rows.sort(key=lambda x: (x.LineStart, x.LineEnd, x.ValueStart, x.ValueEnd))
597-
replace_rows(meta_rows, lines, noise)
598-
process_pem_keys(meta_rows, lines, noise)
599-
536+
for row in meta_rows:
537+
if LABEL_TRUE != row.GroundTruth:
538+
# obfuscation is only for True cases
539+
continue
540+
elif row.Category in ["AWS Multi", "Google Multi"]:
541+
# skip obfuscation for the categories which are multi pattern
542+
continue
543+
elif PRIVATE_KEY_CATEGORY == row.Category and row.LineStart < row.LineEnd:
544+
# multiline PEM keys obfuscation
545+
process_pem_key(row, lines, noise)
546+
elif row.LineStart == row.LineEnd and 0 <= row.ValueStart < row.ValueEnd:
547+
# single value obfuscation
548+
process_single_value(row, lines, noise)
600549
with open(dataset_file, "w", encoding="utf8") as f:
601550
f.write('\n'.join(lines))
602551

0 commit comments

Comments (0)