diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt index 258082a60..56f5097cc 100644 --- a/.ci/benchmark.txt +++ b/.ci/benchmark.txt @@ -1,5 +1,5 @@ -META MD5 0f056a20273ef7291f1c4fb70600972e -DATA MD5 2eed0cbe31bb629ffc744b916d064882 +META MD5 8fac9e2b7c95044650e74fff448ddf83 +DATA MD5 d4c0a42111715e6f26fa918cb41b290a DATA: 16995334 interested lines. MARKUP: 63711 items FileType FileNumber ValidLines Positives Negatives --------------- ------------ ------------ ----------- ----------- @@ -94,7 +94,7 @@ FileType FileNumber ValidLines Positives Negatives .jwt 1 1 2 .key 115 3067 105 11 .ks 1 25 1 -.kt 120 19864 65 381 +.kt 120 19864 69 377 .l 1 982 1 .las 1 6656 36 .lasso 1 230 7 @@ -225,7 +225,7 @@ FileType FileNumber ValidLines Positives Negatives .yml 560 56585 1896 1387 .zsh 6 872 11 .zsh-theme 1 97 1 -TOTAL: 11361 16995334 17155 53614 +TOTAL: 11361 16995334 17159 53610 credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0 Rules Positives Negatives Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ---- @@ -268,7 +268,7 @@ NTLM Token 4 0 0 0 Nonce 131 109 0 0 109 131 0.000000 1.000000 0.454167 0.000000 OTP / 2FA Secret 64 3 0 0 3 64 0.000000 1.000000 0.044776 0.000000 Other 0 20 0 0 20 0 0.000000 1.000000 -PEM Private Key 1150 76 0 0 76 1150 0.000000 1.000000 0.061990 0.000000 +PEM Private Key 1154 72 0 0 72 1154 0.000000 1.000000 0.058728 0.000000 Password 2597 11365 0 0 11365 2597 0.000000 1.000000 0.813995 0.000000 Perplexity API Key 2 0 0 0 0 2 1.000000 0.000000 0.000000 Postman Credentials 2 0 0 0 0 2 1.000000 0.000000 0.000000 @@ -283,4 +283,4 @@ Token 1140 5268 0 0 Twilio Credentials 30 39 0 0 39 30 0.000000 1.000000 0.565217 0.000000 URL Credentials 225 401 0 0 401 225 0.000000 1.000000 0.640575 0.000000 UUID 2517 3716 0 0 3716 2517 0.000000 1.000000 0.596182 0.000000 - 17155 53614 0 0 0 53614 17155 0.000000 1.000000 0.757592 0.000000 + 17159 53610 0 0 0 53610 17159 0.000000 1.000000 0.757535 0.000000 diff --git a/benchmark/scanner/scanner.py b/benchmark/scanner/scanner.py index f719e735a..8bff6235b 100644 --- a/benchmark/scanner/scanner.py +++ b/benchmark/scanner/scanner.py @@ -14,6 +14,7 @@ from benchmark.common import GitService, LineStatus, Result, ScannerType from benchmark.scanner.file_type_stat import FileTypeStat from benchmark.scanner.true_false_counter import TrueFalseCounter +from constants import LABEL_FALSE, LABEL_TRUE from meta_key import MetaKey from meta_row import _get_source_gen, MetaRow @@ -79,7 +80,7 @@ def _prepare_meta(self): rules = meta_row.Category.split(':') for rule in rules: true_cnt, false_cnt = self.rules_markup_counters.get(rule, (0, 0)) - if 'T' == meta_row.GroundTruth: + if LABEL_TRUE == meta_row.GroundTruth: true_cnt += 1 self.total_true_cnt += 1 type_stat.true_markup += 1 @@ -285,7 +286,7 @@ def check_line_from_meta(self, approximate = f"{self.meta_next_id},{file_id}" \ f",GitHub,{repo_name},{data_path}" \ f",{line_start},{line_end}" \ - f",F,{value_start},{value_end}" \ + f",{LABEL_FALSE},{value_start},{value_end}" \ f",,,{rule}" lost_meta = MetaRow({ "Id": self.meta_next_id, @@ -319,7 +320,7 @@ def check_line_from_meta(self, # it means, all markups are the same file with line start-end if 0 > row.ValueStart and 0 > row.ValueEnd: # the markup is for whole line - any value_start, value_end match - if 'T' == row.GroundTruth and row.LineStart == row.LineEnd: + if LABEL_TRUE == row.GroundTruth and row.LineStart == row.LineEnd: # True markup has to be marked at least start value in single line print(f"WARNING True markup for whole line: {row}", flush=True) pass @@ -355,7 +356,7 @@ def check_line_from_meta(self, code = (data_path, row.LineStart, row.LineEnd, row.ValueStart, row.ValueEnd, rule) if code in self.line_checker: self.result_cnt -= 1 - if 'T' == row.GroundTruth: + if LABEL_TRUE == row.GroundTruth: print(f"WARNING: Already checked True! Duplicate? {code}", flush=True) return LineStatus.CHECKED, repo_name, file_name else: @@ -364,12 +365,12 @@ def check_line_from_meta(self, for meta_rule in row.Category.split(':'): # increase the counter only for corresponded rule mentioned in markup if meta_rule == rule: - if 'T' == row.GroundTruth: + if LABEL_TRUE == row.GroundTruth: self._increase_result_dict_cnt(meta_rule, True) self.true_cnt += 1 return LineStatus.FALSE, repo_name, file_id else: - # MetaRow class checks the correctness of row.GroundTruth = ['T', 'F'] + # MetaRow class checks the correctness of row.GroundTruth self._increase_result_dict_cnt(meta_rule, False) self.false_cnt += 1 return LineStatus.TRUE, repo_name, file_id @@ -484,7 +485,7 @@ def _get_total_true_cnt(self, rule: str) -> int: total_true_cnt = 0 for rows in self.meta.values(): for row in rows: - if row and 'T' == row.GroundTruth and rule in row.Category.split(':'): + if row and LABEL_TRUE == row.GroundTruth and rule in row.Category.split(':'): total_true_cnt += 1 return total_true_cnt diff --git a/constants.py b/constants.py new file mode 100644 index 000000000..6b0b5c694 --- /dev/null +++ b/constants.py @@ -0,0 +1,5 @@ +LABEL_TRUE = 'T' +LABEL_FALSE = 'F' +LABEL_OTHER = 'X' +ALLOWED_LABELS = (LABEL_TRUE, LABEL_FALSE, LABEL_OTHER) +PRIVATE_KEY_CATEGORY = "PEM Private Key" diff --git a/meta/48fd3902.csv b/meta/48fd3902.csv index 8a3dd7edb..bceef75ef 100644 --- a/meta/48fd3902.csv +++ b/meta/48fd3902.csv @@ -21,15 +21,15 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,ValueStart,Valu 86317,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,68,68,T,45,1105,,,Azure Access Token:Token:JSON Web Token 86318,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,70,70,T,43,1123,,,Azure Access Token:Token:JSON Web Token 86319,87e253cc,GitHub,48fd3902,data/48fd3902/test/src/87e253cc.java,81,131,T,11,38,,,PEM Private Key -86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,F,13,46,,,PEM Private Key -86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,F,13,46,,,PEM Private Key +86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,T,13,46,,fake-for-test-but-valid,PEM Private Key +86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,T,13,46,,fake-for-test-but-valid,PEM Private Key 86419,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,496,522,T,13,46,,,PEM Private Key 86493,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,403,429,T,13,46,,,PEM Private Key 86494,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,206,232,T,13,46,,,PEM Private Key 86519,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,236,263,T,13,42,,,PEM Private Key 86546,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,266,270,T,49,41,,,PEM Private Key -86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,F,52,42,,,PEM Private Key -86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,F,58,38,,,PEM Private Key +86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,T,52,42,,fake-for-test-but-valid,PEM Private Key +86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,T,58,38,,fake-for-test-but-valid,PEM Private Key 86628,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,458,508,T,58,42,,,PEM Private Key 86678,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,53,53,F,,,,,Password 86679,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,62,62,F,,,,,Password diff --git a/meta_row.py b/meta_row.py index 058bbdf4b..872f9cd72 100644 --- a/meta_row.py +++ b/meta_row.py @@ -4,6 +4,8 @@ from pathlib import Path from typing import Union, List, Generator +from constants import ALLOWED_LABELS, LABEL_TRUE, PRIVATE_KEY_CATEGORY + # dataclass is required for csv writer @dataclasses.dataclass @@ -26,7 +28,7 @@ class MetaRow: def __init__(self, row: dict): if not isinstance(row, dict) or self.__annotations__.keys() != row.keys(): - raise RuntimeError(f"ERROR: wrong row {row}") + raise ValueError(f"ERROR: wrong row {row}") for key, typ in self.__annotations__.items(): if key.startswith("__"): continue @@ -45,27 +47,26 @@ def __init__(self, row: dict): elif typ is str and isinstance(row_val, str): val = row_val else: - raise RuntimeError(f"ERROR: Unsupported {typ}") + raise ValueError(f"ERROR: Unsupported {typ}") self.__setattr__(key, val) if not hasattr(self, "Category") or not self.Category: - raise RuntimeError(f"ERROR: Category must be set {row}") + raise ValueError(f"ERROR: Category must be set {row}") if ':' in self.Category: rules = self.Category.split(':') rule_set=set(rules) if len(rules) != len(rule_set): - raise RuntimeError(f"ERROR: Each rule must be once in Category {row}") + raise ValueError(f"ERROR: Each rule must be once in Category {row}") if "Other" in rule_set: - raise RuntimeError(f"ERROR: 'Other' Category must be single rule in markup {row}") - allowed_labels = ['T', 'F', 'X'] - if self.GroundTruth not in allowed_labels: - raise RuntimeError(f"ERROR: GroundTruth must be in {allowed_labels} {row}") + raise ValueError(f"ERROR: 'Other' Category must be single rule in markup {row}") + if self.GroundTruth not in ALLOWED_LABELS: + raise ValueError(f"ERROR: GroundTruth must be in {ALLOWED_LABELS} {row}") if 0 > self.LineStart or 0 > self.LineEnd: - raise RuntimeError(f"ERROR: LineStart and LineEnd must be positive {row}") + raise ValueError(f"ERROR: LineStart and LineEnd must be positive {row}") elif self.LineStart > self.LineEnd: - raise RuntimeError(f"ERROR: LineStart must be lower than LineEnd {row}") + raise ValueError(f"ERROR: LineStart must be lower than LineEnd {row}") elif self.LineStart == self.LineEnd and 0 <= self.ValueStart and 0 <= self.ValueEnd < self.ValueStart: # multiline value positions are independent - raise RuntimeError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}") + raise ValueError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}") def __str__(self) -> str: dict_values = self.__dict__.values() @@ -85,7 +86,7 @@ def _meta_from_file(meta_path: Path) -> Generator[dict, None, None]: reader = csv.DictReader(f) for row in reader: if not isinstance(row, dict): - raise RuntimeError(f"ERROR: wrong row '{row}' in {meta_path}") + raise ValueError(f"ERROR: wrong row '{row}' in {meta_path}") yield row @@ -100,17 +101,17 @@ def _meta_from_dir(meta_path: Path) -> Generator[dict, None, None]: def _get_source_gen(meta_path: Union[Path]) -> Generator[dict, None, None]: if not isinstance(meta_path, Path): - raise RuntimeError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}") + raise ValueError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}") if not meta_path.exists(): - raise RuntimeError(f"ERROR: {meta_path} does not exist") + raise ValueError(f"ERROR: {meta_path} does not exist") if meta_path.is_dir(): source_gen = _meta_from_dir elif meta_path.is_file(): source_gen = _meta_from_file else: - raise RuntimeError(f"ERROR: unsupported {meta_path} file type") + raise ValueError(f"ERROR: unsupported {meta_path} file type") yield from source_gen(meta_path) @@ -122,7 +123,7 @@ def read_meta(meta_dir: Union[str, Path]) -> List[MetaRow]: for row in _get_source_gen(Path(meta_dir)): meta_row = MetaRow(row) if meta_row.Id in meta_ids: - raise RuntimeError(f"ERROR: duplicate Id row {row}") + raise ValueError(f"ERROR: duplicate Id row {row}") meta_ids.add(meta_row.Id) meta.append(meta_row) diff --git a/obfuscate_creds.py b/obfuscate_creds.py index b072a41cd..13cf69d28 100644 --- a/obfuscate_creds.py +++ b/obfuscate_creds.py @@ -6,9 +6,9 @@ import string import sys from argparse import Namespace, ArgumentParser -from multiprocessing.managers import Value from typing import List +from constants import PRIVATE_KEY_CATEGORY, LABEL_TRUE from meta_row import read_meta, MetaRow logging.basicConfig( @@ -409,32 +409,6 @@ def gen_random_value(value): return obfuscated_value -def replace_rows(data: List[MetaRow], lines: List[str], noise: int): - # Change data in already copied files - for row in data: - # PEM keys and other multiple-line credentials is processed in other function - if "" != row.CryptographyKey or row.LineEnd != row.LineStart: - continue - - if 'T' != row.GroundTruth: - # false cases do not require an obfuscation - continue - - if not (0 <= row.ValueStart and 0 <= row.ValueEnd): - continue - - if row.Category in ["AWS Multi", "Google Multi"]: - # skip obfuscation for the categories which are multi pattern - continue - - old_line = lines[row.LineStart - 1] - value = old_line[row.ValueStart:row.ValueEnd] - # CredSweeper may scan huge lines since v1.6 - random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise) - obfuscated_value = get_obfuscated_value(value, row) - new_line = old_line[:row.ValueStart] + obfuscated_value + old_line[row.ValueEnd:] - - lines[row.LineStart - 1] = new_line def split_in_bounds(i: int, lines_len: int, old_line: str): @@ -524,56 +498,21 @@ def create_new_key(lines: List[str]): return new_lines -def create_new_multiline(lines: List[str], starting_position: int): - # Create new lines with similar formatting as old one - new_lines = [] - - first_line = lines[0] - - new_lines.append(first_line[:starting_position] + obfuscate_segment(first_line[starting_position:])) - - # Do not replace ssh-rsa substring if present - if "ssh-rsa" in first_line: - s = first_line.find("ssh-rsa") - new_lines[0] = new_lines[0][:s] + "ssh-rsa" + new_lines[0][s + 7:] - - for i, old_l in enumerate(lines[1:]): - new_line = obfuscate_segment(old_l) - new_lines.append(new_line) - - return new_lines - - def process_pem_key(row: MetaRow, lines: List[str], noise: int): # Change data in already copied files (only keys) - try: - # Skip credentials that are not PEM or multiline - if row.CryptographyKey == "" and row.LineStart == row.LineEnd: - return - - if row.Category in ["AWS Multi", "Google Multi"]: - # skip double obfuscation for the categories - return - - random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise) - - if '' != row.CryptographyKey: - new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd]) - else: - new_lines = create_new_multiline(lines[row.LineStart - 1:row.LineEnd], row.ValueStart) - - lines[row.LineStart - 1:row.LineEnd] = new_lines - - except Exception as exc: - logger.error(f"FAILURE: {row}") - logger.critical(exc) - raise + random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise) + new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd]) + lines[row.LineStart - 1:row.LineEnd] = new_lines -def process_pem_keys(data: List[MetaRow], lines: List[str], noise: int): - for row in data: - if 'T' == row.GroundTruth and "Private Key" == row.Category: - process_pem_key(row, lines, noise) +def process_single_value(row: MetaRow, lines: List[str], noise: int): + random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise) + old_line = lines[row.LineStart - 1] + value = old_line[row.ValueStart:row.ValueEnd] + # CredSweeper may scan huge lines since v1.6 + obfuscated_value = get_obfuscated_value(value, row) + new_line = old_line[:row.ValueStart] + obfuscated_value + old_line[row.ValueEnd:] + lines[row.LineStart - 1] = new_line def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0): @@ -594,9 +533,19 @@ def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0): logger.critical(exc) raise meta_rows.sort(key=lambda x: (x.LineStart, x.LineEnd, x.ValueStart, x.ValueEnd)) - replace_rows(meta_rows, lines, noise) - process_pem_keys(meta_rows, lines, noise) - + for row in meta_rows: + if LABEL_TRUE != row.GroundTruth: + # obfuscation is only for True cases + continue + elif row.Category in ["AWS Multi", "Google Multi"]: + # skip obfuscation for the categories which are multi pattern + continue + elif PRIVATE_KEY_CATEGORY == row.Category and row.LineStart < row.LineEnd: + # multiline PEM keys obfuscation + process_pem_key(row, lines, noise) + elif row.LineStart == row.LineEnd and 0 <= row.ValueStart < row.ValueEnd: + # single value obfuscation + process_single_value(row, lines, noise) with open(dataset_file, "w", encoding="utf8") as f: f.write('\n'.join(lines)) diff --git a/review_data.py b/review_data.py index 576856760..be4b5dff7 100644 --- a/review_data.py +++ b/review_data.py @@ -18,6 +18,7 @@ from colorama import Fore, Back, Style +from constants import LABEL_OTHER, LABEL_FALSE, LABEL_TRUE from meta_cred import MetaCred from meta_row import read_meta, MetaRow @@ -50,11 +51,11 @@ def read_data(path, line_start, line_end, value_start, value_end, ground_truth, else: raise RuntimeError(f"Line start must be less than end. {path},{line_start},{line_end}") - if 'T' == ground_truth: + if LABEL_TRUE == ground_truth: fore_style = Fore.GREEN - elif 'F' == ground_truth: + elif LABEL_FALSE == ground_truth: fore_style = Fore.RED - elif 'X' == ground_truth: + elif LABEL_OTHER == ground_truth: fore_style = Fore.MAGENTA else: raise RuntimeError(f"Unknown type {ground_truth}") @@ -173,7 +174,7 @@ def review(meta_dir: str, except Exception as exc: print(f"Failure {row}", exc, flush=True) errors += 1 - if 'T' == row.GroundTruth and row.LineStart == row.LineEnd: + if LABEL_TRUE == row.GroundTruth and row.LineStart == row.LineEnd: if 0 > row.ValueStart: print(f"Missed ValueStart for TRUE markup!\n{row}", flush=True) errors += 1 diff --git a/test_obfuscate_creds.py b/test_obfuscate_creds.py index 24c73652e..1a62ff7a5 100644 --- a/test_obfuscate_creds.py +++ b/test_obfuscate_creds.py @@ -2,7 +2,9 @@ import random import unittest -from obfuscate_creds import gen_random_value, obfuscate_jwt, obfuscate_glsa +from constants import PRIVATE_KEY_CATEGORY, LABEL_TRUE +from meta_row import MetaRow +from obfuscate_creds import gen_random_value, obfuscate_jwt, obfuscate_glsa, process_pem_key class ObfuscatorTest(unittest.TestCase): @@ -159,3 +161,40 @@ def test_obfuscate_glsa(self): self.assertEqual(len(value), len(obfuscated)) # tested value self.assertEqual("glsa_DaldL9OnCudSrj7jWui7wxVj9b4ltV2p_c97ad013", obfuscated) + + def test_obfuscate_pem(self): + random.seed(20251211) + original_lines = [ + "BOM", + "/* some comment */ -----BEGIN RSA PRIVATE KEY----- any dummy info", + "MIIEpQIBAAKCAQEA5mPfjyiQnuiLJPn63vr4sznghBRxzX/FirstLineFixed+J4", + "MIIEpQIBAAKCAQEA5mPfjyiQnuiLJPn63vr4sznghBRxzX/SeconLineUpdat+J4", + "MIIEpQIBAAKCAQEA5mPfjyiQnuiLJPn63vr4sznghBRxzX/ThirdLineUpdat+J4", + "unhanged====", + "-----END RSA PRIVATE KEY-----", + "EOF", + ] + obfuscated_lines = [ + "BOM", + "/* some comment */ -----BEGIN RSA PRIVATE KEY----- any dummy info", + "MIIEpQIBAAKCAQEA5mPfjyiQnuiLJPn63vr4sznghBRxzX/FirstLineFixed+J4", + 'hVBzwYLllXwvsCqC1vIWiVSUrVpchQV32XB7LPfyjpSLlG/SRIMrpCDoiMWFl+A5', + 'ktePWZqcrQEoRLDs1dSJXpLKJSQmroj63oC4xTJPSITaEd/IoPwOBdRoaTEtW+x3', + "unhanged====", + "-----END RSA PRIVATE KEY-----", + "EOF"] + row = MetaRow({"Id": 1, + "FileID": "01234567", + "Domain": "str", + "RepoName": "98765432", + "FilePath": "str", + "LineStart": 2, + "LineEnd": len(original_lines) + 1, + "GroundTruth": LABEL_TRUE, + "ValueStart": 19, + "ValueEnd": 29, + "CryptographyKey": "", + "PredefinedPattern": "", + "Category": PRIVATE_KEY_CATEGORY}) + process_pem_key(row, original_lines, 0) + self.assertListEqual(obfuscated_lines, original_lines)