Skip to content

Commit 3410d73

Browse files
authored
PEM obfuscation (#289)
1 parent d408ab3 commit 3410d73

File tree

8 files changed

+110
-114
lines changed

8 files changed

+110
-114
lines changed

.ci/benchmark.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
META MD5 0f056a20273ef7291f1c4fb70600972e
2-
DATA MD5 2eed0cbe31bb629ffc744b916d064882
1+
META MD5 8fac9e2b7c95044650e74fff448ddf83
2+
DATA MD5 d4c0a42111715e6f26fa918cb41b290a
33
DATA: 16995334 interested lines. MARKUP: 63711 items
44
FileType FileNumber ValidLines Positives Negatives
55
--------------- ------------ ------------ ----------- -----------
@@ -94,7 +94,7 @@ FileType FileNumber ValidLines Positives Negatives
9494
.jwt 1 1 2
9595
.key 115 3067 105 11
9696
.ks 1 25 1
97-
.kt 120 19864 65 381
97+
.kt 120 19864 69 377
9898
.l 1 982 1
9999
.las 1 6656 36
100100
.lasso 1 230 7
@@ -225,7 +225,7 @@ FileType FileNumber ValidLines Positives Negatives
225225
.yml 560 56585 1896 1387
226226
.zsh 6 872 11
227227
.zsh-theme 1 97 1
228-
TOTAL: 11361 16995334 17155 53614
228+
TOTAL: 11361 16995334 17159 53610
229229
credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0
230230
Rules Positives Negatives Reported TP FP TN FN FPR FNR ACC PRC RCL F1
231231
------------------------------ ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ----
@@ -268,7 +268,7 @@ NTLM Token 4 0 0 0
268268
Nonce 131 109 0 0 109 131 0.000000 1.000000 0.454167 0.000000
269269
OTP / 2FA Secret 64 3 0 0 3 64 0.000000 1.000000 0.044776 0.000000
270270
Other 0 20 0 0 20 0 0.000000 1.000000
271-
PEM Private Key 1150 76 0 0 76 1150 0.000000 1.000000 0.061990 0.000000
271+
PEM Private Key 1154 72 0 0 72 1154 0.000000 1.000000 0.058728 0.000000
272272
Password 2597 11365 0 0 11365 2597 0.000000 1.000000 0.813995 0.000000
273273
Perplexity API Key 2 0 0 0 0 2 1.000000 0.000000 0.000000
274274
Postman Credentials 2 0 0 0 0 2 1.000000 0.000000 0.000000
@@ -283,4 +283,4 @@ Token 1140 5268 0 0
283283
Twilio Credentials 30 39 0 0 39 30 0.000000 1.000000 0.565217 0.000000
284284
URL Credentials 225 401 0 0 401 225 0.000000 1.000000 0.640575 0.000000
285285
UUID 2517 3716 0 0 3716 2517 0.000000 1.000000 0.596182 0.000000
286-
17155 53614 0 0 0 53614 17155 0.000000 1.000000 0.757592 0.000000
286+
17159 53610 0 0 0 53610 17159 0.000000 1.000000 0.757535 0.000000

benchmark/scanner/scanner.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from benchmark.common import GitService, LineStatus, Result, ScannerType
1515
from benchmark.scanner.file_type_stat import FileTypeStat
1616
from benchmark.scanner.true_false_counter import TrueFalseCounter
17+
from constants import LABEL_FALSE, LABEL_TRUE
1718
from meta_key import MetaKey
1819
from meta_row import _get_source_gen, MetaRow
1920

@@ -79,7 +80,7 @@ def _prepare_meta(self):
7980
rules = meta_row.Category.split(':')
8081
for rule in rules:
8182
true_cnt, false_cnt = self.rules_markup_counters.get(rule, (0, 0))
82-
if 'T' == meta_row.GroundTruth:
83+
if LABEL_TRUE == meta_row.GroundTruth:
8384
true_cnt += 1
8485
self.total_true_cnt += 1
8586
type_stat.true_markup += 1
@@ -285,7 +286,7 @@ def check_line_from_meta(self,
285286
approximate = f"{self.meta_next_id},{file_id}" \
286287
f",GitHub,{repo_name},{data_path}" \
287288
f",{line_start},{line_end}" \
288-
f",F,{value_start},{value_end}" \
289+
f",{LABEL_FALSE},{value_start},{value_end}" \
289290
f",,,{rule}"
290291
lost_meta = MetaRow({
291292
"Id": self.meta_next_id,
@@ -319,7 +320,7 @@ def check_line_from_meta(self,
319320
# it means, all markups are the same file with line start-end
320321
if 0 > row.ValueStart and 0 > row.ValueEnd:
321322
# the markup is for whole line - any value_start, value_end match
322-
if 'T' == row.GroundTruth and row.LineStart == row.LineEnd:
323+
if LABEL_TRUE == row.GroundTruth and row.LineStart == row.LineEnd:
323324
# True markup has to be marked at least start value in single line
324325
print(f"WARNING True markup for whole line: {row}", flush=True)
325326
pass
@@ -355,7 +356,7 @@ def check_line_from_meta(self,
355356
code = (data_path, row.LineStart, row.LineEnd, row.ValueStart, row.ValueEnd, rule)
356357
if code in self.line_checker:
357358
self.result_cnt -= 1
358-
if 'T' == row.GroundTruth:
359+
if LABEL_TRUE == row.GroundTruth:
359360
print(f"WARNING: Already checked True! Duplicate? {code}", flush=True)
360361
return LineStatus.CHECKED, repo_name, file_name
361362
else:
@@ -364,12 +365,12 @@ def check_line_from_meta(self,
364365
for meta_rule in row.Category.split(':'):
365366
# increase the counter only for corresponded rule mentioned in markup
366367
if meta_rule == rule:
367-
if 'T' == row.GroundTruth:
368+
if LABEL_TRUE == row.GroundTruth:
368369
self._increase_result_dict_cnt(meta_rule, True)
369370
self.true_cnt += 1
370371
return LineStatus.FALSE, repo_name, file_id
371372
else:
372-
# MetaRow class checks the correctness of row.GroundTruth = ['T', 'F']
373+
# MetaRow class checks the correctness of row.GroundTruth
373374
self._increase_result_dict_cnt(meta_rule, False)
374375
self.false_cnt += 1
375376
return LineStatus.TRUE, repo_name, file_id
@@ -484,7 +485,7 @@ def _get_total_true_cnt(self, rule: str) -> int:
484485
total_true_cnt = 0
485486
for rows in self.meta.values():
486487
for row in rows:
487-
if row and 'T' == row.GroundTruth and rule in row.Category.split(':'):
488+
if row and LABEL_TRUE == row.GroundTruth and rule in row.Category.split(':'):
488489
total_true_cnt += 1
489490
return total_true_cnt
490491

constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
LABEL_TRUE = 'T'
2+
LABEL_FALSE = 'F'
3+
LABEL_OTHER = 'X'
4+
ALLOWED_LABELS = (LABEL_TRUE, LABEL_FALSE, LABEL_OTHER)
5+
PRIVATE_KEY_CATEGORY = "PEM Private Key"

meta/48fd3902.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,ValueStart,Valu
2121
86317,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,68,68,T,45,1105,,,Azure Access Token:Token:JSON Web Token
2222
86318,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,70,70,T,43,1123,,,Azure Access Token:Token:JSON Web Token
2323
86319,87e253cc,GitHub,48fd3902,data/48fd3902/test/src/87e253cc.java,81,131,T,11,38,,,PEM Private Key
24-
86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,F,13,46,,,PEM Private Key
25-
86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,F,13,46,,,PEM Private Key
24+
86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,T,13,46,,fake-for-test-but-valid,PEM Private Key
25+
86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,T,13,46,,fake-for-test-but-valid,PEM Private Key
2626
86419,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,496,522,T,13,46,,,PEM Private Key
2727
86493,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,403,429,T,13,46,,,PEM Private Key
2828
86494,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,206,232,T,13,46,,,PEM Private Key
2929
86519,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,236,263,T,13,42,,,PEM Private Key
3030
86546,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,266,270,T,49,41,,,PEM Private Key
31-
86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,F,52,42,,,PEM Private Key
32-
86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,F,58,38,,,PEM Private Key
31+
86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,T,52,42,,fake-for-test-but-valid,PEM Private Key
32+
86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,T,58,38,,fake-for-test-but-valid,PEM Private Key
3333
86628,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,458,508,T,58,42,,,PEM Private Key
3434
86678,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,53,53,F,,,,,Password
3535
86679,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,62,62,F,,,,,Password

meta_row.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from pathlib import Path
55
from typing import Union, List, Generator
66

7+
from constants import ALLOWED_LABELS, LABEL_TRUE, PRIVATE_KEY_CATEGORY
8+
79

810
# dataclass is required for csv writer
911
@dataclasses.dataclass
@@ -26,7 +28,7 @@ class MetaRow:
2628

2729
def __init__(self, row: dict):
2830
if not isinstance(row, dict) or self.__annotations__.keys() != row.keys():
29-
raise RuntimeError(f"ERROR: wrong row {row}")
31+
raise ValueError(f"ERROR: wrong row {row}")
3032
for key, typ in self.__annotations__.items():
3133
if key.startswith("__"):
3234
continue
@@ -45,27 +47,26 @@ def __init__(self, row: dict):
4547
elif typ is str and isinstance(row_val, str):
4648
val = row_val
4749
else:
48-
raise RuntimeError(f"ERROR: Unsupported {typ}")
50+
raise ValueError(f"ERROR: Unsupported {typ}")
4951
self.__setattr__(key, val)
5052
if not hasattr(self, "Category") or not self.Category:
51-
raise RuntimeError(f"ERROR: Category must be set {row}")
53+
raise ValueError(f"ERROR: Category must be set {row}")
5254
if ':' in self.Category:
5355
rules = self.Category.split(':')
5456
rule_set=set(rules)
5557
if len(rules) != len(rule_set):
56-
raise RuntimeError(f"ERROR: Each rule must be once in Category {row}")
58+
raise ValueError(f"ERROR: Each rule must be once in Category {row}")
5759
if "Other" in rule_set:
58-
raise RuntimeError(f"ERROR: 'Other' Category must be single rule in markup {row}")
59-
allowed_labels = ['T', 'F', 'X']
60-
if self.GroundTruth not in allowed_labels:
61-
raise RuntimeError(f"ERROR: GroundTruth must be in {allowed_labels} {row}")
60+
raise ValueError(f"ERROR: 'Other' Category must be single rule in markup {row}")
61+
if self.GroundTruth not in ALLOWED_LABELS:
62+
raise ValueError(f"ERROR: GroundTruth must be in {ALLOWED_LABELS} {row}")
6263
if 0 > self.LineStart or 0 > self.LineEnd:
63-
raise RuntimeError(f"ERROR: LineStart and LineEnd must be positive {row}")
64+
raise ValueError(f"ERROR: LineStart and LineEnd must be positive {row}")
6465
elif self.LineStart > self.LineEnd:
65-
raise RuntimeError(f"ERROR: LineStart must be lower than LineEnd {row}")
66+
raise ValueError(f"ERROR: LineStart must be lower than LineEnd {row}")
6667
elif self.LineStart == self.LineEnd and 0 <= self.ValueStart and 0 <= self.ValueEnd < self.ValueStart:
6768
# multiline value positions are independent
68-
raise RuntimeError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}")
69+
raise ValueError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}")
6970

7071
def __str__(self) -> str:
7172
dict_values = self.__dict__.values()
@@ -85,7 +86,7 @@ def _meta_from_file(meta_path: Path) -> Generator[dict, None, None]:
8586
reader = csv.DictReader(f)
8687
for row in reader:
8788
if not isinstance(row, dict):
88-
raise RuntimeError(f"ERROR: wrong row '{row}' in {meta_path}")
89+
raise ValueError(f"ERROR: wrong row '{row}' in {meta_path}")
8990
yield row
9091

9192

@@ -100,17 +101,17 @@ def _meta_from_dir(meta_path: Path) -> Generator[dict, None, None]:
100101

101102
def _get_source_gen(meta_path: Union[Path]) -> Generator[dict, None, None]:
102103
if not isinstance(meta_path, Path):
103-
raise RuntimeError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}")
104+
raise ValueError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}")
104105

105106
if not meta_path.exists():
106-
raise RuntimeError(f"ERROR: {meta_path} does not exist")
107+
raise ValueError(f"ERROR: {meta_path} does not exist")
107108

108109
if meta_path.is_dir():
109110
source_gen = _meta_from_dir
110111
elif meta_path.is_file():
111112
source_gen = _meta_from_file
112113
else:
113-
raise RuntimeError(f"ERROR: unsupported {meta_path} file type")
114+
raise ValueError(f"ERROR: unsupported {meta_path} file type")
114115
yield from source_gen(meta_path)
115116

116117

@@ -122,7 +123,7 @@ def read_meta(meta_dir: Union[str, Path]) -> List[MetaRow]:
122123
for row in _get_source_gen(Path(meta_dir)):
123124
meta_row = MetaRow(row)
124125
if meta_row.Id in meta_ids:
125-
raise RuntimeError(f"ERROR: duplicate Id row {row}")
126+
raise ValueError(f"ERROR: duplicate Id row {row}")
126127
meta_ids.add(meta_row.Id)
127128

128129
meta.append(meta_row)

obfuscate_creds.py

Lines changed: 25 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
import string
77
import sys
88
from argparse import Namespace, ArgumentParser
9-
from multiprocessing.managers import Value
109
from typing import List
1110

11+
from constants import PRIVATE_KEY_CATEGORY, LABEL_TRUE
1212
from meta_row import read_meta, MetaRow
1313

1414
logging.basicConfig(
@@ -409,32 +409,6 @@ def gen_random_value(value):
409409
return obfuscated_value
410410

411411

412-
def replace_rows(data: List[MetaRow], lines: List[str], noise: int):
413-
# Change data in already copied files
414-
for row in data:
415-
# PEM keys and other multiple-line credentials is processed in other function
416-
if "" != row.CryptographyKey or row.LineEnd != row.LineStart:
417-
continue
418-
419-
if 'T' != row.GroundTruth:
420-
# false cases do not require an obfuscation
421-
continue
422-
423-
if not (0 <= row.ValueStart and 0 <= row.ValueEnd):
424-
continue
425-
426-
if row.Category in ["AWS Multi", "Google Multi"]:
427-
# skip obfuscation for the categories which are multi pattern
428-
continue
429-
430-
old_line = lines[row.LineStart - 1]
431-
value = old_line[row.ValueStart:row.ValueEnd]
432-
# CredSweeper may scan huge lines since v1.6
433-
random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise)
434-
obfuscated_value = get_obfuscated_value(value, row)
435-
new_line = old_line[:row.ValueStart] + obfuscated_value + old_line[row.ValueEnd:]
436-
437-
lines[row.LineStart - 1] = new_line
438412

439413

440414
def split_in_bounds(i: int, lines_len: int, old_line: str):
@@ -524,56 +498,21 @@ def create_new_key(lines: List[str]):
524498
return new_lines
525499

526500

527-
def create_new_multiline(lines: List[str], starting_position: int):
528-
# Create new lines with similar formatting as old one
529-
new_lines = []
530-
531-
first_line = lines[0]
532-
533-
new_lines.append(first_line[:starting_position] + obfuscate_segment(first_line[starting_position:]))
534-
535-
# Do not replace ssh-rsa substring if present
536-
if "ssh-rsa" in first_line:
537-
s = first_line.find("ssh-rsa")
538-
new_lines[0] = new_lines[0][:s] + "ssh-rsa" + new_lines[0][s + 7:]
539-
540-
for i, old_l in enumerate(lines[1:]):
541-
new_line = obfuscate_segment(old_l)
542-
new_lines.append(new_line)
543-
544-
return new_lines
545-
546-
547501
def process_pem_key(row: MetaRow, lines: List[str], noise: int):
548502
# Change data in already copied files (only keys)
549-
try:
550-
# Skip credentials that are not PEM or multiline
551-
if row.CryptographyKey == "" and row.LineStart == row.LineEnd:
552-
return
553-
554-
if row.Category in ["AWS Multi", "Google Multi"]:
555-
# skip double obfuscation for the categories
556-
return
557-
558-
random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise)
559-
560-
if '' != row.CryptographyKey:
561-
new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd])
562-
else:
563-
new_lines = create_new_multiline(lines[row.LineStart - 1:row.LineEnd], row.ValueStart)
564-
565-
lines[row.LineStart - 1:row.LineEnd] = new_lines
566-
567-
except Exception as exc:
568-
logger.error(f"FAILURE: {row}")
569-
logger.critical(exc)
570-
raise
503+
random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise)
504+
new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd])
505+
lines[row.LineStart - 1:row.LineEnd] = new_lines
571506

572507

573-
def process_pem_keys(data: List[MetaRow], lines: List[str], noise: int):
574-
for row in data:
575-
if 'T' == row.GroundTruth and "Private Key" == row.Category:
576-
process_pem_key(row, lines, noise)
508+
def process_single_value(row: MetaRow, lines: List[str], noise: int):
509+
random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise)
510+
old_line = lines[row.LineStart - 1]
511+
value = old_line[row.ValueStart:row.ValueEnd]
512+
# CredSweeper may scan huge lines since v1.6
513+
obfuscated_value = get_obfuscated_value(value, row)
514+
new_line = old_line[:row.ValueStart] + obfuscated_value + old_line[row.ValueEnd:]
515+
lines[row.LineStart - 1] = new_line
577516

578517

579518
def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0):
@@ -594,9 +533,19 @@ def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0):
594533
logger.critical(exc)
595534
raise
596535
meta_rows.sort(key=lambda x: (x.LineStart, x.LineEnd, x.ValueStart, x.ValueEnd))
597-
replace_rows(meta_rows, lines, noise)
598-
process_pem_keys(meta_rows, lines, noise)
599-
536+
for row in meta_rows:
537+
if LABEL_TRUE != row.GroundTruth:
538+
# obfuscation is only for True cases
539+
continue
540+
elif row.Category in ["AWS Multi", "Google Multi"]:
541+
# skip obfuscation for the categories which are multi pattern
542+
continue
543+
elif PRIVATE_KEY_CATEGORY == row.Category and row.LineStart < row.LineEnd:
544+
# multiline PEM keys obfuscation
545+
process_pem_key(row, lines, noise)
546+
elif row.LineStart == row.LineEnd and 0 <= row.ValueStart < row.ValueEnd:
547+
# single value obfuscation
548+
process_single_value(row, lines, noise)
600549
with open(dataset_file, "w", encoding="utf8") as f:
601550
f.write('\n'.join(lines))
602551

0 commit comments

Comments (0)