Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
META MD5 0f056a20273ef7291f1c4fb70600972e
DATA MD5 2eed0cbe31bb629ffc744b916d064882
META MD5 8fac9e2b7c95044650e74fff448ddf83
DATA MD5 d4c0a42111715e6f26fa918cb41b290a
DATA: 16995334 interested lines. MARKUP: 63711 items
FileType FileNumber ValidLines Positives Negatives
--------------- ------------ ------------ ----------- -----------
Expand Down Expand Up @@ -94,7 +94,7 @@ FileType FileNumber ValidLines Positives Negatives
.jwt 1 1 2
.key 115 3067 105 11
.ks 1 25 1
.kt 120 19864 65 381
.kt 120 19864 69 377
.l 1 982 1
.las 1 6656 36
.lasso 1 230 7
Expand Down Expand Up @@ -225,7 +225,7 @@ FileType FileNumber ValidLines Positives Negatives
.yml 560 56585 1896 1387
.zsh 6 872 11
.zsh-theme 1 97 1
TOTAL: 11361 16995334 17155 53614
TOTAL: 11361 16995334 17159 53610
credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0
Rules Positives Negatives Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ----
Expand Down Expand Up @@ -268,7 +268,7 @@ NTLM Token 4 0 0 0
Nonce 131 109 0 0 109 131 0.000000 1.000000 0.454167 0.000000
OTP / 2FA Secret 64 3 0 0 3 64 0.000000 1.000000 0.044776 0.000000
Other 0 20 0 0 20 0 0.000000 1.000000
PEM Private Key 1150 76 0 0 76 1150 0.000000 1.000000 0.061990 0.000000
PEM Private Key 1154 72 0 0 72 1154 0.000000 1.000000 0.058728 0.000000
Password 2597 11365 0 0 11365 2597 0.000000 1.000000 0.813995 0.000000
Perplexity API Key 2 0 0 0 0 2 1.000000 0.000000 0.000000
Postman Credentials 2 0 0 0 0 2 1.000000 0.000000 0.000000
Expand All @@ -283,4 +283,4 @@ Token 1140 5268 0 0
Twilio Credentials 30 39 0 0 39 30 0.000000 1.000000 0.565217 0.000000
URL Credentials 225 401 0 0 401 225 0.000000 1.000000 0.640575 0.000000
UUID 2517 3716 0 0 3716 2517 0.000000 1.000000 0.596182 0.000000
17155 53614 0 0 0 53614 17155 0.000000 1.000000 0.757592 0.000000
17159 53610 0 0 0 53610 17159 0.000000 1.000000 0.757535 0.000000
15 changes: 8 additions & 7 deletions benchmark/scanner/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from benchmark.common import GitService, LineStatus, Result, ScannerType
from benchmark.scanner.file_type_stat import FileTypeStat
from benchmark.scanner.true_false_counter import TrueFalseCounter
from constants import LABEL_FALSE, LABEL_TRUE
from meta_key import MetaKey
from meta_row import _get_source_gen, MetaRow

Expand Down Expand Up @@ -79,7 +80,7 @@ def _prepare_meta(self):
rules = meta_row.Category.split(':')
for rule in rules:
true_cnt, false_cnt = self.rules_markup_counters.get(rule, (0, 0))
if 'T' == meta_row.GroundTruth:
if LABEL_TRUE == meta_row.GroundTruth:
true_cnt += 1
self.total_true_cnt += 1
type_stat.true_markup += 1
Expand Down Expand Up @@ -285,7 +286,7 @@ def check_line_from_meta(self,
approximate = f"{self.meta_next_id},{file_id}" \
f",GitHub,{repo_name},{data_path}" \
f",{line_start},{line_end}" \
f",F,{value_start},{value_end}" \
f",{LABEL_FALSE},{value_start},{value_end}" \
f",,,{rule}"
lost_meta = MetaRow({
"Id": self.meta_next_id,
Expand Down Expand Up @@ -319,7 +320,7 @@ def check_line_from_meta(self,
# it means, all markups are the same file with line start-end
if 0 > row.ValueStart and 0 > row.ValueEnd:
# the markup is for whole line - any value_start, value_end match
if 'T' == row.GroundTruth and row.LineStart == row.LineEnd:
if LABEL_TRUE == row.GroundTruth and row.LineStart == row.LineEnd:
# a True markup on a single line must specify at least the value start position
print(f"WARNING True markup for whole line: {row}", flush=True)
pass
Expand Down Expand Up @@ -355,7 +356,7 @@ def check_line_from_meta(self,
code = (data_path, row.LineStart, row.LineEnd, row.ValueStart, row.ValueEnd, rule)
if code in self.line_checker:
self.result_cnt -= 1
if 'T' == row.GroundTruth:
if LABEL_TRUE == row.GroundTruth:
print(f"WARNING: Already checked True! Duplicate? {code}", flush=True)
return LineStatus.CHECKED, repo_name, file_name
else:
Expand All @@ -364,12 +365,12 @@ def check_line_from_meta(self,
for meta_rule in row.Category.split(':'):
# increase the counter only for corresponded rule mentioned in markup
if meta_rule == rule:
if 'T' == row.GroundTruth:
if LABEL_TRUE == row.GroundTruth:
self._increase_result_dict_cnt(meta_rule, True)
self.true_cnt += 1
return LineStatus.FALSE, repo_name, file_id
else:
# MetaRow class checks the correctness of row.GroundTruth = ['T', 'F']
# MetaRow class checks the correctness of row.GroundTruth
self._increase_result_dict_cnt(meta_rule, False)
self.false_cnt += 1
return LineStatus.TRUE, repo_name, file_id
Expand Down Expand Up @@ -484,7 +485,7 @@ def _get_total_true_cnt(self, rule: str) -> int:
total_true_cnt = 0
for rows in self.meta.values():
for row in rows:
if row and 'T' == row.GroundTruth and rule in row.Category.split(':'):
if row and LABEL_TRUE == row.GroundTruth and rule in row.Category.split(':'):
total_true_cnt += 1
return total_true_cnt

Expand Down
5 changes: 5 additions & 0 deletions constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Labels stored in the GroundTruth column of a markup (meta) row.
LABEL_TRUE = 'T'
LABEL_FALSE = 'F'
LABEL_OTHER = 'X'
# Every label accepted when a meta row is validated.
ALLOWED_LABELS = (LABEL_TRUE, LABEL_FALSE, LABEL_OTHER)
# Category name the markup uses for PEM private keys.
PRIVATE_KEY_CATEGORY = "PEM Private Key"
8 changes: 4 additions & 4 deletions meta/48fd3902.csv
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,ValueStart,Valu
86317,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,68,68,T,45,1105,,,Azure Access Token:Token:JSON Web Token
86318,8605db08,GitHub,48fd3902,data/48fd3902/test/src/util/8605db08.kt,70,70,T,43,1123,,,Azure Access Token:Token:JSON Web Token
86319,87e253cc,GitHub,48fd3902,data/48fd3902/test/src/87e253cc.java,81,131,T,11,38,,,PEM Private Key
86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,F,13,46,,,PEM Private Key
86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,F,13,46,,,PEM Private Key
86369,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,87,137,T,13,46,,fake-for-test-but-valid,PEM Private Key
86370,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,442,492,T,13,46,,fake-for-test-but-valid,PEM Private Key
86419,c709b566,GitHub,48fd3902,data/48fd3902/test/src/util/c709b566.kt,496,522,T,13,46,,,PEM Private Key
86493,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,403,429,T,13,46,,,PEM Private Key
86494,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,206,232,T,13,46,,,PEM Private Key
86519,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,236,263,T,13,42,,,PEM Private Key
86546,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,266,270,T,49,41,,,PEM Private Key
86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,F,52,42,,,PEM Private Key
86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,F,58,38,,,PEM Private Key
86550,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,323,349,T,52,42,,fake-for-test-but-valid,PEM Private Key
86576,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,351,378,T,58,38,,fake-for-test-but-valid,PEM Private Key
86628,e9b76075,GitHub,48fd3902,data/48fd3902/test/src/util/e9b76075.kt,458,508,T,58,42,,,PEM Private Key
86678,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,53,53,F,,,,,Password
86679,238cc5a0,GitHub,48fd3902,data/48fd3902/docs/238cc5a0.md,62,62,F,,,,,Password
Expand Down
33 changes: 17 additions & 16 deletions meta_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pathlib import Path
from typing import Union, List, Generator

from constants import ALLOWED_LABELS, LABEL_TRUE, PRIVATE_KEY_CATEGORY


# dataclass is required for csv writer
@dataclasses.dataclass
Expand All @@ -26,7 +28,7 @@ class MetaRow:

def __init__(self, row: dict):
if not isinstance(row, dict) or self.__annotations__.keys() != row.keys():
raise RuntimeError(f"ERROR: wrong row {row}")
raise ValueError(f"ERROR: wrong row {row}")
for key, typ in self.__annotations__.items():
if key.startswith("__"):
continue
Expand All @@ -45,27 +47,26 @@ def __init__(self, row: dict):
elif typ is str and isinstance(row_val, str):
val = row_val
else:
raise RuntimeError(f"ERROR: Unsupported {typ}")
raise ValueError(f"ERROR: Unsupported {typ}")
self.__setattr__(key, val)
if not hasattr(self, "Category") or not self.Category:
raise RuntimeError(f"ERROR: Category must be set {row}")
raise ValueError(f"ERROR: Category must be set {row}")
if ':' in self.Category:
rules = self.Category.split(':')
rule_set=set(rules)
if len(rules) != len(rule_set):
raise RuntimeError(f"ERROR: Each rule must be once in Category {row}")
raise ValueError(f"ERROR: Each rule must be once in Category {row}")
if "Other" in rule_set:
raise RuntimeError(f"ERROR: 'Other' Category must be single rule in markup {row}")
allowed_labels = ['T', 'F', 'X']
if self.GroundTruth not in allowed_labels:
raise RuntimeError(f"ERROR: GroundTruth must be in {allowed_labels} {row}")
raise ValueError(f"ERROR: 'Other' Category must be single rule in markup {row}")
if self.GroundTruth not in ALLOWED_LABELS:
raise ValueError(f"ERROR: GroundTruth must be in {ALLOWED_LABELS} {row}")
if 0 > self.LineStart or 0 > self.LineEnd:
raise RuntimeError(f"ERROR: LineStart and LineEnd must be positive {row}")
raise ValueError(f"ERROR: LineStart and LineEnd must be positive {row}")
elif self.LineStart > self.LineEnd:
raise RuntimeError(f"ERROR: LineStart must be lower than LineEnd {row}")
raise ValueError(f"ERROR: LineStart must be lower than LineEnd {row}")
elif self.LineStart == self.LineEnd and 0 <= self.ValueStart and 0 <= self.ValueEnd < self.ValueStart:
# multiline value positions are independent
raise RuntimeError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}")
raise ValueError(f"ERROR: ValueStart must be lower than ValueEnd for single line {row}")

def __str__(self) -> str:
dict_values = self.__dict__.values()
Expand All @@ -85,7 +86,7 @@ def _meta_from_file(meta_path: Path) -> Generator[dict, None, None]:
reader = csv.DictReader(f)
for row in reader:
if not isinstance(row, dict):
raise RuntimeError(f"ERROR: wrong row '{row}' in {meta_path}")
raise ValueError(f"ERROR: wrong row '{row}' in {meta_path}")
yield row


Expand All @@ -100,17 +101,17 @@ def _meta_from_dir(meta_path: Path) -> Generator[dict, None, None]:

def _get_source_gen(meta_path: Union[Path]) -> Generator[dict, None, None]:
if not isinstance(meta_path, Path):
raise RuntimeError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}")
raise ValueError(f"ERROR: unsupported source {meta_path} type {type(meta_path)}")

if not meta_path.exists():
raise RuntimeError(f"ERROR: {meta_path} does not exist")
raise ValueError(f"ERROR: {meta_path} does not exist")

if meta_path.is_dir():
source_gen = _meta_from_dir
elif meta_path.is_file():
source_gen = _meta_from_file
else:
raise RuntimeError(f"ERROR: unsupported {meta_path} file type")
raise ValueError(f"ERROR: unsupported {meta_path} file type")
yield from source_gen(meta_path)


Expand All @@ -122,7 +123,7 @@ def read_meta(meta_dir: Union[str, Path]) -> List[MetaRow]:
for row in _get_source_gen(Path(meta_dir)):
meta_row = MetaRow(row)
if meta_row.Id in meta_ids:
raise RuntimeError(f"ERROR: duplicate Id row {row}")
raise ValueError(f"ERROR: duplicate Id row {row}")
meta_ids.add(meta_row.Id)

meta.append(meta_row)
Expand Down
101 changes: 25 additions & 76 deletions obfuscate_creds.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import string
import sys
from argparse import Namespace, ArgumentParser
from multiprocessing.managers import Value
from typing import List

from constants import PRIVATE_KEY_CATEGORY, LABEL_TRUE
from meta_row import read_meta, MetaRow

logging.basicConfig(
Expand Down Expand Up @@ -409,32 +409,6 @@ def gen_random_value(value):
return obfuscated_value


def replace_rows(data: List[MetaRow], lines: List[str], noise: int):
    """Obfuscate single-line credential values inside already copied files.

    ``lines`` is modified in place. A row is rewritten only when all of the
    following hold: its CryptographyKey is empty, it starts and ends on the
    same line, it is labelled 'T', both value positions are non-negative and
    its category is not one of the multi-pattern categories.

    Args:
        data: markup rows describing the file held in ``lines``.
        lines: content of the file, one element per line.
        noise: extra value mixed into the random seed of every replacement.
    """
    for row in data:
        eligible = (
            # keys and multi-line credentials are processed by another function
            row.CryptographyKey == ""
            and row.LineStart == row.LineEnd
            # false cases do not require obfuscation
            and row.GroundTruth == 'T'
            and row.ValueStart >= 0
            and row.ValueEnd >= 0
            # multi-pattern categories are skipped
            and row.Category not in ["AWS Multi", "Google Multi"]
        )
        if not eligible:
            continue

        index = row.LineStart - 1
        source_line = lines[index]
        secret = source_line[row.ValueStart:row.ValueEnd]
        # CredSweeper may scan huge lines since v1.6
        random.seed((row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise)
        masked = get_obfuscated_value(secret, row)
        lines[index] = source_line[:row.ValueStart] + masked + source_line[row.ValueEnd:]


def split_in_bounds(i: int, lines_len: int, old_line: str):
Expand Down Expand Up @@ -524,56 +498,21 @@ def create_new_key(lines: List[str]):
return new_lines


def create_new_multiline(lines: List[str], starting_position: int):
    """Build obfuscated copies of ``lines`` that keep the original layout.

    The first line keeps everything before ``starting_position`` and has the
    remainder obfuscated; every following line is obfuscated as a whole.
    An "ssh-rsa" marker found in the first line is restored at the same
    offset in the result.

    Args:
        lines: the lines of a multi-line value, first line first.
        starting_position: offset in the first line where the value begins.

    Returns:
        A new list with one obfuscated line per input line.
    """
    marker = "ssh-rsa"
    first = lines[0]
    masked_first = first[:starting_position] + obfuscate_segment(first[starting_position:])

    # Do not replace the ssh-rsa substring if present
    marker_at = first.find(marker)
    if marker_at >= 0:
        masked_first = masked_first[:marker_at] + marker + masked_first[marker_at + len(marker):]

    # remaining lines are obfuscated in their original order
    return [masked_first] + [obfuscate_segment(following) for following in lines[1:]]


def process_pem_key(row: MetaRow, lines: List[str], noise: int):
# Change data in already copied files (only keys)
try:
# Skip credentials that are not PEM or multiline
if row.CryptographyKey == "" and row.LineStart == row.LineEnd:
return

if row.Category in ["AWS Multi", "Google Multi"]:
# skip double obfuscation for the categories
return

random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise)

if '' != row.CryptographyKey:
new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd])
else:
new_lines = create_new_multiline(lines[row.LineStart - 1:row.LineEnd], row.ValueStart)

lines[row.LineStart - 1:row.LineEnd] = new_lines

except Exception as exc:
logger.error(f"FAILURE: {row}")
logger.critical(exc)
raise
random.seed(row.LineStart ^ int(row.FileID, 16) ^ noise)
new_lines = create_new_key(lines[row.LineStart - 1:row.LineEnd])
lines[row.LineStart - 1:row.LineEnd] = new_lines


def process_pem_keys(data: List[MetaRow], lines: List[str], noise: int):
    """Run PEM key obfuscation for every row labelled as a true PEM private key.

    Args:
        data: markup rows describing the file held in ``lines``.
        lines: content of the file, one element per line; handed to
            ``process_pem_key`` for each matching row.
        noise: extra value forwarded to ``process_pem_key``.
    """
    for row in data:
        # The markup names this category "PEM Private Key" (PRIVATE_KEY_CATEGORY).
        # Comparing against the bare "Private Key" literal never matched, so
        # no key was ever handed over for obfuscation.
        if LABEL_TRUE == row.GroundTruth and PRIVATE_KEY_CATEGORY == row.Category:
            process_pem_key(row, lines, noise)
def process_single_value(row: MetaRow, lines: List[str], noise: int):
    """Replace the value marked by ``row`` with an obfuscated one, in place.

    Args:
        row: markup row; its LineStart selects the line and its
            ValueStart/ValueEnd select the slice that is replaced.
        lines: content of the file, one element per line; the selected
            element is overwritten.
        noise: extra value mixed into the random seed.
    """
    # CredSweeper may scan huge lines since v1.6
    seed = (row.ValueStart | (row.LineStart << 16)) ^ int(row.FileID, 16) ^ noise
    random.seed(seed)

    index = row.LineStart - 1
    source_line = lines[index]
    begin, end = row.ValueStart, row.ValueEnd
    replacement = get_obfuscated_value(source_line[begin:end], row)
    lines[index] = source_line[:begin] + replacement + source_line[end:]


def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0):
Expand All @@ -594,9 +533,19 @@ def obfuscate_creds(meta_dir: str, dataset_dir: str, noise: int = 0):
logger.critical(exc)
raise
meta_rows.sort(key=lambda x: (x.LineStart, x.LineEnd, x.ValueStart, x.ValueEnd))
replace_rows(meta_rows, lines, noise)
process_pem_keys(meta_rows, lines, noise)

for row in meta_rows:
if LABEL_TRUE != row.GroundTruth:
# obfuscation is only for True cases
continue
elif row.Category in ["AWS Multi", "Google Multi"]:
# skip obfuscation for the categories which are multi pattern
continue
elif PRIVATE_KEY_CATEGORY == row.Category and row.LineStart < row.LineEnd:
# multiline PEM keys obfuscation
process_pem_key(row, lines, noise)
elif row.LineStart == row.LineEnd and 0 <= row.ValueStart < row.ValueEnd:
# single value obfuscation
process_single_value(row, lines, noise)
with open(dataset_file, "w", encoding="utf8") as f:
f.write('\n'.join(lines))

Expand Down
Loading
Loading