Skip to content

Commit 8832adf

Browse files
authored
New patterns benchmark fix (#787)
* gitlab pattern upgrade * BM ref fix * rule fix, banner fix * BM ref fix * ML train optimization
1 parent 589c24f commit 8832adf

File tree

13 files changed

+63
-60
lines changed

13 files changed

+63
-60
lines changed

.ci/benchmark.txt

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
META MD5 f4ea95be1b92cc0c1a9a78483d86d6e5
2-
DATA MD5 a78bd8abd1ef9b059a75f59d71ea1c31
3-
DATA: 16990255 interested lines. MARKUP: 54202 items
1+
META MD5 36fda64f33e1557d8a36fb3b6322ff65
2+
DATA MD5 7736b990c61c8dfad6ef7a6f372df160
3+
DATA: 16995334 interested lines. MARKUP: 54239 items
44
FileType FileNumber ValidLines Positives Negatives
55
--------------- ------------ ------------ ----------- -----------
66
684 567150 137 445
@@ -19,7 +19,7 @@ FileType FileNumber ValidLines Positives Negatives
1919
.build 2 40 1 2
2020
.bundle 4 1512 441
2121
.bzl 3 2503 11
22-
.c 183 285200 20 679
22+
.c 183 285200 20 680
2323
.cast 2 704 6 1
2424
.cc 28 29149 609
2525
.cf 2 19 2
@@ -62,7 +62,7 @@ FileType FileNumber ValidLines Positives Negatives
6262
.gd 1 37 1
6363
.gml 3 3075 16
6464
.gni 3 5017 17
65-
.go 1242 706630 1479 4537
65+
.go 1242 706630 1479 4540
6666
.golden 5 1168 1 42
6767
.gradle 50 4295 8 139
6868
.graphql 8 454 2 13
@@ -74,21 +74,21 @@ FileType FileNumber ValidLines Positives Negatives
7474
.har 6 2229 18
7575
.hbs 1 51 1
7676
.hpp 1 237 2
77-
.hs 14 4140 30 56
77+
.hs 14 4140 30 58
7878
.html 121 33979 188 100
7979
.idl 3 1625 37 5
8080
.iml 6 699 30
8181
.in 7 2242 10 48
8282
.inc 2 56 1 2
8383
.ini 12 1461 25 29
84-
.ipynb 1 134 2
84+
.ipynb 6 4804 10 10
8585
.j 1 241 4
8686
.j2 32 6043 8 174
8787
.java 650 141112 478 1282
8888
.jenkinsfile 1 58 2 6
8989
.jinja2 1 64 2
90-
.js 640 530803 844 2547
91-
.json 884 13053260 1879 5308
90+
.js 640 530803 844 2548
91+
.json 884 13053260 1879 5310
9292
.jsp 13 3202 1 37
9393
.jsx 7 857 15
9494
.jwt 1 1 2
@@ -116,7 +116,7 @@ FileType FileNumber ValidLines Positives Negatives
116116
.markdown 38 5862 69 5
117117
.markerb 3 12 3
118118
.marko 1 21 2
119-
.md 768 181113 1043 2250
119+
.md 768 181113 1043 2252
120120
.mdx 3 549 7
121121
.mjml 1 18 1
122122
.mjs 19 4119 88 89
@@ -128,7 +128,7 @@ FileType FileNumber ValidLines Positives Negatives
128128
.mqh 1 1023 1
129129
.msg 1 26644 1
130130
.mysql 1 36 2
131-
.ndjson 2 5006 79 221
131+
.ndjson 2 5006 80 230
132132
.nix 1 114 4
133133
.nolint 1 2 1
134134
.odd 1 1281 43
@@ -157,14 +157,14 @@ FileType FileNumber ValidLines Positives Negatives
157157
.pug 2 193 2
158158
.purs 1 69 4
159159
.pxd 1 150 2 4
160-
.py 875 292004 737 3440
160+
.py 876 292413 740 3444
161161
.pyi 4 1361 9
162162
.pyp 1 167 1
163163
.python 1 213
164164
.pyx 2 1094 23
165165
.r 4 62 5 2
166166
.rake 2 51 2
167-
.rb 834 127133 410 2778
167+
.rb 834 127133 410 2782
168168
.re 1 31 1
169169
.red 1 159 1
170170
.release 1 13 4
@@ -211,9 +211,9 @@ FileType FileNumber ValidLines Positives Negatives
211211
.toml 86 2471 60 248
212212
.tpl 1 43 1
213213
.travis 1 34 2 4
214-
.ts 607 107776 265 1882
214+
.ts 607 107776 266 1882
215215
.tsx 54 7914 1 116
216-
.txt 322 89402 5252 4043
216+
.txt 322 89402 5252 4045
217217
.utf8 1 77 1
218218
.vsmdi 1 6 2
219219
.vue 50 8736 1 153
@@ -225,16 +225,17 @@ FileType FileNumber ValidLines Positives Negatives
225225
.yml 560 56585 1907 1134
226226
.zsh 6 872 11
227227
.zsh-theme 1 97 1
228-
TOTAL: 11355 16990255 17183 41387
229-
credsweeper result_cnt : 16914, lost_cnt : 0, true_cnt : 16816, false_cnt : 98
228+
TOTAL: 11361 16995334 17198 41425
229+
credsweeper result_cnt : 16928, lost_cnt : 0, true_cnt : 16830, false_cnt : 98
230230
Rules Positives Negatives Reported TP FP TN FN FPR FNR ACC PRC RCL F1
231231
------------------------------ ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- --------
232-
API 242 3370 237 237 0 3370 5 0.000000 0.020661 0.998616 1.000000 0.979339 0.989562
232+
API 245 3375 240 240 0 3375 5 0.000000 0.020408 0.998619 1.000000 0.979592 0.989691
233233
AWS Client ID 205 19 197 197 0 19 8 0.000000 0.039024 0.964286 1.000000 0.960976 0.980100
234234
AWS Multi 82 11 31 29 2 9 53 0.181818 0.646341 0.408602 0.935484 0.353659 0.513274
235235
AWS S3 Bucket 67 23 92 67 23 0 0 1.000000 0.000000 0.744444 0.744444 1.000000 0.853503
236236
Akamai Credentials 6 2 6 6 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
237-
Auth 1171 2708 1145 1144 1 2707 27 0.000369 0.023057 0.992782 0.999127 0.976943 0.987910
237+
Anthropic API Key 1 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
238+
Auth 1171 2713 1145 1144 1 2712 27 0.000369 0.023057 0.992791 0.999127 0.976943 0.987910
238239
Azure Access Token 24 0 17 17 0 0 7 0.291667 0.708333 1.000000 0.708333 0.829268
239240
BASE64 Private Key 22 4 22 22 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
240241
BASE64 encoded PEM Private Key 12 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
@@ -260,24 +261,25 @@ Grafana Service Account Token 3 0 3 3
260261
JSON Web Token 174 61 165 165 0 61 9 0.000000 0.051724 0.961702 1.000000 0.948276 0.973451
261262
JWK 80 0 80 80 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
262263
Jira / Confluence PAT token 0 4 0 0 4 0 0.000000 1.000000
263-
Key 4285 16132 4280 4262 18 16114 23 0.001116 0.005368 0.997992 0.995794 0.994632 0.995213
264+
Key 4288 16143 4283 4265 18 16125 23 0.001115 0.005364 0.997993 0.995797 0.994636 0.995216
264265
MailGun API Key 8 0 8 8 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
265266
NKEY Seed 60 0 59 59 0 0 1 0.016667 0.983333 1.000000 0.983333 0.991597
266267
Nonce 130 55 129 128 1 54 2 0.018182 0.015385 0.983784 0.992248 0.984615 0.988417
267268
OTP / 2FA Secret 64 3 56 54 2 1 10 0.666667 0.156250 0.820896 0.964286 0.843750 0.900000
268-
Other 0 20 0 0 20 0 0.000000 1.000000
269+
Other 0 21 0 0 21 0 0.000000 1.000000
269270
PEM Private Key 1150 76 1154 1150 4 72 0 0.052632 0.000000 0.996737 0.996534 1.000000 0.998264
270-
Password 2577 9933 2502 2491 11 9922 86 0.001107 0.033372 0.992246 0.995604 0.966628 0.980902
271+
Password 2578 9940 2503 2492 11 9929 86 0.001107 0.033359 0.992251 0.995605 0.966641 0.980909
272+
Perplexity API Key 2 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
271273
Postman Credentials 2 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
272274
SQL Password 44 14 42 42 0 14 2 0.000000 0.045455 0.965517 1.000000 0.954545 0.976744
273275
Salesforce Credentials 6 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
274276
Salt 90 80 88 88 0 80 2 0.000000 0.022222 0.988235 1.000000 0.977778 0.988764
275-
Secret 1527 2379 1519 1517 2 2377 10 0.000841 0.006549 0.996928 0.998683 0.993451 0.996060
277+
Secret 1527 2380 1519 1517 2 2378 10 0.000840 0.006549 0.996929 0.998683 0.993451 0.996060
276278
Slack Token 15 1 15 15 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
277279
Stripe Credentials 2 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
278280
Tencent WeChat API App ID 47 0 47 47 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
279-
Token 1137 4672 1062 1059 3 4669 78 0.000642 0.068602 0.986056 0.997175 0.931398 0.963165
281+
Token 1138 4677 1062 1059 3 4674 79 0.000641 0.069420 0.985899 0.997175 0.930580 0.962727
280282
Twilio Credentials 30 39 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
281-
URL Credentials 225 382 221 221 0 382 4 0.000000 0.017778 0.993410 1.000000 0.982222 0.991031
282-
UUID 2523 16 2521 2505 16 0 18 1.000000 0.007134 0.986609 0.993653 0.992866 0.993259
283-
17183 41387 16917 16816 98 41289 367 0.002368 0.021358 0.992061 0.994206 0.978642 0.986362
283+
URL Credentials 225 385 221 221 0 385 4 0.000000 0.017778 0.993443 1.000000 0.982222 0.991031
284+
UUID 2527 16 2525 2509 16 0 18 1.000000 0.007123 0.986630 0.993663 0.992877 0.993270
285+
17198 41425 16931 16830 98 41327 368 0.002366 0.021398 0.992051 0.994211 0.978602 0.986345

.github/workflows/benchmark.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
3232
with:
3333
repository: Samsung/CredData
34-
ref: c130061999adedf5c54ed142300942fd37902471
34+
ref: a329489bdf03679d26f9a433e0143744a1da0fbd
3535

3636
- name: Markup hashing
3737
run: |
@@ -87,7 +87,7 @@ jobs:
8787
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
8888
with:
8989
repository: Samsung/CredData
90-
ref: c130061999adedf5c54ed142300942fd37902471
90+
ref: a329489bdf03679d26f9a433e0143744a1da0fbd
9191

9292
- name: Markup hashing
9393
run: |
@@ -190,7 +190,7 @@ jobs:
190190
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
191191
with:
192192
repository: Samsung/CredData
193-
ref: c130061999adedf5c54ed142300942fd37902471
193+
ref: a329489bdf03679d26f9a433e0143744a1da0fbd
194194

195195
- name: Markup hashing
196196
run: |
@@ -378,7 +378,7 @@ jobs:
378378
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
379379
with:
380380
repository: Samsung/CredData
381-
ref: c130061999adedf5c54ed142300942fd37902471
381+
ref: a329489bdf03679d26f9a433e0143744a1da0fbd
382382

383383
- name: Markup hashing
384384
run: |

.github/workflows/check.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ jobs:
9292
run: |
9393
banner="$(python -m credsweeper --banner | head -1)"
9494
echo "banner = '${banner}'"
95-
if [ "CredSweeper 1.13.3 crc32:5891e72f" != "${banner}" ]; then
95+
if [ "CredSweeper 1.13.3 crc32:21a09c86" != "${banner}" ]; then
9696
echo "Update the check for '${banner}'"
9797
exit 1
9898
fi

credsweeper/rules/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -988,7 +988,7 @@
988988
confidence: strong
989989
type: pattern
990990
values:
991-
- (?P<value>(_gitlab_session=|GR1348941|gl(agent|soat|ffct|p[at]t|oas|cbt|imt|[dfr]t)-)[0-9A-Za-z_-]{20,64})(?![0-9A-Za-z_-])
991+
- (?P<value>(_gitlab_session=|GR1348941|gl(agent|soat|ffct|p[at]t|oas|cbt|imt|[dfr]t)-)[0-9A-Za-z_-]{20,64}(\.[0-9A-Za-z_-]{2,16}){0,2})(?![0-9A-Za-z_-])
992992
filter_type:
993993
- ValuePatternCheck
994994
min_line_len: 25

experiment/data_loader.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@ def read_text(path) -> list[str]:
207207
line_data, line[line_data["value_start"]:line_data["value_end"]], line_data["value"])
208208
# todo: variable input has to be marked up in meta too, and/or a new feature "VariableExists" has to be created ???
209209
line_data["GroundTruth"] = label
210+
# auxiliary field for model_config_preprocess
211+
# no extra memory usage because the dataframe is deleted before training
210212
line_data["ext"] = Util.get_extension(line_data["path"])
211213
values.append(line_data)
212214

@@ -239,6 +241,9 @@ def read_text(path) -> list[str]:
239241
break
240242
read_text.cache_clear()
241243
df = pd.DataFrame(values)
244+
print(f"Initial full dataset: {len(df)} items\n{df.memory_usage(deep=True)}", flush=True)
245+
df = df.drop_duplicates(subset=["line", "variable", "value", "path"])
246+
print(f"Full dataset: {len(df)} items after drop duplicates\n{df.memory_usage(deep=True)}", flush=True)
242247
return df
243248

244249

experiment/train.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,6 @@ def train(
120120
else:
121121
raise RuntimeError("Something went wrong")
122122

123-
print(f"Common dataset: {len(df_all)} items", flush=True)
124-
df_all = df_all.drop_duplicates(subset=["line", "variable", "value", "path", "ext"])
125-
print(f"Common dataset: {len(df_all)} items after drop duplicates", flush=True)
126-
127123
# random split
128124
lucky_number = random.randint(1, 1 << 32)
129125
print(f"Lucky number: {lucky_number}", flush=True)

tests/data/depth_3_pedantic.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5635,17 +5635,17 @@
56355635
"ml_probability": null,
56365636
"line_data_list": [
56375637
{
5638-
"line": "glpat-8d5ri2n9g85LAnC9YW85 # Personal access token, Impersonation token, Project access token, Group access token",
5638+
"line": "glpat-8d5ri2n9g85LAnC9YW85.01.cgpAsnEmP # Personal access token, Impersonation token, Project access token, Group access token",
56395639
"line_num": 2,
56405640
"path": "./tests/samples/gitlab_prefix_token",
56415641
"info": "FILE:./tests/samples/gitlab_prefix_token|RAW",
56425642
"variable": null,
56435643
"variable_start": -2,
56445644
"variable_end": -2,
5645-
"value": "glpat-8d5ri2n9g85LAnC9YW85",
5645+
"value": "glpat-8d5ri2n9g85LAnC9YW85.01.cgpAsnEmP",
56465646
"value_start": 0,
5647-
"value_end": 26,
5648-
"entropy": 4.10391
5647+
"value_end": 39,
5648+
"entropy": 4.59259
56495649
}
56505650
]
56515651
},

0 commit comments

Comments
 (0)