Skip to content

Commit cfb2d7f

Browse files
committed
Ruby: add shared SensitiveDataHeuristics.qll
1 parent 864b61a commit cfb2d7f

File tree

2 files changed

+131
-1
lines changed

2 files changed

+131
-1
lines changed

config/identical-files.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,8 @@
465465
],
466466
"SensitiveDataHeuristics Python/JS": [
467467
"javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll",
468-
"python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll"
468+
"python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll",
469+
"ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll"
469470
],
470471
"ReDoS Util Python/JS/Ruby": [
471472
"javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll",
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/**
2+
* INTERNAL: Do not use.
3+
*
4+
* Provides classes and predicates for identifying strings that may indicate the presence of sensitive data.
5+
* This allows the logic to be shared across our CodeQL analyses of different languages.
6+
*
7+
* 'Sensitive' data in general is anything that should not be sent around in unencrypted form.
8+
*/
9+
10+
/**
 * A kind of sensitive data. The supported kinds are:
 *
 * - secret: generic secret or trusted data;
 * - id: a user name or other account information;
 * - password: a password or authorization key;
 * - certificate: a certificate.
 *
 * Classifications happen to be represented as strings, but callers should not
 * depend on that representation. Use the predicates in
 * `SensitiveDataClassification::` to obtain and compare classifications.
 */
class SensitiveDataClassification extends string {
  SensitiveDataClassification() {
    // The complete, closed set of supported classification values.
    this = ["secret", "id", "password", "certificate"]
  }
}
25+
26+
/**
 * Provides one accessor predicate per supported kind of sensitive data, so that
 * callers never need to spell out the underlying string values themselves.
 */
module SensitiveDataClassification {
  /** Gets the classification for certificates. */
  SensitiveDataClassification certificate() {
    result = "certificate"
  }

  /** Gets the classification for user names or other account information. */
  SensitiveDataClassification id() {
    result = "id"
  }

  /** Gets the classification for passwords or authorization keys. */
  SensitiveDataClassification password() {
    result = "password"
  }

  /** Gets the classification for secret or trusted data. */
  SensitiveDataClassification secret() {
    result = "secret"
  }
}
42+
43+
/**
 * INTERNAL: Do not use.
 *
 * Provides heuristics for identifying names related to sensitive information.
 *
 * NOTE(review): this file is listed in `config/identical-files.json` next to the
 * JavaScript and Python copies; presumably any change made here has to be
 * mirrored in those copies as well - confirm before merging.
 */
module HeuristicNames {
  /**
   * Gets a regular expression that identifies strings that may indicate the presence of secret
   * or trusted data.
   *
   * The lookbehinds exclude `issecret`, `untrusted` and `istrusted`.
   */
  string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }

  /**
   * Gets a regular expression that identifies strings that may indicate the presence of
   * user names or other account information.
   */
  string maybeAccountInfo() {
    result = "(?is).*acc(ou)?nt.*" or
    result = "(?is).*(puid|username|userid|session(id|key)).*" or
    // Deliberately NOT case-insensitive as a whole: `uid` (in any casing) must be preceded
    // by a `u`/`U`, the start of the string, an underscore, or a lower-case letter that is
    // directly followed by an upper-case `U`.
    result = "(?s).*([uU]|^|_|[a-z](?=U))([uU][iI][dD]).*"
  }

  /**
   * Gets a regular expression that identifies strings that may indicate the presence of
   * a password or an authorization key.
   *
   * Names that continue with `question` after the password word are excluded.
   */
  string maybePassword() {
    result = "(?is).*pass(wd|word|code|phrase)(?!.*question).*" or
    result = "(?is).*(auth(entication|ori[sz]ation)?)key.*"
  }

  /**
   * Gets a regular expression that identifies strings that may indicate the presence of
   * a certificate.
   *
   * Names that continue with `format` or `name` after `cert` are excluded.
   */
  string maybeCertificate() { result = "(?is).*(cert)(?!.*(format|name)).*" }

  /**
   * Gets a regular expression that identifies strings that may indicate the presence
   * of sensitive data, with `classification` describing the kind of sensitive data involved.
   */
  string maybeSensitiveRegexp(SensitiveDataClassification classification) {
    result = maybeSecret() and classification = SensitiveDataClassification::secret()
    or
    result = maybeAccountInfo() and classification = SensitiveDataClassification::id()
    or
    result = maybePassword() and classification = SensitiveDataClassification::password()
    or
    result = maybeCertificate() and
    classification = SensitiveDataClassification::certificate()
  }

  /**
   * Gets a regular expression that identifies strings that may indicate the presence of data
   * that is hashed or encrypted, and hence rendered non-sensitive, or contains special characters
   * suggesting nouns within the string do not represent the meaning of the whole string (e.g. a URL or a SQL query).
   *
   * The `code` alternative is only honoured when it is not directly preceded by `pass`.
   * Otherwise every `passcode` name, which `maybePassword` explicitly recognises, would
   * also match this expression and could never be reported as sensitive.
   */
  string notSensitiveRegexp() {
    result =
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|((?<!un)(en))?(crypt|(?<!pass)code)).*"
  }

  /**
   * DEPRECATED: Use `maybeSensitiveRegexp` instead.
   */
  deprecated predicate maybeSensitive = maybeSensitiveRegexp/1;

  /**
   * DEPRECATED: Use `notSensitiveRegexp` instead.
   */
  deprecated predicate notSensitive = notSensitiveRegexp/0;

  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
   * it is hashed or encrypted). `classification` describes the kind of sensitive data
   * involved.
   *
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
    name.regexpMatch(maybeSensitiveRegexp(classification)) and
    not name.regexpMatch(notSensitiveRegexp())
  }
}

0 commit comments

Comments
 (0)