Skip to content

Commit 16b6248

Browse files
committed
Python: Extract SensitiveDataHeuristics to be shared with JS
Initially, `nameIndicatesSensitiveData` was called `maybeSensitiveName`, which made its relationship with `maybeSensitive` and `notSensitive` quite strange -- and therefore I added the more informative `maybeSensitiveRegexp` and `notSensitiveRegexp`. Although I'm no longer using `maybeSensitiveName`, and I no longer have a strong argument for making this name change, I still like it. If someone thinks this is a terrible idea, I'm happy to change it though 👍
1 parent 1ed11b2 commit 16b6248

File tree

4 files changed

+285
-70
lines changed

4 files changed

+285
-70
lines changed

config/identical-files.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,5 +439,9 @@
439439
"CryptoAlgorithms Python/JS": [
440440
"javascript/ql/src/semmle/javascript/security/CryptoAlgorithms.qll",
441441
"python/ql/src/semmle/crypto/Crypto.qll"
442+
],
443+
"SensitiveDataHeuristics Python/JS": [
444+
"javascript/ql/src/semmle/javascript/security/internal/SensitiveDataHeuristics.qll",
445+
"python/ql/src/semmle/python/security/internal/SensitiveDataHeuristics.qll"
442446
]
443-
}
447+
}
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
/**
2+
* INTERNAL: Do not use.
3+
*
4+
* Provides classes and predicates for identifying strings that may indicate the presence of sensitive data.
5+
* This lets us share the logic across our CodeQL analyses of different languages.
6+
*
7+
* 'Sensitive' data in general is anything that should not be sent around in unencrypted form.
8+
*/
9+
10+
/**
11+
* A classification of different kinds of sensitive data:
12+
*
13+
* - secret: generic secret or trusted data;
14+
* - id: a user name or other account information;
15+
* - password: a password or authorization key;
16+
* - certificate: a certificate.
17+
*
18+
* While classifications are represented as strings, this should not be relied upon.
19+
* Instead, use the predicates in `SensitiveDataClassification::` to work with
20+
* classifications.
21+
*/
22+
class SensitiveDataClassification extends string {
23+
SensitiveDataClassification() { this in ["secret", "id", "password", "certificate"] }
24+
}
25+
26+
/**
27+
* Provides predicates to select the different kinds of sensitive data we support.
28+
*/
29+
module SensitiveDataClassification {
30+
/** Gets the classification for secret or trusted data. */
31+
SensitiveDataClassification secret() { result = "secret" }
32+
33+
/** Gets the classification for user names or other account information. */
34+
SensitiveDataClassification id() { result = "id" }
35+
36+
/** Gets the classification for passwords or authorization keys. */
37+
SensitiveDataClassification password() { result = "password" }
38+
39+
/** Gets the classification for certificates. */
40+
SensitiveDataClassification certificate() { result = "certificate" }
41+
}
42+
43+
/**
44+
* INTERNAL: Do not use.
45+
*
46+
* Provides heuristics for identifying names related to sensitive information.
47+
*/
48+
module HeuristicNames {
49+
/**
50+
* Gets a regular expression that identifies strings that may indicate the presence of secret
51+
* or trusted data.
52+
*/
53+
string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
54+
55+
/**
56+
* Gets a regular expression that identifies strings that may indicate the presence of
57+
* user names or other account information.
58+
*/
59+
string maybeAccountInfo() {
60+
result = "(?is).*acc(ou)?nt.*" or
61+
result = "(?is).*(puid|username|userid).*"
62+
}
63+
64+
/**
65+
* Gets a regular expression that identifies strings that may indicate the presence of
66+
* a password or an authorization key.
67+
*/
68+
string maybePassword() {
69+
result = "(?is).*pass(wd|word|code|phrase)(?!.*question).*" or
70+
result = "(?is).*(auth(entication|ori[sz]ation)?)key.*"
71+
}
72+
73+
/**
74+
* Gets a regular expression that identifies strings that may indicate the presence of
75+
* a certificate.
76+
*/
77+
string maybeCertificate() { result = "(?is).*(cert)(?!.*(format|name)).*" }
78+
79+
/**
80+
* Gets a regular expression that identifies strings that may indicate the presence
81+
* of sensitive data, with `classification` describing the kind of sensitive data involved.
82+
*/
83+
string maybeSensitiveRegexp(SensitiveDataClassification classification) {
84+
result = maybeSecret() and classification = SensitiveDataClassification::secret()
85+
or
86+
result = maybeAccountInfo() and classification = SensitiveDataClassification::id()
87+
or
88+
result = maybePassword() and classification = SensitiveDataClassification::password()
89+
or
90+
result = maybeCertificate() and
91+
classification = SensitiveDataClassification::certificate()
92+
}
93+
94+
/**
95+
* Gets a regular expression that identifies strings that may indicate the presence of data
96+
* that is hashed or encrypted, and hence rendered non-sensitive.
97+
*/
98+
string notSensitiveRegexp() {
99+
result = "(?is).*(redact|censor|obfuscate|hash|md5|sha|((?<!un)(en))?(crypt|code)).*"
100+
}
101+
102+
/**
103+
* DEPRECATED: Use `maybeSensitiveRegexp` instead.
104+
* Only added to aid with the internal rewrite.
105+
*/
106+
deprecated predicate maybeSensitive = maybeSensitiveRegexp/1;
107+
108+
/**
109+
* DEPRECATED: Use `notSensitiveRegexp` instead.
110+
* Only added to aid with the internal rewrite.
111+
*/
112+
deprecated predicate notSensitive = notSensitiveRegexp/0;
113+
114+
/**
115+
* Holds if `name` may indicate the presence of sensitive data, and
116+
* `name` does not indicate the presence of data that is hashed or encrypted, which would have
117+
* rendered the data non-sensitive. `classification` describes the kind of sensitive data involved.
118+
*
119+
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
120+
* given classification), and none of the regexps from `notSensitiveRegexp` matches
121+
* `name`.
122+
*/
123+
bindingset[name]
124+
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
125+
name.regexpMatch(maybeSensitiveRegexp(classification)) and
126+
not name.regexpMatch(notSensitiveRegexp())
127+
}
128+
}

python/ql/src/semmle/python/security/SensitiveData.qll

Lines changed: 24 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -12,105 +12,60 @@
1212
import python
1313
import semmle.python.dataflow.TaintTracking
1414
import semmle.python.web.HttpRequest
15-
16-
/**
17-
* Provides heuristics for identifying names related to sensitive information.
18-
*
19-
* INTERNAL: Do not use directly.
20-
* This is copied from the javascript library, but should be language independent.
21-
*/
22-
private module HeuristicNames {
23-
/**
24-
* Gets a regular expression that identifies strings that may indicate the presence of secret
25-
* or trusted data.
26-
*/
27-
string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
28-
29-
/**
30-
* Gets a regular expression that identifies strings that may indicate the presence of
31-
* user names or other account information.
32-
*/
33-
string maybeAccountInfo() {
34-
result = "(?is).*acc(ou)?nt.*" or
35-
result = "(?is).*(puid|username|userid).*"
36-
}
37-
38-
/**
39-
* Gets a regular expression that identifies strings that may indicate the presence of
40-
* a password or an authorization key.
41-
*/
42-
string maybePassword() {
43-
result = "(?is).*pass(wd|word|code|phrase)(?!.*question).*" or
44-
result = "(?is).*(auth(entication|ori[sz]ation)?)key.*"
45-
}
46-
47-
/**
48-
* Gets a regular expression that identifies strings that may indicate the presence of
49-
* a certificate.
50-
*/
51-
string maybeCertificate() { result = "(?is).*(cert)(?!.*(format|name)).*" }
52-
53-
/**
54-
* Gets a regular expression that identifies strings that may indicate the presence
55-
* of sensitive data, with `classification` describing the kind of sensitive data involved.
56-
*/
57-
string maybeSensitive(SensitiveData data) {
58-
result = maybeSecret() and data instanceof SensitiveData::Secret
59-
or
60-
result = maybeAccountInfo() and data instanceof SensitiveData::Id
61-
or
62-
result = maybePassword() and data instanceof SensitiveData::Password
63-
or
64-
result = maybeCertificate() and data instanceof SensitiveData::Certificate
65-
}
66-
67-
/**
68-
* Gets a regular expression that identifies strings that may indicate the presence of data
69-
* that is hashed or encrypted, and hence rendered non-sensitive.
70-
*/
71-
string notSensitive() {
72-
result = "(?is).*(redact|censor|obfuscate|hash|md5|sha|((?<!un)(en))?(crypt|code)).*"
73-
}
74-
75-
bindingset[name]
76-
SensitiveData getSensitiveDataForName(string name) {
77-
name.regexpMatch(HeuristicNames::maybeSensitive(result)) and
78-
not name.regexpMatch(HeuristicNames::notSensitive())
79-
}
80-
}
15+
import semmle.python.security.internal.SensitiveDataHeuristics
16+
private import HeuristicNames
8117

8218
abstract class SensitiveData extends TaintKind {
8319
bindingset[this]
8420
SensitiveData() { this = this }
21+
22+
/** Gets the classification of this sensitive data taint kind. */
23+
abstract SensitiveDataClassification getClassification();
8524
}
8625

8726
module SensitiveData {
8827
class Secret extends SensitiveData {
8928
Secret() { this = "sensitive.data.secret" }
9029

9130
override string repr() { result = "a secret" }
31+
32+
override SensitiveDataClassification getClassification() {
33+
result = SensitiveDataClassification::secret()
34+
}
9235
}
9336

9437
class Id extends SensitiveData {
9538
Id() { this = "sensitive.data.id" }
9639

9740
override string repr() { result = "an ID" }
41+
42+
override SensitiveDataClassification getClassification() {
43+
result = SensitiveDataClassification::id()
44+
}
9845
}
9946

10047
class Password extends SensitiveData {
10148
Password() { this = "sensitive.data.password" }
10249

10350
override string repr() { result = "a password" }
51+
52+
override SensitiveDataClassification getClassification() {
53+
result = SensitiveDataClassification::password()
54+
}
10455
}
10556

10657
class Certificate extends SensitiveData {
10758
Certificate() { this = "sensitive.data.certificate" }
10859

10960
override string repr() { result = "a certificate or key" }
61+
62+
override SensitiveDataClassification getClassification() {
63+
result = SensitiveDataClassification::certificate()
64+
}
11065
}
11166

11267
private SensitiveData fromFunction(Value func) {
113-
result = HeuristicNames::getSensitiveDataForName(func.getName())
68+
nameIndicatesSensitiveData(func.getName(), result.getClassification())
11469
}
11570

11671
abstract class Source extends TaintSource {
@@ -134,7 +89,7 @@ module SensitiveData {
13489
SensitiveData data;
13590

13691
SensitiveVariableAccess() {
137-
data = HeuristicNames::getSensitiveDataForName(this.(AttrNode).getName())
92+
nameIndicatesSensitiveData(this.(AttrNode).getName(), data.getClassification())
13893
}
13994

14095
override predicate isSourceOf(TaintKind kind) { kind = data }
@@ -149,7 +104,7 @@ module SensitiveData {
149104
this.(CallNode).getFunction().(AttrNode).getName() = "get" and
150105
exists(StringValue sensitive |
151106
this.(CallNode).getAnArg().pointsTo(sensitive) and
152-
data = HeuristicNames::getSensitiveDataForName(sensitive.getText())
107+
nameIndicatesSensitiveData(sensitive.getText(), data.getClassification())
153108
)
154109
}
155110

0 commit comments

Comments
 (0)