Skip to content

Commit 63de58c

Browse files
authored
Merge pull request github#3819 from dbartol/codeql-c-analysis-team/40/2
C++: More IR QLDoc (including `Opcode.qll`)
2 parents 989b57c + 47bb007 commit 63de58c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+3238
-204
lines changed

config/opcode-qldoc.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
#!/usr/bin/env python3
"""Regenerate the QLDoc comments on the `Opcode` classes in `Opcode.qll`.

The script scans `Instruction.qll` to find every `<Name>Instruction` class that has a QLDoc
comment, then rewrites `Opcode.qll` in place so that each matching `Opcode` class carries a short
QLDoc comment that points at the corresponding `Instruction` class.
"""

import os
import re
path = os.path

needs_an_re = re.compile(r'^(?!Unary)[AEIOU]')  # Name requiring "an" instead of "a".
start_qldoc_re = re.compile(r'^\s*/\*\*')  # Start of a QLDoc comment
end_qldoc_re = re.compile(r'\*/\s*$')  # End of a QLDoc comment
blank_qldoc_line_re = re.compile(r'^\s*\*\s*$')  # A line in a QLDoc comment with only the '*'
# NOTE: the character class must be `[A-Za-z0-9]`. A range such as `A-a` would also admit the
# punctuation that sits between `Z` and `a` in ASCII (`[`, `\`, `]`, `^`, `_`, and the backtick).
instruction_class_re = re.compile(r'^class (?P<name>[A-Za-z0-9]+)Instruction\s')  # Declaration of an `Instruction` class
opcode_base_class_re = re.compile(r'^abstract class (?P<name>[A-Za-z0-9]+)Opcode\s')  # Declaration of an `Opcode` base class
# Opcode classes are declared inside the `Opcode` module, hence the two leading spaces.
opcode_class_re = re.compile(r'^  class (?P<name>[A-Za-z0-9]+)\s')  # Declaration of an `Opcode` class

script_dir = path.realpath(path.dirname(__file__))
instruction_path = path.realpath(path.join(script_dir, '../cpp/ql/src/semmle/code/cpp/ir/implementation/raw/Instruction.qll'))
opcode_path = path.realpath(path.join(script_dir, '../cpp/ql/src/semmle/code/cpp/ir/implementation/Opcode.qll'))


def collect_instruction_comments(lines):
    """Map each `<Name>Instruction` class name (without the suffix) to its QLDoc lines.

    `lines` is any iterable of source lines, each including its trailing newline. Only the part
    of each QLDoc comment before its first blank (`*`-only) line is kept, plus the closing line,
    to avoid duplicating all of the verbose description.
    """
    instruction_comments = {}
    in_qldoc = False
    saw_blank_line_in_qldoc = False
    qldoc_lines = []
    for line in lines:
        if in_qldoc:
            if end_qldoc_re.search(line):
                qldoc_lines.append(line)
                in_qldoc = False
            elif blank_qldoc_line_re.search(line):
                # We're going to skip any lines after the first blank line, to avoid duplicating
                # all of the verbose description.
                saw_blank_line_in_qldoc = True
            elif not saw_blank_line_in_qldoc:
                qldoc_lines.append(line)
        elif start_qldoc_re.search(line):
            # Starting a new QLDoc comment.
            saw_blank_line_in_qldoc = False
            qldoc_lines.append(line)
            if not end_qldoc_re.search(line):
                in_qldoc = True
        else:
            instruction_match = instruction_class_re.search(line)
            if instruction_match:
                # Found the declaration of an `Instruction` class. Record the QLDoc comments.
                instruction_comments[instruction_match.group('name')] = qldoc_lines
            # Any other line ends the association between a pending comment and a declaration.
            qldoc_lines = []
    return instruction_comments


def annotate_opcode_lines(lines, instruction_comments):
    """Return the lines of `Opcode.qll` with regenerated QLDoc on known opcodes.

    `lines` is any iterable of source lines, each including its trailing newline. Whenever an
    `Opcode` class declaration has a non-empty entry in `instruction_comments`, the QLDoc comment
    that immediately precedes it (if any) is replaced by a generated one. All other lines,
    including QLDoc comments on other declarations, are passed through unchanged.
    """
    in_qldoc = False
    qldoc_lines = []
    output_lines = []
    for line in lines:
        if in_qldoc:
            qldoc_lines.append(line)
            if end_qldoc_re.search(line):
                in_qldoc = False
            continue
        if start_qldoc_re.search(line):
            qldoc_lines.append(line)
            if not end_qldoc_re.search(line):
                in_qldoc = True
            continue

        name_without_suffix = None
        indent = ''
        opcode_base_match = opcode_base_class_re.search(line)
        if opcode_base_match:
            name_without_suffix = opcode_base_match.group('name')
        else:
            opcode_match = opcode_class_re.search(line)
            if opcode_match:
                name_without_suffix = opcode_match.group('name')
                # Indent by two additional spaces, since opcodes are declared in the
                # `Opcode` module.
                indent = '  '

        # Only replace the QLDoc when the matching `Instruction` actually has a comment of its
        # own; otherwise keep whatever was already attached to the `Opcode`.
        if name_without_suffix and instruction_comments.get(name_without_suffix):
            article = 'an' if needs_an_re.search(name_without_suffix) else 'a'
            qldoc_lines = [
                indent + '/**\n',
                indent + ' * The `Opcode` for ' + article + ' `' + name_without_suffix + 'Instruction`.\n',
                indent + ' *\n',
                indent + ' * See the `' + name_without_suffix + 'Instruction` documentation for more details.\n',
                indent + ' */\n'
            ]
        output_lines.extend(qldoc_lines)
        qldoc_lines = []
        output_lines.append(line)

    # Flush a comment that is still pending at end of file, so that rewriting the file never
    # drops content.
    output_lines.extend(qldoc_lines)
    return output_lines


def main():
    """Read `Instruction.qll`, then rewrite `Opcode.qll` in place with the updated QLDoc."""
    # Scan `Instruction.qll`, keeping track of the QLDoc comment attached to each declaration of
    # a class whose name ends with `Instruction`.
    with open(instruction_path, 'r', encoding='utf-8') as instr:
        instruction_comments = collect_instruction_comments(instr)

    # Scan `Opcode.qll`. Whenever we see the declaration of an `Opcode` class for which we have a
    # corresponding `Instruction` class, we'll attach a copy of the `Instruction`'s QLDoc comment.
    with open(opcode_path, 'r', encoding='utf-8') as opcode:
        output_lines = annotate_opcode_lines(opcode, instruction_comments)

    # Write out the updated `Opcode.qll`
    with open(opcode_path, 'w', encoding='utf-8') as opcode:
        opcode.writelines(output_lines)


if __name__ == '__main__':
    main()

cpp/ql/src/semmle/code/cpp/ir/IR.qll

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,47 @@
1+
/**
2+
* Provides classes that describe the Intermediate Representation (IR) of the program.
3+
*
4+
* The IR is a representation of the semantics of the program, with very little dependence on the
5+
* syntax that was used to write the program. For example, in C++, the statements `i += 1;`, `i++`,
6+
* and `++i` all have the same semantic effect, but appear in the AST as three different types of
7+
* `Expr` node. In the IR, all three statements are broken down into a sequence of fundamental
8+
* operations similar to:
9+
*
10+
* ```
11+
* r1(int*) = VariableAddress[i] // Compute the address of variable `i`
12+
* r2(int) = Load &:r1, m0 // Load the value of `i`
13+
* r3(int) = Constant[1] // An integer constant with the value `1`
14+
* r4(int) = Add r2, r3 // Add `1` to the value of `i`
15+
* r5(int) = Store &:r1, r4 // Store the new value back into the variable `i`
16+
* ```
17+
*
18+
* This allows IR-based analysis to focus on the fundamental operations, rather than having to be
19+
* concerned with the various ways of expressing those operations in source code.
20+
*
21+
* The key classes in the IR are:
22+
*
23+
* - `IRFunction` - Contains the IR for an entire function definition, including all of that
24+
* function's `Instruction`s, `IRBlock`s, and `IRVariable`s.
25+
* - `Instruction` - A single operation in the IR. An instruction specifies the operation to be
26+
* performed, the operands that produce the inputs to that operation, and the type of the result
27+
* of the operation. Control flows from an `Instruction` to one of a set of successor
28+
* `Instruction`s.
29+
* - `Operand` - An input value of an `Instruction`. All inputs of an `Instruction` are explicitly
30+
* represented as `Operand`s, even if the input was implicit in the source code. An `Operand` has
31+
* a link to the `Instruction` that consumes its value (its "use") and a link to the `Instruction`
32+
* that produces its value (its "definition").
33+
* - `IRVariable` - A variable accessed by the IR for a particular function. An `IRVariable` is
34+
* created for each variable directly accessed by the function. In addition, `IRVariable`s are
35+
* created to represent certain temporary storage locations that do not have explicitly declared
36+
* variables in the source code, such as the return value of the function.
37+
* - `IRBlock` - A "basic block" in the control flow graph of a function. An `IRBlock` contains a
38+
* sequence of instructions such that control flow can only enter the block at the first
39+
* instruction, and can only leave the block from the last instruction.
40+
* - `IRType` - The type of a value accessed in the IR. Unlike the `Type` class in the AST, `IRType`
41+
* is language-neutral. For example, in C++, `unsigned int`, `char32_t`, and `wchar_t` might all
42+
* be represented as the `IRType` `uint4`, a four-byte unsigned integer.
43+
*/
44+
145
// Most queries should operate on the aliased SSA IR, so that's what we expose
2-
// publically as the "IR".
46+
// publicly as the "IR".
347
import implementation.aliased_ssa.IR
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
1+
/**
2+
* Module used to configure the IR generation process.
3+
*/
4+
15
import implementation.IRConfiguration
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,11 @@
1+
/**
2+
* Outputs a representation of the IR as a control flow graph.
3+
*
4+
* This file contains the actual implementation of `PrintIR.ql`. For test cases and very small
5+
* databases, `PrintIR.ql` can be run directly to dump the IR for the entire database. For most
6+
* uses, however, it is better to write a query that imports `PrintIR.qll`, extends
7+
* `PrintIRConfiguration`, and overrides `shouldPrintFunction()` to select a subset of functions to
8+
* dump.
9+
*/
10+
111
import implementation.aliased_ssa.PrintIR

cpp/ql/src/semmle/code/cpp/ir/implementation/EdgeKind.qll

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
/**
2+
* Provides classes that specify the conditions under which control flows along a given edge.
3+
*/
4+
15
private import internal.EdgeKindInternal
26

37
private newtype TEdgeKind =
@@ -77,9 +81,15 @@ class CaseEdge extends EdgeKind, TCaseEdge {
7781
else result = "Case[" + minValue + ".." + maxValue + "]"
7882
}
7983

80-
string getMinValue() { result = minValue }
84+
/**
85+
* Gets the smallest value of the switch expression for which control will flow along this edge.
86+
*/
87+
final string getMinValue() { result = minValue }
8188

82-
string getMaxValue() { result = maxValue }
89+
/**
90+
* Gets the largest value of the switch expression for which control will flow along this edge.
91+
*/
92+
final string getMaxValue() { result = maxValue }
8393
}
8494

8595
/**

cpp/ql/src/semmle/code/cpp/ir/implementation/IRConfiguration.qll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,21 @@ private newtype TIRConfiguration = MkIRConfiguration()
1010
* The query can extend this class to control which functions have IR generated for them.
1111
*/
1212
class IRConfiguration extends TIRConfiguration {
13+
/** Gets a textual representation of this element. */
1314
string toString() { result = "IRConfiguration" }
1415

1516
/**
1617
* Holds if IR should be created for function `func`. By default, holds for all functions.
1718
*/
1819
predicate shouldCreateIRForFunction(Language::Function func) { any() }
1920

21+
/**
22+
* Holds if the strings used as part of an IR dump should be generated for function `func`.
23+
*
24+
* This predicate is overridden in `PrintIR.qll` to avoid the expense of generating a large number
25+
* of debug strings for IR that will not be dumped. We still generate the actual IR for these
26+
* functions, however, to preserve the results of any interprocedural analysis.
27+
*/
2028
predicate shouldEvaluateDebugStringsForFunction(Language::Function func) { any() }
2129
}
2230

@@ -26,6 +34,7 @@ private newtype TIREscapeAnalysisConfiguration = MkIREscapeAnalysisConfiguration
2634
* The query can extend this class to control what escape analysis is used when generating SSA.
2735
*/
2836
class IREscapeAnalysisConfiguration extends TIREscapeAnalysisConfiguration {
37+
/** Gets a textual representation of this element. */
2938
string toString() { result = "IREscapeAnalysisConfiguration" }
3039

3140
/**

cpp/ql/src/semmle/code/cpp/ir/implementation/IRType.qll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ private newtype TIRType =
3232
* all pointer types map to the same instance of `IRAddressType`.
3333
*/
3434
class IRType extends TIRType {
35+
/** Gets a textual representation of this type. */
3536
string toString() { none() }
3637

3738
/**

cpp/ql/src/semmle/code/cpp/ir/implementation/MemoryAccessKind.qll

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
/**
2+
* Provides classes that describe how a particular `Instruction` or its operands access memory.
3+
*/
4+
5+
private import IRConfiguration
6+
17
private newtype TMemoryAccessKind =
28
TIndirectMemoryAccess() or
39
TBufferMemoryAccess() or
@@ -14,6 +20,7 @@ private newtype TMemoryAccessKind =
1420
* memory result.
1521
*/
1622
class MemoryAccessKind extends TMemoryAccessKind {
23+
/** Gets a textual representation of this access kind. */
1724
string toString() { none() }
1825

1926
/**

0 commit comments

Comments
 (0)