Skip to content

Commit 27d1512

Browse files
committed
Python: MWE for call-graph tracing and ql comparison
1 parent 6d80445 commit 27d1512

File tree

9 files changed

+338
-0
lines changed

9 files changed

+338
-0
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Recorded Call Graph Metrics
2+
3+
also known as _call graph tracing_.
4+
5+
Execute a python program and for each call being made, record the call and callable. This allows us to compare call graph resolution from static analysis with actual data -- that is, can we statically determine the target of each actual call correctly.
6+
7+
This is still in the early stages, and currently only supports a very minimal working example (to show that this approach might work).
8+
9+
The next hurdle is being able to handle multiple calls on the same line, such as
10+
11+
- `foo(); bar()`
12+
- `foo(bar())`
13+
- `foo().bar()`
14+
15+
## How do I give it a spin?
16+
17+
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
#!/usr/bin/env python3
2+
3+
"""Call Graph tracing.
4+
5+
Execute a python program and for each call being made, record the call and callable. This
6+
allows us to compare call graph resolution from static analysis with actual data -- that
7+
is, can we statically determine the target of each actual call correctly.
8+
9+
If there is 100% code coverage from the Python execution, it would also be possible to
10+
look at the precision of the call graph resolutions -- that is, do we expect a function to
11+
be able to be called in a place where it is not? Currently not something we're looking at.
12+
"""
13+
14+
# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
15+
16+
# TODO: Know that a call to a C-function was made. See
17+
# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
18+
# test
19+
20+
# For inspiration, look at these projects:
21+
# - https://github.com/joerick/pyinstrument (capture call-stack every <n> ms for profiling)
22+
# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
23+
24+
import argparse
25+
import bdb
26+
from io import StringIO
27+
import sys
28+
import os
29+
import dis
30+
import dataclasses
31+
import csv
32+
import xml.etree.ElementTree as ET
33+
34+
# Copy-Paste and uncomment for interactive ipython sessions
35+
# import IPython; IPython.embed(); sys.exit()
36+
37+
38+
@dataclasses.dataclass(frozen=True)
class Call:
    """The place in the program from which a callable was invoked.

    A call site is described by the (canonical) file it lives in, the line
    that was executing, and the index of the bytecode instruction that was
    last executed in the calling frame.
    """

    filename: str
    linenum: int
    inst_index: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Call` describing where `frame` is currently executing.

        `frame` is the frame of the *caller*; `debugger` is only used to
        canonicalize the filename.
        """
        # To inspect the bytecode around the call, temporarily add:
        #   b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
        #   print(b.dis(), file=sys.__stderr__)
        canonical_path = debugger.canonic(frame.f_code.co_filename)
        return cls(canonical_path, frame.f_lineno, frame.f_lasti)
59+
60+
61+
@dataclasses.dataclass(frozen=True)
class Callable:
    """A callable (Function/Lambda) should (hopefully) be uniquely identified by its name
    and location (filename + line number of its definition).

    TODO: Callable is maybe not a good name, since classes with __call__ will return true
    for the python code `callable(cls)` -- will have to consider how __call__ is handled
    """

    funcname: str
    filename: str
    linenum: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Callable` from the frame that is executing the callee.

        `debugger` is only used to canonicalize the filename.
        """
        code = frame.f_code
        return cls(
            funcname=code.co_name,
            filename=debugger.canonic(code.co_filename),
            # Use the line where the code object was defined rather than the line
            # the frame is currently on. For an ordinary call the two are the same,
            # but when a generator/coroutine is resumed the frame is positioned at
            # the point where it was suspended, which would make the same function
            # show up as several different callables.
            linenum=code.co_firstlineno,
        )
81+
82+
83+
class CallGraphTracer(bdb.Bdb):
    """Debugger subclass that remembers every call it observes.

    The `trace` module from the standard library
    (https://docs.python.org/3/library/trace.html) looks like the natural base for this,
    but it is not extensible. The basic debugger (bdb) is, and it offers a bit more help
    than using `sys.settrace` directly.
    """

    # Set of (Call, Callable) pairs; a set, so repeated identical calls collapse into one.
    recorded_calls: set

    def __init__(self):
        self.recorded_calls = set()
        super().__init__()

    def user_call(self, frame, argument_list):
        """Record the call that created `frame`.

        `frame` belongs to the callee, so the call site is read from its parent frame.
        """
        record = (
            Call.from_frame(frame.f_back, self),
            Callable.from_frame(frame, self),
        )

        # _print(f'{record[0]} -> {record[1]}')
        self.recorded_calls.add(record)
104+
105+
106+
################################################################################
107+
# Export
108+
################################################################################
109+
110+
111+
class Exporter:
    """Common base for the output formats; subclasses provide `export`."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to `outfile_path`; must be overridden."""
        raise NotImplementedError()

    @staticmethod
    def dataclass_to_dict(obj):
        """Return the fields of dataclass instance `obj` as a flat dict.

        Every key is prefixed with the lower-cased class name, so the fields of a
        `Call` and a `Callable` can live in the same dict without clashing.
        """
        fields = dataclasses.asdict(obj)
        prefix = type(obj).__name__.lower()
        return {f"{prefix}_{name}": value for name, value in fields.items()}
122+
123+
124+
class CSVExporter(Exporter):
    """Writes the recorded calls as CSV, one row per (call, callable) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to the CSV file `outfile_path` and report the path."""
        with open(outfile_path, 'w', newline='') as out:
            rows = None
            for site, target in recorded_calls:
                row = {}
                row.update(Exporter.dataclass_to_dict(site))
                row.update(Exporter.dataclass_to_dict(target))

                # The column names come from the first row, so the header is only
                # written once there is at least one recorded call.
                if rows is None:
                    rows = csv.DictWriter(out, fieldnames=row.keys())
                    rows.writeheader()

                rows.writerow(row)

        print(f'output written to {outfile_path}')
145+
146+
# embed(); sys.exit()
147+
148+
149+
class XMLExporter(Exporter):
    """Writes the recorded calls as XML: a `root` element with one `recorded_call`
    child per (call, callable) pair, whose fields are stored as attributes.
    """

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to the XML file `outfile_path` (UTF-8 encoded)."""
        root = ET.Element('root')

        for site, target in recorded_calls:
            fields = {
                **Exporter.dataclass_to_dict(site),
                **Exporter.dataclass_to_dict(target),
            }
            # ElementTree can only serialize attribute values that are strings.
            attributes = {name: str(value) for name, value in fields.items()}
            ET.SubElement(root, 'recorded_call', attributes)

        ET.ElementTree(root).write(outfile_path, encoding='utf-8')
168+
169+
170+
################################################################################
171+
# __main__
172+
################################################################################
173+
174+
175+
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument('--csv', help='write the recorded calls to this CSV file')
    parser.add_argument('--xml', help='write the recorded calls to this XML file')

    parser.add_argument('progname', help='file to run as main program')
    parser.add_argument('arguments', nargs=argparse.REMAINDER,
                        help='arguments to the program')

    opts = parser.parse_args()

    # These details of setting up the program to be run are very much inspired by
    # `trace` from the standard library
    sys.argv = [opts.progname, *opts.arguments]
    sys.path[0] = os.path.dirname(opts.progname)

    # Read the source as bytes: `compile` then decodes it the way the interpreter
    # would (UTF-8 by default, or whatever the file's encoding declaration says)
    # instead of depending on the locale's default text encoding.
    with open(opts.progname, 'rb') as fp:
        code = compile(fp.read(), opts.progname, 'exec')

    # try to emulate __main__ namespace as much as possible
    globs = {
        '__file__': opts.progname,
        '__name__': '__main__',
        '__package__': None,
        '__cached__': None,
    }

    # Whatever the traced program prints is held back, so that it does not get mixed
    # up with our own output, and shown at the very end.
    real_stdout = sys.stdout
    captured_stdout = StringIO()

    cgt = CallGraphTracer()
    sys.stdout = captured_stdout
    try:
        cgt.run(code, globs, globs)
    finally:
        # Restore stdout even when the traced program raises, so that the traceback
        # and anything printed afterwards are not swallowed by the capture buffer.
        sys.stdout = real_stdout

    # If both --csv and --xml are given, only the CSV file is written.
    if opts.csv:
        CSVExporter.export(cgt.recorded_calls, opts.csv)
    elif opts.xml:
        XMLExporter.export(cgt.recorded_calls, opts.xml)
    else:
        for (call, callable) in cgt.recorded_calls:
            print(f'{call} -> {callable}')

    print('--- captured stdout ---')
    print(captured_stdout.getvalue(), end='')
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
def foo():  # traced callee; the example trace records it with callable_linenum="1"
2+
print('foo')
3+
4+
def bar():  # traced callee; the example trace records it with callable_linenum="4"
5+
print('bar')
6+
7+
foo()  # call recorded with call_linenum="7" in the example trace
8+
bar()  # call recorded with call_linenum="8" in the example trace
9+
10+
foo(); bar()  # two calls on one line (line 10); they differ only in call_inst_index
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<root>
2+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" callable_funcname="bar" callable_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callable_linenum="4" />
3+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" callable_funcname="foo" callable_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callable_linenum="1" />
4+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" callable_funcname="foo" callable_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callable_linenum="1" />
5+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" callable_funcname="bar" callable_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callable_linenum="4" />
6+
</root>
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import RecordedCalls
2+
3+
// Uniquely identified recorded calls for which some `CallableValue` both has the
// recorded function as its scope and has the recorded call site among its calls.
from ValidRecordedCall recorded, Call callSite, Function target, CallableValue targetValue
where
  callSite = recorded.getCall() and
  target = recorded.getCallable() and
  targetValue.getScope() = target and
  targetValue.getACall() = callSite.getAFlowNode()
select callSite, "-->", target
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import python
2+
3+
/**
 * A `recorded_call` XML element: one traced call, described by attributes for the
 * call site (`call_*`) and for the function that was invoked (`callable_*`).
 */
class RecordedCall extends XMLElement {
  RecordedCall() { this.getName() = "recorded_call" }

  /** Gets the value of the `call_filename` attribute. */
  string call_filename() { result = this.getAttributeValue("call_filename") }

  /** Gets the value of the `call_linenum` attribute, as an integer. */
  int call_linenum() { result = this.getAttributeValue("call_linenum").toInt() }

  /** Gets the value of the `call_inst_index` attribute, as an integer. */
  int call_inst_index() { result = this.getAttributeValue("call_inst_index").toInt() }

  /** Gets a call whose location starts on the recorded line of the recorded file. */
  Call getCall() {
    // TODO: handle calls spanning multiple lines
    exists(Location loc | loc = result.getLocation() |
      loc.hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
    )
  }

  /** Gets the value of the `callable_filename` attribute. */
  string callable_filename() { result = this.getAttributeValue("callable_filename") }

  /** Gets the value of the `callable_linenum` attribute, as an integer. */
  int callable_linenum() { result = this.getAttributeValue("callable_linenum").toInt() }

  /** Gets the value of the `callable_funcname` attribute. */
  string callable_funcname() { result = this.getAttributeValue("callable_funcname") }

  /** Gets a function whose location starts on the recorded line of the recorded file. */
  Function getCallable() {
    exists(Location loc | loc = result.getLocation() |
      loc.hasLocationInfo(this.callable_filename(), this.callable_linenum(), _, _, _)
    )
  }
}
29+
30+
/**
 * A recorded call for which exactly one `Call` and exactly one `Function` match, so
 * that both the `call` and the `callable` are uniquely identified.
 */
class ValidRecordedCall extends RecordedCall {
  ValidRecordedCall() {
    count(this.getCall()) = 1 and
    count(this.getCallable()) = 1
  }
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import RecordedCalls
2+
3+
// Lists every recorded call that is not a `ValidRecordedCall`, with its raw attributes.
from RecordedCall recorded
where not recorded instanceof ValidRecordedCall
select "Could not uniquely identify this recorded call (either call or callable was not uniquely identified)",
  recorded.call_filename(), recorded.call_linenum(), recorded.call_inst_index(), "-->",
  recorded.callable_filename(), recorded.callable_linenum(), recorded.callable_funcname()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
name: codeql-python-recorded-call-graph-metrics
2+
version: 0.0.1
3+
libraryPathDependencies: codeql-python
4+
extractor: python
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
#
# Trace the example program and (re)create the CodeQL database "cg-trace-example-db"
# from the example sources together with the recorded XML trace.

set -e
set -x

# All paths below are relative to this script, so run from its directory no matter
# where the script was invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"

DB="cg-trace-example-db"
SRC="example/"
XMLDIR="$SRC"
PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)

# Record the calls made by the example program; the trace is written where the XML
# indexing step below looks for it.
./cg_trace.py --xml "${XMLDIR}simple.xml" "${SRC}simple.py"

rm -rf "$DB"

codeql database init --source-root="$SRC" --language=python "$DB"
codeql database trace-command --working-dir="$SRC" "$DB" "$PYTHON_EXTRACTOR/tools/autobuild.sh"
codeql database index-files --language xml --include-extension .xml --working-dir="$XMLDIR" "$DB"
codeql database finalize "$DB"

set +x
echo "Created database '$DB'"

0 commit comments

Comments
 (0)