Skip to content

Commit 0bee068

Browse files
authored
Merge pull request github#3911 from RasmusWL/python-call-graph-tracing
Approved by tausbn
2 parents f8c03dc + f1601d6 commit 0bee068

File tree

9 files changed

+334
-0
lines changed

9 files changed

+334
-0
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Recorded Call Graph Metrics
2+
3+
also known as _call graph tracing_.
4+
5+
Execute a Python program and, for each call being made, record the call and the callee. This allows us to compare call graph resolution from static analysis with actual data -- that is, can we statically determine the target of each actual call correctly?
6+
7+
This is still in the early stages, and currently only supports a very minimal working example (to show that this approach might work).
8+
9+
The next hurdle is being able to handle multiple calls on the same line, such as
10+
11+
- `foo(); bar()`
12+
- `foo(bar())`
13+
- `foo().bar()`
14+
15+
## How do I give it a spin?
16+
17+
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
#!/usr/bin/env python3
2+
3+
"""Call Graph tracing.
4+
5+
Execute a python program and for each call being made, record the call and callee. This
6+
allows us to compare call graph resolution from static analysis with actual data -- that
7+
is, can we statically determine the target of each actual call correctly.
8+
9+
If there is 100% code coverage from the Python execution, it would also be possible to
10+
look at the precision of the call graph resolutions -- that is, do we expect a function to
11+
be able to be called in a place where it is not? Currently not something we're looking at.
12+
"""
13+
14+
# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
15+
16+
# TODO: Know that a call to a C-function was made. See
17+
# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
18+
# test
19+
20+
# For inspiration, look at these projects:
21+
# - https://github.com/joerick/pyinstrument (capture call-stack every <n> ms for profiling)
22+
# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
23+
24+
import argparse
25+
import bdb
26+
from io import StringIO
27+
import sys
28+
import os
29+
import dis
30+
import dataclasses
31+
import csv
32+
import xml.etree.ElementTree as ET
33+
34+
# Copy-Paste and uncomment for interactive ipython sessions
35+
# import IPython; IPython.embed(); sys.exit()
36+
37+
38+
@dataclasses.dataclass(frozen=True)
class Call:
    """A call site, identified by file, line number and bytecode instruction index.

    Instances are frozen (and therefore hashable), so they can be stored in sets.
    """
    filename: str
    linenum: int
    inst_index: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Call` from the frame that is performing the call.

        `frame` is the caller's frame; `debugger` is only used to canonicalize the
        filename of the code object.
        """
        # Debugging aid -- enable to dump the bytecode around the call:
        # b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
        # print(b.dis(), file=sys.__stderr__)
        location = {
            "filename": debugger.canonic(frame.f_code.co_filename),
            "linenum": frame.f_lineno,
            "inst_index": frame.f_lasti,
        }
        return cls(**location)
59+
60+
61+
@dataclasses.dataclass(frozen=True)
class Callee():
    """A callee (Function/Lambda/???)

    should (hopefully) be uniquely identified by its name and location (filename+line
    number)
    """
    funcname: str
    filename: str
    linenum: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Callee` from the frame of the code object being entered.

        `frame` is the callee's own frame; `debugger` is only used to canonicalize
        the filename of the code object.
        """
        code = frame.f_code
        return cls(
            funcname = code.co_name,
            filename = debugger.canonic(code.co_filename),
            # Use the first line of the code object rather than `frame.f_lineno`:
            # the two agree when a function is freshly called, but when a generator
            # or coroutine is resumed the frame's current line is the suspension
            # point, which would give the same function several different
            # identities.
            linenum = code.co_firstlineno,
        )
80+
81+
82+
class CallGraphTracer(bdb.Bdb):
    """Debugger subclass that collects every (call, callee) pair it observes.

    The `trace` module from the standard library
    (https://docs.python.org/3/library/trace.html) looks like the natural base for
    this, but it is not extensible. The basic debugger (bdb) is, and it gives a bit
    more help than using `sys.settrace` directly.
    """

    # Set of `(Call, Callee)` tuples, filled in by `user_call`.
    recorded_calls: set

    def __init__(self):
        super().__init__()
        self.recorded_calls = set()

    def user_call(self, frame, argument_list):
        """Record the call that caused `frame` to be entered.

        The call site is described by the parent frame (`frame.f_back`), the callee
        by `frame` itself. `argument_list` is not used.
        """
        caller_frame = frame.f_back
        edge = (Call.from_frame(caller_frame, self), Callee.from_frame(frame, self))

        # _print(f'{edge[0]} -> {edge[1]}')
        self.recorded_calls.add(edge)
103+
104+
105+
################################################################################
106+
# Export
107+
################################################################################
108+
109+
110+
class Exporter:
    """Base class for exporters of recorded calls."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to `outfile_path`; subclasses must override this."""
        raise NotImplementedError()

    @staticmethod
    def dataclass_to_dict(obj):
        """Return the fields of dataclass instance `obj` as a flat dict.

        Every key is prefixed with the lower-cased class name and an underscore, so
        the dicts of different dataclasses can be merged without key clashes.
        """
        tag = type(obj).__name__.lower()
        fields = dataclasses.asdict(obj)
        return {f"{tag}_{name}": value for name, value in fields.items()}
121+
122+
123+
class CSVExporter(Exporter):
    """Exporter that writes one CSV row per recorded (call, callee) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` as CSV to `outfile_path`.

        The header row is derived from the first pair, so no header is written when
        `recorded_calls` is empty. Prints a confirmation message once the file has
        been written.
        """
        # The tracer collects the pairs in a set, whose iteration order can differ
        # from run to run. Sort on the field values so that the same trace always
        # produces the same file.
        ordered_calls = sorted(
            recorded_calls,
            key=lambda pair: (dataclasses.astuple(pair[0]), dataclasses.astuple(pair[1])),
        )

        with open(outfile_path, 'w', newline='') as csv_file:
            writer = None
            for (call, callee) in ordered_calls:
                data = {
                    **Exporter.dataclass_to_dict(call),
                    **Exporter.dataclass_to_dict(callee)
                }

                # The column names are only known once the first row has been built
                if writer is None:
                    writer = csv.DictWriter(csv_file, fieldnames=data.keys())
                    writer.writeheader()

                writer.writerow(data)

        print(f'output written to {outfile_path}')
145+
146+
147+
class XMLExporter(Exporter):
    """Exporter that writes one `<recorded_call>` XML element per recorded pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` as UTF-8 encoded XML to `outfile_path`.

        The document consists of a `<root>` element with one `<recorded_call>` child
        per (call, callee) pair; all fields of the pair are stored as attributes.
        """
        # The tracer collects the pairs in a set, whose iteration order can differ
        # from run to run. Sort on the field values so that the same trace always
        # produces the same file.
        ordered_calls = sorted(
            recorded_calls,
            key=lambda pair: (dataclasses.astuple(pair[0]), dataclasses.astuple(pair[1])),
        )

        root = ET.Element('root')

        for (call, callee) in ordered_calls:
            data = {
                **Exporter.dataclass_to_dict(call),
                **Exporter.dataclass_to_dict(callee)
            }

            rc = ET.SubElement(root, 'recorded_call')
            # this xml library only supports serializing attributes that have string values
            rc.attrib = {k: str(v) for k, v in data.items()}

        tree = ET.ElementTree(root)
        tree.write(outfile_path, encoding='utf-8')
166+
167+
168+
################################################################################
169+
# __main__
170+
################################################################################
171+
172+
173+
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument('--csv', help='write the recorded calls to this CSV file')
    parser.add_argument('--xml', help='write the recorded calls to this XML file')

    parser.add_argument('progname', help='file to run as main program')
    parser.add_argument('arguments', nargs=argparse.REMAINDER,
                        help='arguments to the program')

    opts = parser.parse_args()

    # The details of setting up the program to be run are very much inspired by
    # `trace` from the standard library
    sys.argv = [opts.progname, *opts.arguments]
    sys.path[0] = os.path.dirname(opts.progname)

    # Read the source as bytes, so `compile` decodes it the way the interpreter
    # would (encoding declaration, UTF-8 by default) instead of using the locale's
    # preferred encoding.
    with open(opts.progname, 'rb') as fp:
        code = compile(fp.read(), opts.progname, 'exec')

    # try to emulate __main__ namespace as much as possible
    globs = {
        '__file__': opts.progname,
        '__name__': '__main__',
        '__package__': None,
        '__cached__': None,
    }

    real_stdout = sys.stdout
    captured_stdout = StringIO()

    cgt = CallGraphTracer()

    # Capture what the traced program prints, so it does not get mixed up with our
    # own output. Always restore the real stdout, also when the program raises.
    sys.stdout = captured_stdout
    try:
        cgt.run(code, globs, globs)
    finally:
        sys.stdout = real_stdout

    # `--csv` takes precedence when both `--csv` and `--xml` are given
    if opts.csv:
        CSVExporter.export(cgt.recorded_calls, opts.csv)
    elif opts.xml:
        XMLExporter.export(cgt.recorded_calls, opts.xml)
    else:
        for (call, callee) in cgt.recorded_calls:
            print(f'{call} -> {callee}')

    print('--- captured stdout ---')
    print(captured_stdout.getvalue(), end='')
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
def foo():  # NOTE: example/simple.xml records line numbers of this file -- keep the line layout unchanged
    print('foo')

def bar():
    print('bar')

foo()  # one call per line
bar()

foo(); bar()  # two calls on the same line
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<root>
2+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
3+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
4+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
5+
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
6+
</root>
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import RecordedCalls

// Selects the uniquely identified recorded calls for which some `CallableValue`
// has the recorded callee as its scope and has a call at a flow node of the
// recorded call.
from ValidRecordedCall rc, Call call, Function callee
where
  call = rc.getCall() and
  callee = rc.getCallee() and
  exists(CallableValue calleeValue |
    calleeValue.getScope() = callee and
    calleeValue.getACall() = call.getAFlowNode()
  )
select call, "-->", callee
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import python
2+
3+
/**
 * An XML element named `recorded_call`, describing one call that was recorded
 * while executing a program.
 */
class RecordedCall extends XMLElement {
  RecordedCall() { this.getName() = "recorded_call" }

  /** Gets the value of the `call_filename` attribute. */
  string call_filename() { result = this.getAttributeValue("call_filename") }

  /** Gets the value of the `call_linenum` attribute, as an integer. */
  int call_linenum() { result = this.getAttributeValue("call_linenum").toInt() }

  /** Gets the value of the `call_inst_index` attribute, as an integer. */
  int call_inst_index() { result = this.getAttributeValue("call_inst_index").toInt() }

  /** Gets a call that starts on the recorded line of the recorded file. */
  Call getCall() {
    // TODO: handle calls spanning multiple lines
    exists(Location loc | loc = result.getLocation() |
      loc.hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
    )
  }

  /** Gets the value of the `callee_filename` attribute. */
  string callee_filename() { result = this.getAttributeValue("callee_filename") }

  /** Gets the value of the `callee_linenum` attribute, as an integer. */
  int callee_linenum() { result = this.getAttributeValue("callee_linenum").toInt() }

  /** Gets the value of the `callee_funcname` attribute. */
  string callee_funcname() { result = this.getAttributeValue("callee_funcname") }

  /** Gets a function that starts on the recorded line of the recorded file. */
  Function getCallee() {
    exists(Location loc | loc = result.getLocation() |
      loc.hasLocationInfo(this.callee_filename(), this.callee_linenum(), _, _, _)
    )
  }
}
27+
28+
/**
 * A recorded call for which exactly one `call` and exactly one `callee` match,
 * so that both are uniquely identified.
 */
class ValidRecordedCall extends RecordedCall {
  ValidRecordedCall() {
    count(this.getCall()) = 1 and
    count(this.getCallee()) = 1
  }
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import RecordedCalls

// Lists the raw attributes of every recorded call that is not a `ValidRecordedCall`.
from RecordedCall rc
where not exists(ValidRecordedCall valid | valid = rc)
select "Could not uniquely identify this recorded call (either call or callee was not uniquely identified)",
  rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callee_filename(),
  rc.callee_linenum(), rc.callee_funcname()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# QL pack definition for the recorded call graph metrics queries.
name: codeql-python-recorded-call-graph-metrics
version: 0.0.1
# The queries in this pack import the `python` library (see RecordedCalls.qll).
libraryPathDependencies: codeql-python
extractor: python
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
#
# Recreate the example CodeQL database: trace the example program, then build a
# database that contains both the Python sources and the recorded XML trace.

set -e
set -x

# All paths below are relative, so run from the directory that contains this
# script (assumes `cg_trace.py` and `example/` live next to it).
cd "$(dirname "$0")"

DB="cg-trace-example-db"
SRC="example/"
XMLDIR="$SRC"
PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)

# Record the calls made by the example program
./cg_trace.py --xml "${XMLDIR%/}/simple.xml" "${SRC%/}/simple.py"

rm -rf "$DB"

codeql database init --source-root="$SRC" --language=python "$DB"
codeql database trace-command --working-dir="$SRC" "$DB" "$PYTHON_EXTRACTOR/tools/autobuild.sh"
# Add the recorded trace to the database, so the queries can read it as XML
codeql database index-files --language xml --include-extension .xml --working-dir="$XMLDIR" "$DB"
codeql database finalize "$DB"

set +x
echo "Created database '$DB'"

0 commit comments

Comments
 (0)