Skip to content

Commit 7d8b624

Browse files
committed
Basic script to generate shared code metrics
1 parent f4cb920 commit 7d8b624

File tree

1 file changed

+329
-0
lines changed

1 file changed

+329
-0
lines changed

misc/scripts/shared-code-metrics.py

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
# Generates a report on the amount of code sharing in this repo
2+
#
3+
# The purpose of this is
4+
# a) To be able to understand the structure and dependencies
5+
# b) To provide a metric that measures the amount of shared vs non-shared code
6+
7+
import datetime
8+
from pathlib import Path
9+
import json
10+
import yaml
11+
12+
# To add more languages, add them to this list:
languages = ['cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ql', 'ruby', 'swift']

# Root of the repository: three levels above this script.
# The path is resolved to an absolute one first. With a relative `__file__`
# (e.g. when the script is started from its own directory) the chain of
# `.parent` calls would otherwise collapse to `.`, and `get_language`, which
# indexes path parts by the depth of `repo_location`, would pick the wrong
# path component.
repo_location = Path(__file__).resolve().parent.parent.parent
16+
17+
# Gets the total number of lines in a file
18+
def linecount(file):
    """Return the number of lines in the text file at `file`.

    The file is decoded as UTF-8 with undecodable bytes replaced, so the
    count does not depend on the locale's default encoding and a stray
    invalid byte cannot abort the whole report. Lines are counted while
    streaming instead of being loaded into a list first.
    """
    with open(file, 'r', encoding='utf-8', errors='replace') as fp:
        return sum(1 for _ in fp)
20+
21+
# Gets the language name from the path
22+
def get_language(path):
    """Return the path component of `path` directly below `repo_location`.

    The component is selected purely by position: it is the part of `path`
    at the index equal to the number of parts in `repo_location`.
    """
    depth = len(repo_location.parts)
    return path.parts[depth]
24+
25+
# Is this path a CodeQL query file
26+
def is_query(path):
    """Return True when `path` has the `.ql` suffix of a CodeQL query file."""
    extension = path.suffix
    return extension == '.ql'
28+
29+
# Is this path a CodeQL library file
30+
def is_library(path):
    """Return True when `path` has the `.qll` suffix of a CodeQL library file."""
    extension = path.suffix
    return extension == '.qll'
32+
33+
# Is this path a relevant CodeQL file
34+
def is_ql(path):
    """Return True when `path` is a CodeQL query (`.ql`) or library (`.qll`) file."""
    return path.suffix in ('.ql', '.qll')
36+
37+
# Is this file a CodeQL package file
38+
def is_package(path):
    """Return True when the file name of `path` is exactly `qlpack.yml`."""
    filename = path.name
    return filename == 'qlpack.yml'
40+
41+
# A CodeQL source file
42+
class QlFile:
    """A CodeQL source file together with its line count.

    Attributes:
        path: location of the file.
        lines: number of lines, as returned by `linecount`.
        shared: True once the file has been marked as shared
            (`IdenticalFiles` / `IdenticalFileSet` set this).
        package: the asset that owns the file (`Package` and
            `IdenticalFileSet` assign this), or None when unowned.
    """

    # Defaults for the attributes that other classes assign later.
    # Previously `shared` was only a dead local variable in `__init__` and
    # `package` was never initialised, so reading either attribute on a file
    # that no asset had touched raised AttributeError.
    shared = False
    package = None

    def __init__(self, path):
        self.path = path
        self.lines = linecount(path)

    def language(self):
        """Return the language folder this file lives in."""
        return get_language(self.path)

    def query(self):
        """Return True if this is a `.ql` query file."""
        return is_query(self.path)

    def library(self):
        """Return True if this is a `.qll` library file."""
        return is_library(self.path)

    def isOnlyInLanguage(self, language):
        """Return True if this file is specific to `language`.

        That requires all of the following: the file is not shared, it is
        either in no package or in a package whose only language is
        `language`, and the file itself is located in `language`.
        """
        if self.shared:
            return False
        if self.package is not None and self.package.languages != {language}:
            return False
        return self.language() == language
60+
61+
# Represents a language folder
62+
class Language:
    """Running totals for one language folder.

    Tracks the number of files and lines that are not shared, and the
    number of files and lines brought in through shared assets.
    """

    def __init__(self, name):
        self.name = name
        self.packs = []
        # Totals accumulated by addQlFile.
        self.nonshared_files = 0
        self.nonshared_lines = 0
        # Totals accumulated by addSharedAsset.
        self.imported_files = 0
        self.imported_lines = 0

    def addQlFile(self, qlfile):
        """Count `qlfile` towards the non-shared totals unless it is shared."""
        if qlfile.shared:
            return
        self.nonshared_lines += qlfile.lines
        self.nonshared_files += 1

    def addSharedAsset(self, package):
        """Add the file and line counts of `package` to the imported totals."""
        self.imported_lines += package.lines
        self.imported_files += package.files
79+
80+
# A shared package or file
81+
class SharedAsset:
    """Base class for anything that can be shared: a package or a shared file."""

    def __init__(self, name):
        # Display name of the asset.
        self.name = name
84+
85+
# A file shared using identical-files.json
86+
class IdenticalFileSet(SharedAsset):
    """One group of files kept in sync through `identical-files.json`.

    Every member file is marked as shared and pointed back at this set.
    The set records the languages of all its members.
    """

    def __init__(self, name, ql_files):
        super().__init__(name)
        self.languages = set()
        self.files = 0
        self.lines = 0
        for member in ql_files:
            member.package = self
            member.shared = True
            # Each member overwrites the totals rather than adding to them,
            # so a non-empty set counts as one file with the line count of
            # the last member.
            self.files, self.lines = 1, member.lines
            self.languages.add(member.language())

    def link(self):
        """Return the text used for this asset in the markdown tables."""
        return self.name
102+
103+
# Represents all files shared in `identical-files.json`
104+
# Reads the file and builds a list of assets
105+
class IdenticalFiles:
    """All file groups declared in `config/identical-files.json`.

    Loads the JSON file and creates one `IdenticalFileSet` per group in
    `self.assets`. Only `.ql`/`.qll` members are kept, and each kept member
    is marked as shared in `ql_file_index`.
    """

    def __init__(self, repo_location, ql_file_index):
        config_path = repo_location / 'config' / 'identical-files.json'
        with open(config_path, "r") as handle:
            groups = json.load(handle)

        self.assets = []
        for group_name, relative_paths in groups.items():
            members = []
            for relative in relative_paths:
                candidate = repo_location / relative
                if not is_ql(candidate):
                    continue
                entry = ql_file_index[candidate]
                entry.shared = True
                members.append(entry)
            self.assets.append(IdenticalFileSet(group_name, members))
120+
121+
# A package created from a `qlpack.yml` file
122+
class Package(SharedAsset):
    """A CodeQL package described by a `qlpack.yml` file.

    Attributes set by the constructor:
        path: path of the `qlpack.yml` file.
        language: language folder containing the package.
        name: `name` from the YAML, or the package directory name.
        deps: the `dependencies` entry from the YAML (empty if absent).
        files / lines: totals over the non-shared CodeQL files located
            under the package directory.
        languages: languages using this package (initially its own).
        identical_files_dependencies: owners of the shared files located
            under the package directory.
        url: link to the `qlpack.yml` on GitHub.
    """

    def __init__(self, path, ql_file_index):
        self.path = path
        self.language = get_language(path)
        self.lines = 0
        self.files = 0
        self.languages = {self.language}
        self.identical_files_dependencies = set()
        with open(path, 'r') as fp:
            # An empty qlpack.yml is loaded as None; treat it as having no
            # keys instead of failing on the membership tests below.
            y = yaml.safe_load(fp) or {}
        self.name = y['name'] if 'name' in y else path.parent.name
        # A `dependencies:` key with no entries is loaded as None.
        self.deps = y.get('dependencies') or {}
        # Mark all relevant files with their package
        for file_path, qlfile in ql_file_index.items():
            if not self.containsDirectory(file_path):
                continue
            if qlfile.shared:
                # Shared files are owned by their shared asset, which then
                # becomes a dependency of this package.
                self.identical_files_dependencies.add(qlfile.package)
            else:
                qlfile.package = self
                self.lines += qlfile.lines
                self.files += 1
        # as_posix() keeps forward slashes in the URL on every platform.
        self.url = ("https://github.com/github/codeql/blob/main/"
                    + path.relative_to(repo_location).as_posix())

    def link(self):
        """Return a markdown link from the package name to its `qlpack.yml`."""
        return '[' + self.name + '](' + self.url + ')'

    def containsDirectory(self, dir):
        """Return True if `dir` is the package directory or lies beneath it."""
        root_parts = self.path.parent.parts
        return root_parts == dir.parts[:len(root_parts)]

    def calculateDependencies(self, packageNameMap):
        """Compute transitive dependencies and the amount of imported code.

        Sets `transitive_dependencies` (package names), `all_dependencies`
        (package objects plus the shared-file owners),
        `total_imported_files` and `total_imported_lines`. As a side
        effect, this package's language is added to the `languages` of
        every dependency.

        `packageNameMap` maps package names to `Package` objects. Every
        dependency name is looked up in it with `[]`, so a name that is
        missing from a plain dict raises KeyError.
        """
        # Constructs a list of transitive dependencies of this package.
        self.transitive_dependencies = set(self.deps)
        queue = list(self.deps)
        while queue:
            item = queue.pop()
            for dep2 in packageNameMap[item].deps:
                if dep2 not in self.transitive_dependencies:
                    self.transitive_dependencies.add(dep2)
                    queue.append(dep2)
        # Calculate the amount of imported code
        self.total_imported_files = 0
        self.total_imported_lines = 0
        self.all_dependencies = set(self.identical_files_dependencies)
        for dep in self.transitive_dependencies:
            self.all_dependencies.add(packageNameMap[dep])
        for dep in self.all_dependencies:
            self.total_imported_files += dep.files
            self.total_imported_lines += dep.lines
            dep.languages.add(self.language)
183+
184+
# Create a big index of all files and their line counts.
185+
186+
# Map from the path of every `.ql`/`.qll` file to its QlFile
ql_file_index = {}
# Paths of every `qlpack.yml` found while scanning
package_files = []

# Queue of directories to read
directories_to_scan = [repo_location]

# Walk the tree with an explicit stack. The loop variables avoid the names
# of builtins such as `dir`, which would otherwise stay shadowed at module
# level after the loop finishes.
while directories_to_scan:
    current_dir = directories_to_scan.pop()
    for entry in current_dir.iterdir():
        if entry.is_dir():
            directories_to_scan.append(entry)
        elif is_ql(entry):
            ql_file_index[entry] = QlFile(entry)
        elif is_package(entry):
            package_files.append(entry)
202+
203+
# Load the groups declared in identical-files.json. This marks the member
# files in ql_file_index as shared.
identical_files = IdenticalFiles(repo_location, ql_file_index)

# Build one Package per qlpack.yml.
# This has to happen after the identical files are known, because a package
# only counts the files that are not shared, and after ql_file_index is
# complete.
packages = [Package(pack_path, ql_file_index) for pack_path in package_files]

# Everything that can be shared: packages first, then identical-file sets
shared_assets = packages + identical_files.assets
215+
216+
# Per-language statistics, keyed by language name
language_info = {name: Language(name) for name in languages}

# Credit each file to its language when the file is specific to it.
# Files in folders that are not listed in `languages` are ignored.
for qlfile in ql_file_index.values():
    lang = qlfile.language()
    info = language_info.get(lang)
    if info is not None and qlfile.isOnlyInLanguage(lang):
        info.addQlFile(qlfile)
227+
228+
# Resolve the dependencies of every package

# Lookup table from package name to package
packageNameMap = {pack.name: pack for pack in packages}

# calculateDependencies also records, on each dependency, the language of
# the package that uses it, so it must run for all packages before the
# `languages` sets are read below.
for package in packages:
    package.calculateDependencies(packageNameMap)

# An asset used by more than one language is imported code for each of them
for asset in shared_assets:
    if len(asset.languages) <= 1:
        continue
    for lang in asset.languages:
        if lang in language_info:
            language_info[lang].addSharedAsset(asset)
243+
244+
245+
# Functions to output the results
246+
247+
def list_assets(shared_assets, language_info):
    """Print a markdown table of the shared assets.

    There is one row per asset with its link, file count and line count,
    followed by one column per key of `language_info` that reads `yes`
    when that language is in the asset's `languages`. A blank line follows
    the table.
    """
    header = '| Asset | Files | Lines |'
    rule = '| ----- | ----- | ----- |'
    for lang in language_info:
        header += ' ' + str(lang) + ' |'
        rule += ' ---- |'
    print(header)
    print(rule)

    for asset in shared_assets:
        row = f'| {asset.link()} | {asset.files} | {asset.lines} | '
        for lang in language_info:
            marker = 'yes |' if lang in asset.languages else ' |'
            row += marker + ' '
        print(row)
    print()
265+
266+
def list_package_dependencies(package):
    """Print a plain-text summary of `package` and each of its dependencies.

    The first line holds the package path, name, own file and line counts
    and imported file and line counts. Each following line is indented and
    holds the name, file count and line count of one entry of
    `package.all_dependencies`.
    """
    summary = (
        f"Package {package.path} {package.name} {package.files} {package.lines} "
        f"{package.total_imported_files} {package.total_imported_lines}"
    )
    print(summary)
    for dep in package.all_dependencies:
        print(f"  {dep.name} {dep.files} {dep.lines}")
270+
271+
def print_package_dependencies(packages):
    """Print a markdown table with the sharing statistics of each package.

    Each row shows the package link, its own file and line counts, the
    imported file and line counts, and the imported lines as a percentage
    of own plus imported lines (0 when there are no lines at all). A blank
    line follows the table.
    """
    print('| Package name | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| ------------ | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for package in packages:
        imported = package.total_imported_lines
        nlines = package.lines + imported
        if nlines > 0:
            shared_percentage = 100 * imported / nlines
        else:
            shared_percentage = 0
        print(
            f'| {package.link()} | {package.files} | {package.lines} '
            f'| {package.total_imported_files} | {imported} '
            f'| {shared_percentage:.2f} |'
        )
    print()
281+
282+
def print_language_dependencies(packages):
    """Print the package table restricted to the language packs.

    A package is selected when its name ends in `-all` and contains exactly
    one `-`.
    """
    def is_language_pack(name):
        return name.endswith('-all') and name.count('-') == 1

    selected = [pkg for pkg in packages if is_language_pack(pkg.name)]
    print_package_dependencies(selected)
284+
285+
def list_shared_code_by_language(language_info):
    """Print a markdown table with the sharing statistics of each language.

    For each entry of `language_info` the row shows the non-shared file and
    line counts, the imported file and line counts, and the imported lines
    as a percentage of imported plus non-shared lines (0 when there are no
    lines at all). A blank line follows the table.
    """
    print('| Language | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| -------- | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for lang, info in language_info.items():
        total = info.imported_lines + info.nonshared_lines
        if total > 0:
            shared_percentage = 100 * info.imported_lines / total
        else:
            shared_percentage = 0
        print(
            f'| {lang} | {info.nonshared_files} | {info.nonshared_lines} '
            f'| {info.imported_files} | {info.imported_lines} '
            f'| {shared_percentage:.2f} |'
        )
    print()
296+
297+
298+
# Output reports
299+
300+
# Title and timestamp
print('# Report on CodeQL code sharing\n')
generated_at = datetime.datetime.now()
print('Generated on', generated_at)
print()

# Section 1: totals per language folder
print('## Shared code by language\n')
list_shared_code_by_language(language_info)

# Explanation of the columns above, followed by the heading and
# introduction of the next section.
column_legend = '''
* *Non-shared files*: The number of CodeQL files (`.ql`/`.qll`) that are only used within this language folder. Excludes `identical-files.json` that are shared between multiple languages.
* *Non-shared lines of code*: The number of lines of code in the non-shared files.
* *Imported files*: All CodeQL files (`.ql`/`.qll`) files that are transitively used in this language folder, either via packages or `identical-files.json`
* *Imported lines of code*: The number of lines of code in the imported files
* *Shared code %*: The proportion of imported lines / total lines (nonshared + imported).

## Shared packages use by language

A package is *used* if it is a direct or indirect dependency, or a file shared via `identical-files.json`.

'''
print(column_legend)

# Section 2: which languages use which shared asset
list_assets(shared_assets, language_info)

# Section 3: the language packs only
print('## Shared code by language pack\n')
print_language_dependencies(packages)

# Section 4: every package
print('## Shared code by package\n')
print_package_dependencies(packages)

0 commit comments

Comments
 (0)