Skip to content

Commit 779c236

Browse files
authored
Merge pull request github#12091 from github/cg/shared-code-metrics
Script to generate shared code metrics
2 parents e19e28f + 2cfd6c5 commit 779c236

File tree

1 file changed

+330
-0
lines changed

1 file changed

+330
-0
lines changed

misc/scripts/shared-code-metrics.py

Lines changed: 330 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
#!/usr/bin/env python3
2+
# Generates a report on the amount of code sharing in this repo
3+
#
4+
# The purpose of this is
5+
# a) To be able to understand the structure and dependencies
6+
# b) To provide a metric that measures the amount of shared vs non-shared code
7+
8+
import datetime
9+
from pathlib import Path
10+
import json
11+
import yaml
12+
13+
# To add more languages, add them to this list:
14+
languages = ['cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ql', 'ruby', 'swift']
15+
16+
repo_location = Path(__file__).parent.parent.parent
17+
18+
# Gets the total number of lines in a file
19+
def linecount(file):
    """Return the number of lines in the text file at *file*.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, and undecodable bytes are replaced rather than raising, so a
    single odd byte in a source file cannot abort the whole report.
    """
    with open(file, 'r', encoding='utf-8', errors='replace') as fp:
        # Stream the file instead of materialising every line in a list.
        return sum(1 for _ in fp)
21+
22+
# Gets the language name from the path
23+
def get_language(path):
    """Return the path component of *path* directly below `repo_location`.

    The component is selected purely by position: it is the element of
    `path.parts` at the index equal to the number of parts in
    `repo_location`.
    """
    depth = len(repo_location.parts)
    return path.parts[depth]
25+
26+
# Is this path a CodeQL query file
27+
def is_query(path):
    """Return True when *path* has the `.ql` suffix."""
    suffix = path.suffix
    return suffix == '.ql'
29+
30+
# Is this path a CodeQL library file
31+
def is_library(path):
    """Return True when *path* has the `.qll` suffix."""
    suffix = path.suffix
    return suffix == '.qll'
33+
34+
# Is this path a relevant CodeQL file
35+
def is_ql(path):
    """Return True when *path* is either a query or a library file."""
    if is_query(path):
        return True
    return is_library(path)
37+
38+
# Is this file a CodeQL package file
39+
def is_package(path):
    """Return True when the final component of *path* is exactly `qlpack.yml`."""
    filename = path.name
    return filename == 'qlpack.yml'
41+
42+
# A CodeQL source file
43+
class QlFile:
    """A single CodeQL source file together with its line count.

    Attributes:
        path: location of the file.
        lines: number of lines, measured once at construction via `linecount`.
        shared: False until the file is claimed by an identical-files group
            (`IdenticalFiles` / `IdenticalFileSet` set it to True).
        package: the asset that owns the file, assigned by `Package` or
            `IdenticalFileSet`; None while the file is unowned.
    """

    def __init__(self, path):
        self.path = path
        self.lines = linecount(path)
        # Both flags are read by other classes before anything is guaranteed
        # to have assigned them, so they must exist on every instance.
        self.shared = False
        self.package = None

    def language(self):
        """Return the language folder this file lives in (see `get_language`)."""
        return get_language(self.path)

    def query(self):
        """Return True when this file is a query (see `is_query`)."""
        return is_query(self.path)

    def library(self):
        """Return True when this file is a library (see `is_library`)."""
        return is_library(self.path)

    def isOnlyInLanguage(self, language):
        """Return True when this file belongs exclusively to *language*.

        That requires all of: the file is not shared, its owning package (if
        any) lists *language* as its only language, and the file itself lives
        in the *language* folder.
        """
        if self.shared:
            return False
        if self.package is not None and self.package.languages != {language}:
            return False
        return self.language() == language
61+
62+
# Represents a language folder
63+
class Language:
    """Running totals of non-shared and imported code for one language folder."""

    def __init__(self, name):
        self.name = name
        self.packs = []
        # Totals for files counted through addQlFile().
        self.nonshared_files = self.nonshared_lines = 0
        # Totals for assets counted through addSharedAsset().
        self.imported_files = self.imported_lines = 0

    def addQlFile(self, qlfile):
        """Add *qlfile* to the non-shared totals; shared files are ignored."""
        if qlfile.shared:
            return
        self.nonshared_files += 1
        self.nonshared_lines += qlfile.lines

    def addSharedAsset(self, package):
        """Add the file and line counts of *package* to the imported totals."""
        self.imported_files += package.files
        self.imported_lines += package.lines
80+
81+
# A shared package or file
82+
class SharedAsset:
    """Base class for anything that can be shared: it only records a name."""

    def __init__(self, name):
        # The name under which the asset is reported.
        self.name = name
85+
86+
# A file shared using identical-files.json
87+
class IdenticalFileSet(SharedAsset):
    """One group of files kept identical through `identical-files.json`.

    Construction marks every member file as shared, makes this set its
    owning package, and collects the language of each member.
    """

    def __init__(self, name, ql_files):
        super().__init__(name)
        self.languages = set()
        self.files = 0
        self.lines = 0
        for member in ql_files:
            member.package = self
            member.shared = True
            # The group counts as a single file whose size is that of the
            # last member seen (presumably because the members are copies of
            # one another); an empty group keeps both totals at zero.
            self.files, self.lines = 1, member.lines
            self.languages.add(member.language())

    def link(self):
        """Return the text used for this asset in the markdown tables (the bare name)."""
        return self.name
103+
104+
# Represents all files shared in `identical-files.json`
105+
# Reads the file and builds a list of assets
106+
class IdenticalFiles:
    """All file groups declared in `config/identical-files.json`.

    `assets` holds one `IdenticalFileSet` per group, built from the group's
    members that are CodeQL files. Every listed CodeQL file must already be
    present in *ql_file_index*; a missing entry raises KeyError.
    """

    def __init__(self, repo_location, ql_file_index):
        config_path = repo_location / 'config' / 'identical-files.json'
        with open(config_path, "r") as fp:
            groups = json.load(fp)

        self.assets = []
        for group_name, members in groups.items():
            ql_members = []
            for relative in members:
                candidate = repo_location / relative
                if not is_ql(candidate):
                    # Non-CodeQL members of a group are not counted.
                    continue
                entry = ql_file_index[candidate]
                entry.shared = True
                ql_members.append(entry)
            self.assets.append(IdenticalFileSet(group_name, ql_members))
121+
122+
# A package created from a `qlpack.yml` file
123+
class Package(SharedAsset):
    """A CodeQL pack described by a `qlpack.yml` file.

    Construction reads the pack's name and direct dependencies from the YAML
    file and then claims every indexed file that lives under the pack's
    directory:

    * files that are not shared are assigned to this pack and added to
      `files` / `lines`;
    * files already marked as shared are left alone, and the asset that owns
      them is recorded in `identical_files_dependencies`.

    `calculateDependencies` must be called afterwards (once every pack has
    been constructed) to populate the transitive-dependency attributes.
    """

    def __init__(self, path, ql_file_index):
        self.path = path
        self.language = get_language(path)
        self.lines = 0
        self.files = 0
        self.languages = {self.language}
        self.identical_files_dependencies = set()

        with open(path, 'r') as fp:
            # safe_load returns None for an empty document; treat that as a
            # pack with no name and no dependencies rather than crashing.
            y = yaml.safe_load(fp) or {}

        # Fall back to the directory name when the pack does not name itself.
        self.name = y['name'] if 'name' in y else path.parent.name

        # `dependencies:` may be absent, or present with no value (None).
        deps = y['dependencies'] if 'dependencies' in y else None
        self.deps = {} if deps is None else deps

        # Mark all relevant files with their package
        for file_path, qlfile in ql_file_index.items():
            if not self.containsDirectory(file_path):
                continue
            if not qlfile.shared:
                qlfile.package = self
                self.lines += qlfile.lines
                self.files += 1
            else:
                self.identical_files_dependencies.add(qlfile.package)

        # as_posix() keeps forward slashes in the URL on every platform.
        self.url = ("https://github.com/github/codeql/blob/main/"
                    + path.relative_to(repo_location).as_posix())

    def link(self):
        """Return a markdown link from the pack name to its `qlpack.yml`."""
        return '[' + self.name + '](' + self.url + ')'

    def containsDirectory(self, dir):
        """Return True when *dir* is the pack's directory or lies beneath it.

        The test is a component-wise prefix comparison of the two paths.
        """
        pack_dir_parts = self.path.parent.parts
        return pack_dir_parts == dir.parts[:len(pack_dir_parts)]

    def calculateDependencies(self, packageNameMap):
        """Compute the transitive dependencies of this pack and their size.

        *packageNameMap* maps pack names to `Package` objects; a dependency
        name missing from it raises KeyError.

        Sets `transitive_dependencies` (names), `all_dependencies` (pack
        objects plus the identical-file assets used by this pack),
        `total_imported_files` and `total_imported_lines`. As a side effect,
        this pack's language is added to the `languages` of every dependency.
        """
        self.transitive_dependencies = set(self.deps)
        queue = list(self.deps)
        while queue:
            item = queue.pop()
            for dep2 in packageNameMap[item].deps:
                if dep2 not in self.transitive_dependencies:
                    self.transitive_dependencies.add(dep2)
                    queue.append(dep2)

        # Calculate the amount of imported code
        self.total_imported_files = 0
        self.total_imported_lines = 0
        self.all_dependencies = set(self.identical_files_dependencies)
        for dep in self.transitive_dependencies:
            self.all_dependencies.add(packageNameMap[dep])
        for dep in self.all_dependencies:
            self.total_imported_files += dep.files
            self.total_imported_lines += dep.lines
            dep.languages.add(self.language)
184+
185+
# Create a big index of all files and their line counts.
186+
187+
# Map from path to its QlFile (which records the line count)
188+
ql_file_index = {}
package_files = []

# Stack of directories still to be visited, starting from the repo root.
directories_to_scan = [repo_location]

while directories_to_scan:
    current_dir = directories_to_scan.pop()
    for entry in current_dir.iterdir():
        if entry.is_dir():
            # Visit sub-directories later.
            directories_to_scan.append(entry)
            continue
        if is_ql(entry):
            ql_file_index[entry] = QlFile(entry)
            continue
        if is_package(entry):
            package_files.append(entry)
203+
204+
# Load the identical-files groups; this marks the member files as shared.
identical_files = IdenticalFiles(repo_location, ql_file_index)

# Create packages
# This needs the complete ql_file_index, and must follow identical_files so
# that each package can tell shared files from its own when sizing itself.
packages = [Package(package_file, ql_file_index) for package_file in package_files]

# List all shared assets: packages first, then the identical-file groups.
shared_assets = [*packages, *identical_files.assets]
216+
217+
# Construct statistics for each language
language_info = {name: Language(name) for name in languages}

# Count the files that belong to exactly one of the known languages.
# NOTE(review): this runs before Package.calculateDependencies() has added
# importing languages to each package's `languages`, so at this point the
# package test inside isOnlyInLanguage() only ever sees the pack's own
# language - confirm that this ordering is intended.
for qlfile in ql_file_index.values():
    lang = qlfile.language()
    info = language_info.get(lang)
    if info is not None and qlfile.isOnlyInLanguage(lang):
        info.addQlFile(qlfile)
228+
229+
# Determine all package dependencies

# Look-up table from pack name to Package; a later pack with the same name
# replaces an earlier one.
packageNameMap = {package.name: package for package in packages}

for package in packages:
    package.calculateDependencies(packageNameMap)

# Credit every asset used by more than one language to each of the known
# languages that uses it.
for asset in shared_assets:
    if len(asset.languages) <= 1:
        continue
    for lang in asset.languages:
        if lang in language_info:
            language_info[lang].addSharedAsset(asset)
244+
245+
246+
# Functions to output the results
247+
248+
def list_assets(shared_assets, language_info):
    """Print a markdown table of the shared assets and the languages using them.

    There is one row per asset (link, file count, line count) followed by one
    column per key of *language_info*, showing `yes` where the language is in
    the asset's `languages`. A blank line is printed after the table.
    """
    columns = list(language_info)

    # Header row and the markdown separator row beneath it.
    print('| Asset | Files | Lines |' + ''.join(' {} |'.format(lang) for lang in columns))
    print('| ----- | ----- | ----- |' + ' ---- |' * len(columns))

    for asset in shared_assets:
        row = '| {} | {} | {} | '.format(asset.link(), asset.files, asset.lines)
        for lang in columns:
            cell = 'yes |' if lang in asset.languages else ' |'
            row += cell + ' '
        print(row)
    print()
266+
267+
def list_package_dependencies(package):
    """Print a plain-text summary of *package* followed by its dependencies.

    The first line shows the package's path, name, own file/line counts and
    imported file/line totals; each entry of `all_dependencies` then gets an
    indented line with its name, file count and line count.
    """
    summary = (
        package.path,
        package.name,
        package.files,
        package.lines,
        package.total_imported_files,
        package.total_imported_lines,
    )
    print("Package", *summary)
    for dep in package.all_dependencies:
        print(" ", *(dep.name, dep.files, dep.lines))
271+
272+
def print_package_dependencies(packages):
    """Print a markdown table of own versus imported code for each package.

    The last column is imported lines as a percentage of own plus imported
    lines, shown as 0.00 when a package has no lines at all. A blank line is
    printed after the table.
    """
    print('| Package name | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| ------------ | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for package in packages:
        imported_lines = package.total_imported_lines
        nlines = package.lines + imported_lines
        if nlines > 0:
            shared_percentage = 100 * imported_lines / nlines
        else:
            shared_percentage = 0
        cells = [
            package.link(),
            package.files,
            package.lines,
            package.total_imported_files,
            imported_lines,
            "%.2f" % shared_percentage,
        ]
        print('| ' + ' | '.join(str(cell) for cell in cells) + ' |')
    print()
282+
283+
def print_language_dependencies(packages):
    """Print the package table restricted to packs named like `<something>-all`.

    A pack qualifies when its name ends in `-all` and contains exactly one
    hyphen.
    """
    def is_language_pack(name):
        return name.endswith('-all') and name.count('-') == 1

    selected = [pkg for pkg in packages if is_language_pack(pkg.name)]
    print_package_dependencies(selected)
285+
286+
def list_shared_code_by_language(language_info):
    """Print a markdown table of non-shared versus imported code per language.

    *language_info* maps each language name to an object carrying the
    `nonshared_*` and `imported_*` totals. The last column is imported lines
    as a percentage of non-shared plus imported lines, shown as 0.00 when
    both are zero. A blank line is printed after the table.
    """
    print('| Language | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| -------- | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for lang, info in language_info.items():
        total = info.imported_lines + info.nonshared_lines
        if total > 0:
            shared_percentage = 100 * info.imported_lines / total
        else:
            shared_percentage = 0
        cells = [
            lang,
            info.nonshared_files,
            info.nonshared_lines,
            info.imported_files,
            info.imported_lines,
            "%.2f" % shared_percentage,
        ]
        print('| ' + ' | '.join(str(cell) for cell in cells) + ' |')
    print()
297+
298+
299+
# Output reports: a markdown document written to standard output.

print('# Report on CodeQL code sharing\n')
print('Generated on', datetime.datetime.now())
print()

# Section 1: totals per language folder.
print('## Shared code by language\n')
list_shared_code_by_language(language_info)

# Legend for the table above, followed by the heading and introduction of
# the next section.
column_notes = '''
* *Non-shared files*: The number of CodeQL files (`.ql`/`.qll`) that are only used within this language folder. Excludes `identical-files.json` that are shared between multiple languages.
* *Non-shared lines of code*: The number of lines of code in the non-shared files.
* *Imported files*: All CodeQL files (`.ql`/`.qll`) files that are transitively used in this language folder, either via packages or `identical-files.json`
* *Imported lines of code*: The number of lines of code in the imported files
* *Shared code %*: The proportion of imported lines / total lines (nonshared + imported).

## Shared packages use by language

A package is *used* if it is a direct or indirect dependency, or a file shared via `identical-files.json`.

'''
print(column_notes)

# Section 2: which languages use which shared asset.
list_assets(shared_assets, language_info)

# Section 3: the package table restricted to the `*-all` packs.
print('## Shared code by language pack\n')
print_language_dependencies(packages)

# Section 4: the package table for every pack.
print('## Shared code by package\n')
print_package_dependencies(packages)

0 commit comments

Comments
 (0)