|
| 1 | +# Generates a report on the amount of code sharing in this repo |
| 2 | +# |
| 3 | +# The purpose of this is |
| 4 | +# a) To be able to understand the structure and dependencies |
| 5 | +# b) To provide a metric that measures the amount of shared vs non-shared code |
| 6 | + |
| 7 | +import datetime |
| 8 | +from pathlib import Path |
| 9 | +import json |
| 10 | +import yaml |
| 11 | + |
# Names of the top-level language folders that get a row in the report.
# To add more languages, add them to this list:
languages = ['cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ql', 'ruby', 'swift']

# Repository root, found by taking `.parent` three times from this script's path.
repo_location = Path(__file__).parent.parent.parent
| 16 | + |
| 17 | +# Gets the total number of lines in a file |
def linecount(file):
    """Return the number of lines in the text file at ``file``.

    The file is decoded as UTF-8 with undecodable bytes replaced, so the
    result does not depend on the platform's default encoding and a stray
    non-UTF-8 byte cannot abort the whole report.  Lines are streamed rather
    than collected into a list, since only the count is needed.

    Args:
        file: Path (``str`` or path-like) of the file to read.

    Returns:
        The number of lines; a final line without a trailing newline counts.
    """
    with open(file, 'r', encoding='utf-8', errors='replace') as fp:
        return sum(1 for _ in fp)
| 20 | + |
| 21 | +# Gets the language name from the path |
def get_language(path):
    """Return the path component of ``path`` directly below the repository root.

    The component is selected by position: it is the one that follows all of
    ``repo_location``'s own components.  Assumes ``path`` lies inside
    ``repo_location``; this is not checked here.
    """
    root_depth = len(repo_location.parts)
    return path.parts[root_depth]
| 24 | + |
| 25 | +# Is this path a CodeQL query file |
def is_query(path):
    """Return True when ``path`` has the CodeQL query extension ``.ql``."""
    extension = path.suffix
    return extension == '.ql'
| 28 | + |
| 29 | +# Is this path a CodeQL library file |
def is_library(path):
    """Return True when ``path`` has the CodeQL library extension ``.qll``."""
    extension = path.suffix
    return extension == '.qll'
| 32 | + |
| 33 | +# Is this path a relevant CodeQL file |
def is_ql(path):
    """Return True when ``path`` is a CodeQL query (``.ql``) or library (``.qll``) file."""
    if is_query(path):
        return True
    return is_library(path)
| 36 | + |
| 37 | +# Is this file a CodeQL package file |
def is_package(path):
    """Return True when ``path`` is a CodeQL package file, i.e. is named ``qlpack.yml``."""
    filename = path.name
    return filename == 'qlpack.yml'
| 40 | + |
| 41 | +# A CodeQL source file |
class QlFile:
    """A CodeQL source file (``.ql`` or ``.qll``) found in the repository.

    Attributes:
        path: Location of the file.
        lines: Number of lines in the file, counted when the object is built.
        shared: True once the file has been claimed by an identical-files
            group (set by ``IdenticalFiles`` / ``IdenticalFileSet``).
        package: The asset that owns this file (a ``Package`` or an
            ``IdenticalFileSet``), or None when nothing has claimed it.
    """

    # Defaults for files that no asset ever claims.  Declaring them here
    # guarantees that `shared` and `package` can always be read, so a file
    # outside every pack no longer raises AttributeError; the owning asset
    # overrides them per instance.
    shared = False
    package = None

    def __init__(self, path):
        self.path = path
        self.lines = linecount(path)

    def language(self):
        """Return the language folder this file lives in."""
        return get_language(self.path)

    def query(self):
        """Return True if this is a query (``.ql``) file."""
        return is_query(self.path)

    def library(self):
        """Return True if this is a library (``.qll``) file."""
        return is_library(self.path)

    def isOnlyInLanguage(self, language):
        """Return True if this file is used by ``language`` and nothing else.

        That holds when the file is not shared, it is either in no package or
        in a package whose set of languages is exactly ``{language}``, and the
        file itself lives in the ``language`` folder.
        """
        return (
            not self.shared
            and (self.package is None or self.package.languages == {language})
            and self.language() == language
        )
| 60 | + |
| 61 | +# Represents a language folder |
class Language:
    """Running totals for one language folder.

    Attributes:
        name: The language folder name.
        packs: A list that starts empty; nothing in this class fills it.
        nonshared_files / nonshared_lines: Totals added by ``addQlFile``.
        imported_files / imported_lines: Totals added by ``addSharedAsset``.
    """

    def __init__(self, name):
        self.name = name
        self.packs = []
        self.nonshared_files = self.nonshared_lines = 0
        self.imported_files = self.imported_lines = 0

    def addQlFile(self, qlfile):
        """Count ``qlfile`` towards the non-shared totals unless it is marked shared."""
        if qlfile.shared:
            return
        self.nonshared_files += 1
        self.nonshared_lines += qlfile.lines

    def addSharedAsset(self, package):
        """Add the file and line counts of ``package`` to the imported totals."""
        self.imported_files += package.files
        self.imported_lines += package.lines
| 79 | + |
| 80 | +# A shared package or file |
class SharedAsset:
    """Base class for something that can be shared: a package or a file group."""

    def __init__(self, name):
        # The only state held at this level is the asset's display name.
        self.name = name
| 84 | + |
| 85 | +# A file shared using identical-files.json |
class IdenticalFileSet(SharedAsset):
    """One group of files kept identical via ``identical-files.json``.

    Building the set claims every member: each file's ``package`` is pointed
    at this set and its ``shared`` flag is raised.  The set is sized as a
    single file: ``files`` is 1 and ``lines`` is the line count of the last
    member seen (both stay 0 when the group has no members).
    """

    def __init__(self, name, ql_files):
        self.name = name
        self.languages = set()
        self.files = 0
        self.lines = 0
        for member in ql_files:
            member.package = self
            member.shared = True
            # Overwritten on every iteration, not accumulated.
            self.files, self.lines = 1, member.lines
            self.languages.add(member.language())

    def link(self):
        """Return the text used for this asset in the markdown tables (its name)."""
        return self.name
| 102 | + |
| 103 | +# Represents all files shared in `identical-files.json` |
| 104 | +# Reads the file and builds a list of assets |
class IdenticalFiles:
    """All file groups declared in ``config/identical-files.json``.

    Reads the JSON file below ``repo_location`` and builds ``self.assets``, one
    ``IdenticalFileSet`` per group.  Only members that are CodeQL files are
    kept; each is looked up in ``ql_file_index`` and marked as shared.
    """

    def __init__(self, repo_location, ql_file_index):
        config_path = repo_location / 'config' / 'identical-files.json'
        with open(config_path, "r") as stream:
            groups = json.load(stream)

        self.assets = []
        for group_name in groups:
            members = []
            for relative in groups[group_name]:
                candidate = repo_location / relative
                if not is_ql(candidate):
                    continue
                ql_file = ql_file_index[candidate]
                ql_file.shared = True
                members.append(ql_file)
            self.assets.append(IdenticalFileSet(group_name, members))
| 120 | + |
# A package created from a `qlpack.yml` file
class Package(SharedAsset):
    """A CodeQL package described by a ``qlpack.yml`` file.

    Attributes:
        path: Path of the ``qlpack.yml`` file.
        language: Language folder the package file lives in.
        name: The ``name`` entry of the file, or the containing directory's
            name when the file has no such entry.
        deps: The ``dependencies`` entry of the file (empty when absent or null).
        files / lines: Number and total size of the non-shared CodeQL files
            found below the package directory.
        languages: Languages using the package; starts as its own language
            and is extended by ``calculateDependencies`` of dependent packages.
        identical_files_dependencies: The assets owning the shared files that
            were found below the package directory.
        url: Link to the package file on GitHub.
    """

    def __init__(self, path, ql_file_index):
        self.path = path
        self.language = get_language(path)
        self.lines = 0
        self.files = 0
        self.languages = {self.language}
        self.identical_files_dependencies = set()

        with open(path, 'r') as fp:
            # safe_load gives None for an empty document; treat that the same
            # as a file with no entries rather than failing on `'name' in None`.
            y = yaml.safe_load(fp) or {}
        self.name = y['name'] if 'name' in y else path.parent.name
        self.deps = y.get('dependencies')
        if self.deps is None:
            self.deps = {}

        # Mark all relevant files with their package
        # NOTE(review): a file below a nested pack directory also matches the
        # enclosing pack here, so it looks like it is counted for both; confirm
        # whether that is intended.
        for file_path, ql_file in ql_file_index.items():
            if not self.containsDirectory(file_path):
                continue
            if ql_file.shared:
                self.identical_files_dependencies.add(ql_file.package)
            else:
                ql_file.package = self
                self.lines += ql_file.lines
                self.files += 1

        # as_posix() keeps forward slashes in the URL on every platform.
        self.url = ("https://github.com/github/codeql/blob/main/"
                    + path.relative_to(repo_location).as_posix())

    # Gets a pretty-printed markdown link
    def link(self):
        """Return a markdown link showing the package name and pointing at ``url``."""
        return '[' + self.name + '](' + self.url + ')'

    def containsDirectory(self, dir):
        """Return True if ``dir`` lies at or below this package's directory.

        The test is a component-wise prefix comparison of the two paths.
        """
        package_dir = self.path.parent.parts
        return package_dir == dir.parts[:len(package_dir)]

    # Constructs a list of transitive dependencies of this package.
    def calculateDependencies(self, packageNameMap):
        """Resolve transitive dependencies and total the imported code.

        Sets ``transitive_dependencies`` (package names), ``all_dependencies``
        (package objects plus the identical-file assets found in this package)
        and ``total_imported_files`` / ``total_imported_lines``.  As a side
        effect, this package's language is added to the ``languages`` of every
        dependency.

        Args:
            packageNameMap: Mapping from package name to ``Package``.

        Raises:
            KeyError: If a dependency name is missing from ``packageNameMap``.
        """
        self.transitive_dependencies = set(self.deps)
        queue = list(self.deps)
        while queue:
            item = queue.pop()
            for dep2 in packageNameMap[item].deps:
                # The membership check also stops dependency cycles from looping.
                if dep2 not in self.transitive_dependencies:
                    self.transitive_dependencies.add(dep2)
                    queue.append(dep2)

        # Calculate the amount of imported code
        self.total_imported_files = 0
        self.total_imported_lines = 0
        self.all_dependencies = set(self.identical_files_dependencies)
        for dep in self.transitive_dependencies:
            self.all_dependencies.add(packageNameMap[dep])
        for dep in self.all_dependencies:
            self.total_imported_files += dep.files
            self.total_imported_lines += dep.lines
            dep.languages.add(self.language)
| 183 | + |
# Create a big index of all CodeQL files and collect every package file.

# Map from path to the QlFile describing it (which carries the line count)
ql_file_index = {}
package_files = []

# Stack of directories still to read
directories_to_scan = [repo_location]

while directories_to_scan:
    current_dir = directories_to_scan.pop()
    for entry in current_dir.iterdir():
        if entry.is_dir():
            directories_to_scan.append(entry)
        elif is_ql(entry):
            ql_file_index[entry] = QlFile(entry)
        elif is_package(entry):
            package_files.append(entry)

# Read identical-files.json; this marks the shared files in the index
identical_files = IdenticalFiles(repo_location, ql_file_index)

# Create packages
# Do this after identical_files so that we can figure out the package sizes
# Do this after getting the ql_file_index fully built
packages = [Package(pack_file, ql_file_index) for pack_file in package_files]

# List all shared assets
shared_assets = packages + identical_files.assets

# Construct statistics for each language
language_info = {name: Language(name) for name in languages}

for qlfile in ql_file_index.values():
    lang = qlfile.language()
    if lang not in language_info:
        continue
    if qlfile.isOnlyInLanguage(lang):
        language_info[lang].addQlFile(qlfile)

# Determine all package dependencies

packageNameMap = {pack.name: pack for pack in packages}

for pack in packages:
    pack.calculateDependencies(packageNameMap)

# Only assets used by more than one language count as imported code
for asset in shared_assets:
    if len(asset.languages) <= 1:
        continue
    for lang in asset.languages:
        if lang in language_info:
            language_info[lang].addSharedAsset(asset)
| 243 | + |
| 244 | + |
| 245 | +# Functions to output the results |
| 246 | + |
def list_assets(shared_assets, language_info):
    """Print a markdown table of the shared assets and the languages using each.

    The table has the columns Asset, Files and Lines followed by one column
    per key of ``language_info``; a language column holds ``yes`` when that
    language is in the asset's ``languages``.  A blank line follows the table.
    """
    header = '| Asset | Files | Lines |'
    divider = '| ----- | ----- | ----- |'
    for lang in language_info:
        header += ' ' + str(lang) + ' |'
        divider += ' ---- |'
    print(header)
    print(divider)

    for asset in shared_assets:
        cells = ['|', asset.link(), '|', asset.files, '|', asset.lines, '|']
        cells.extend('yes |' if lang in asset.languages else ' |' for lang in language_info)
        # Every cell is followed by one space, including the last one.
        print(*cells, end=' \n')
    print()
| 265 | + |
def list_package_dependencies(package):
    """Print a plain-text summary of ``package`` followed by one line per dependency.

    The summary line shows the package's path, name, own file and line counts
    and imported file and line counts; each dependency line shows the
    dependency's name, files and lines.
    """
    summary = (
        package.path,
        package.name,
        package.files,
        package.lines,
        package.total_imported_files,
        package.total_imported_lines,
    )
    print("Package", *summary)
    for dependency in package.all_dependencies:
        print(" ", dependency.name, dependency.files, dependency.lines)
| 270 | + |
def print_package_dependencies(packages):
    """Print a markdown table with one row of sharing figures per package.

    The shared-code percentage is imported lines over (own + imported) lines,
    shown with two decimals; it is 0 when a package has no lines at all.
    A blank line follows the table.
    """
    print('| Package name | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| ------------ | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for package in packages:
        imported_lines = package.total_imported_lines
        nlines = package.lines + imported_lines
        if nlines > 0:
            shared_percentage = 100 * imported_lines / nlines
        else:
            shared_percentage = 0
        values = [
            package.link(),
            package.files,
            package.lines,
            package.total_imported_files,
            imported_lines,
            "%.2f" % shared_percentage,
        ]
        cells = ['|']
        for value in values:
            cells += [value, '|']
        print(*cells)
    print()
| 281 | + |
def print_language_dependencies(packages):
    """Print the package table restricted to packages named ``<something>-all``.

    A package is kept when its name ends in ``-all`` and contains exactly one
    hyphen.
    """
    language_packs = []
    for pack in packages:
        if pack.name.endswith('-all') and pack.name.count('-') == 1:
            language_packs.append(pack)
    print_package_dependencies(language_packs)
| 284 | + |
def list_shared_code_by_language(language_info):
    """Print a markdown table with one row of sharing figures per language.

    The shared-code percentage is imported lines over (imported + non-shared)
    lines, shown with two decimals; it is 0 when both are zero.  A blank line
    follows the table.
    """
    # For each language directory, list the files that are (1) inside the directory and not shared,
    # (2) packages from outside the directory, plus identical files
    print('| Language | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| -------- | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for lang, info in language_info.items():
        total = info.imported_lines + info.nonshared_lines
        if total > 0:
            shared_percentage = 100 * info.imported_lines / total
        else:
            shared_percentage = 0
        values = [
            lang,
            info.nonshared_files,
            info.nonshared_lines,
            info.imported_files,
            info.imported_lines,
            "%.2f" % shared_percentage,
        ]
        cells = ['|']
        for value in values:
            cells += [value, '|']
        print(*cells)
    print()
| 296 | + |
| 297 | + |
| 298 | +# Output reports |
| 299 | + |
# Title and generation timestamp, each followed by a blank line
print('# Report on CodeQL code sharing', end='\n\n')
print('Generated on', datetime.datetime.now(), end='\n\n')

# Per-language summary table
print('## Shared code by language', end='\n\n')
list_shared_code_by_language(language_info)

# Legend for the table above, then the heading and intro of the asset table
print('''
* *Non-shared files*: The number of CodeQL files (`.ql`/`.qll`) that are only used within this language folder. Excludes `identical-files.json` that are shared between multiple languages.
* *Non-shared lines of code*: The number of lines of code in the non-shared files.
* *Imported files*: All CodeQL files (`.ql`/`.qll`) files that are transitively used in this language folder, either via packages or `identical-files.json`
* *Imported lines of code*: The number of lines of code in the imported files
* *Shared code %*: The proportion of imported lines / total lines (nonshared + imported).

## Shared packages use by language

A package is *used* if it is a direct or indirect dependency, or a file shared via `identical-files.json`.

''')
list_assets(shared_assets, language_info)

# Sharing figures for the `<language>-all` packs only
print('## Shared code by language pack', end='\n\n')
print_language_dependencies(packages)

# Sharing figures for every package
print('## Shared code by package', end='\n\n')
print_package_dependencies(packages)
0 commit comments