Skip to content

Commit 3e6423a

Browse files
committed
Python: Add ability to split and join autogenerated yml files
Verified by joining all files, splitting them again, and observing no diff in git. (These operations take only a few seconds on my local machine, so they shouldn't be much of an issue.)
1 parent f30a3b0 commit 3e6423a

File tree

4 files changed

+140
-61
lines changed

4 files changed

+140
-61
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python3
2+
3+
"""Concerns were raised about performance on Windows with having 2.5 k files for modeling, and it was recommended we join them all together when shipping.
4+
5+
This script does that.
6+
"""
7+
8+
import sys
9+
import glob
10+
import os
11+
12+
from shared_subclass_functions import *
13+
14+
# Refuse to clobber a joined file that is already in place.
if joined_file.exists():
    sys.exit(f"File {joined_file} already exists")

# Flatten the rows of every per-package file into one sorted list of lists.
package_data = gather_from_existing()
as_lists = sorted(list(row) for rows in package_data.values() for row in rows)

to_write = wrap_in_template(as_lists)
write_data(to_write, joined_file)

print("Joined all files into", joined_file)

# The per-package files have been folded into the joined file; remove them.
for stale_file in glob.glob(f"{subclass_capture_path}/auto-*.model.yml", recursive=True):
    os.unlink(stale_file)

python/ql/src/meta/ClassHierarchy/process-mrva-results.py

Lines changed: 9 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,17 @@
22

33
import sys
44
import glob
5-
from pathlib import Path
65
import json
76
import subprocess
87
from collections import defaultdict
9-
import yaml
108
import shutil
119
import os
12-
import re
1310

14-
VERSION = "process-mrva-results 0.0.1"
15-
16-
mad_path = Path(__file__).parent.parent.parent.parent / "lib/semmle/python/frameworks/data/internal/"
11+
from shared_subclass_functions import *
1712

1813
assert mad_path.exists(), mad_path
1914

20-
package_data = defaultdict(set)
15+
2116

2217
# process data
2318

@@ -55,34 +50,9 @@ def command(self, args):
5550
return res.decode('utf-8')
5651
res += b
5752

58-
def wrap_in_template(data):
59-
return {
60-
"extensions": [
61-
{
62-
"addsTo": {
63-
"pack": "codeql/python-all",
64-
"extensible": "typeModel",
65-
},
66-
"data": data,
67-
}
68-
]
69-
}
70-
71-
def parse_from_file(path: Path) -> set:
72-
if not path.exists():
73-
return set()
74-
75-
f = path.open("r")
76-
assert f.readline().startswith(f"# {VERSION}\n"), path
77-
78-
raw_data = yaml.load(f, Loader=yaml.CBaseLoader)
79-
assert len(raw_data["extensions"]) == 1, path
80-
assert raw_data["extensions"][0]["addsTo"]["extensible"] == "typeModel", path
81-
82-
return set(tuple(x) for x in raw_data["extensions"][0]["data"])
83-
8453

8554
def gather_from_bqrs_results():
55+
package_data = defaultdict(set)
8656
with CodeQL() as codeql:
8757
if os.path.exists(sys.argv[1]) and not os.path.isdir(sys.argv[1]) and sys.argv[1].endswith(".bqrs"):
8858
files = [sys.argv[1]]
@@ -98,34 +68,12 @@ def gather_from_bqrs_results():
9868
for t in select["#select"]["tuples"]:
9969
pkg = t[1]
10070
package_data[pkg].add(tuple(t))
71+
return package_data
10172

102-
def gather_from_existing():
103-
for f in glob.glob(f"{mad_path}/subclass-capture/auto-*.model.yml", recursive=True):
104-
print(f"Processing {f}")
105-
106-
all_data = parse_from_file(Path(f))
107-
pkg = f.split("/")[-1].split(".")[0][5:]
108-
package_data[pkg].update(all_data)
109-
110-
gather_from_bqrs_results()
111-
112-
for pkg in package_data:
113-
if not re.match(r"[a-zA-Z0-9-_]+", pkg):
114-
print(f"Skipping {repr(pkg)}")
115-
continue
116-
117-
pkg_path = mad_path / "subclass-capture" / f"auto-{pkg}.model.yml"
118-
119-
print(f"Writing {pkg_path}")
120-
121-
all_data = parse_from_file(pkg_path)
122-
all_data.update(package_data[pkg])
123-
124-
as_lists = [list(t) for t in all_data]
125-
as_lists.sort()
12673

127-
data_for_yaml = wrap_in_template(as_lists)
74+
if __name__ == "__main__":
75+
if joined_file.exists():
76+
sys.exit(f"File {joined_file} exists, you should split it up first")
12877

129-
f = pkg_path.open("w+")
130-
f.write(f"# {VERSION}\n")
131-
yaml.dump(data_for_yaml, indent=2, stream=f, Dumper=yaml.CDumper)
78+
package_data = gather_from_bqrs_results()
79+
write_all_package_data_to_files(package_data)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
from typing import Dict
2+
import yaml
3+
from pathlib import Path
4+
import glob
5+
from collections import defaultdict
6+
import re
7+
8+
# Header text written as the first line of generated model files and checked
# again when such a file is parsed.
VERSION = "process-mrva-results 0.0.1"

# Four directory levels above this file. The leading underscore keeps the
# helper name out of `from shared_subclass_functions import *`.
_four_levels_up = Path(__file__).parent.parent.parent.parent

# Directory that holds the model data, and the sub-directory / joined file
# the split and join scripts operate on.
mad_path = _four_levels_up / "lib/semmle/python/frameworks/data/internal/"
subclass_capture_path = mad_path / "subclass-capture"
joined_file = subclass_capture_path / "ALL.model.yml"
15+
16+
def parse_from_file(path: Path) -> set:
    """Read the rows stored in a generated model file.

    Args:
        path: Location of a YAML model file.

    Returns:
        The entries of ``extensions[0]["data"]`` as a set of tuples, or an
        empty set when ``path`` does not exist.

    Raises:
        AssertionError: If the first line does not start with the
            ``# {VERSION}`` header, if the document does not contain exactly
            one extension, or if that extension does not target ``typeModel``.
    """
    if not path.exists():
        return set()

    # The context manager closes the handle deterministically; callers may
    # parse a very large number of files in one run.
    with path.open("r") as f:
        # Read the header outside the assert so the line is still consumed
        # when assertions are disabled (python -O).
        header = f.readline()
        assert header.startswith(f"# {VERSION}\n"), path

        raw_data = yaml.load(f, Loader=yaml.CBaseLoader)

    extensions = raw_data["extensions"]
    assert len(extensions) == 1, path
    assert extensions[0]["addsTo"]["extensible"] == "typeModel", path

    return {tuple(row) for row in extensions[0]["data"]}
28+
29+
30+
def wrap_in_template(data):
    """Embed ``data`` in the document structure used by the model files.

    The result has a single entry under ``"extensions"`` whose ``"addsTo"``
    names the ``codeql/python-all`` pack and the ``typeModel`` extensible,
    and whose ``"data"`` is the object passed in (not a copy).
    """
    adds_to = {
        "pack": "codeql/python-all",
        "extensible": "typeModel",
    }
    extension = {"addsTo": adds_to, "data": data}
    return {"extensions": [extension]}
42+
43+
44+
def write_data(data, path: Path):
    """Write ``data`` as YAML to ``path``, preceded by the version header.

    Args:
        data: Object to serialise with ``yaml.dump``.
        path: Destination file; any existing content is replaced.
    """
    # Closing the file via the context manager guarantees the content is
    # flushed to disk before the caller continues (for example, before it
    # deletes the files this output was built from).
    with path.open("w") as f:
        f.write(f"# {VERSION}\n")
        yaml.dump(data, indent=2, stream=f, Dumper=yaml.CDumper)
48+
49+
50+
def gather_from_existing():
    """Load the rows of every ``auto-*.model.yml`` file in the capture directory.

    Returns:
        A ``defaultdict(set)`` mapping the package name taken from each file
        name (the text between ``auto-`` and the first ``.``) to the set of
        row tuples parsed from that file.
    """
    prefix = "auto-"
    package_data = defaultdict(set)
    # sorted() only makes the processing (and log) order stable; the pattern
    # has no "**", so a recursive glob was never needed.
    for model_file in sorted(subclass_capture_path.glob(f"{prefix}*.model.yml")):
        print(f"Processing {model_file}")

        # Derive the package from the final path component instead of
        # splitting on "/", which does not isolate the file name when the
        # platform uses a different path separator (e.g. Windows).
        pkg = model_file.name.split(".")[0][len(prefix):]
        package_data[pkg].update(parse_from_file(model_file))
    return package_data
59+
60+
61+
def write_all_package_data_to_files(package_data: Dict[str, set]):
    """Merge the given rows into the per-package model files on disk.

    For each package, the rows already stored in its
    ``auto-<pkg>.model.yml`` file (if any) are combined with the new rows,
    sorted, wrapped in the extension template and written back.

    Args:
        package_data: Mapping from package name to the rows to add for it.
    """
    for pkg, new_rows in package_data.items():
        # NOTE(review): re.match only anchors at the start of the string, so
        # this rejects names whose first character is outside the class (and
        # the empty name) but accepts anything after a valid first character.
        # Confirm whether a full-name check was intended.
        if re.match(r"[a-zA-Z0-9-_]+", pkg) is None:
            print(f"Skipping {repr(pkg)}")
            continue

        pkg_path = subclass_capture_path / f"auto-{pkg}.model.yml"
        print(f"Writing {pkg_path}")

        # Start from what is already on disk so existing rows are kept.
        merged = parse_from_file(pkg_path)
        merged.update(new_rows)

        sorted_rows = sorted(list(row) for row in merged)
        write_data(wrap_in_template(sorted_rows), pkg_path)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env python3
2+
3+
"""Concerns were raised about performance on Windows with having 2.5 k files for modeling, and it was recommended we join them all together when shipping.
4+
5+
This script does the opposite, so it's easier to work with locally.
6+
"""
7+
8+
import sys
9+
from collections import defaultdict
10+
11+
from shared_subclass_functions import *
12+
13+
# There is nothing to split unless the joined file is present.
if not joined_file.exists():
    sys.exit(f"File {joined_file} does not exists")

# Bucket every row by its second element, which is the key the per-package
# files are written under.
package_data = defaultdict(set)
all_data = parse_from_file(joined_file)
for row in all_data:
    pkg = row[1]
    package_data[pkg].add(row)

write_all_package_data_to_files(package_data)

# Remove the joined file only after the per-package files have been written.
joined_file.unlink()

0 commit comments

Comments
 (0)