Skip to content

Commit 02860ba

Browse files
committed
Update pickle stream analyzer to be more robust
1 parent ca34b89 commit 02860ba

File tree

1 file changed

+55
-69
lines changed

1 file changed

+55
-69
lines changed

src/sasctl/pzmm/write_json_files.py

Lines changed: 55 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,11 +1009,11 @@ def create_requirements_json(cls, json_path=Path.cwd()):
10091009
----------
10101010
json_path : str, optional
10111011
The path to a Python project, by default the current working directory.
1012+
10121013
Yields
10131014
------
10141015
requirements.json : file
1015-
JSON file used to create a specific Python environment in a SAS Model Manager published
1016-
container.
1016+
JSON file used to create a specific Python environment in a SAS Model Manager published container.
10171017
"""
10181018

10191019
pickle_packages = []
@@ -1037,13 +1037,15 @@ def create_requirements_json(cls, json_path=Path.cwd()):
10371037
json_step = json.dumps(
10381038
[
10391039
{
1040-
"Warning": "The versions for the following packages could not be determined:",
1040+
"Warning": "The existence and/or versions for the following packages could not be "
1041+
"determined:",
10411042
"Packages": ", ".join(missing_package_versions),
10421043
}
10431044
],
10441045
indent=4,
10451046
)
10461047
file.write(json_step)
1048+
10471049
for package, version in package_and_version:
10481050
if version:
10491051
json_step = json.dumps(
@@ -1055,16 +1057,6 @@ def create_requirements_json(cls, json_path=Path.cwd()):
10551057
],
10561058
indent=4,
10571059
)
1058-
else:
1059-
json_step = json.dumps(
1060-
[
1061-
{
1062-
"step": "install " + package,
1063-
"command": "pip install " + package,
1064-
}
1065-
],
1066-
indent=4,
1067-
)
10681060
file.write(json_step)
10691061

10701062
@classmethod
@@ -1087,11 +1079,10 @@ def get_local_package_version(cls, package_list):
10871079

10881080
def package_not_found_output(package_name, package_versions):
10891081
print(
1090-
f"Warning: Package {package_name} was not found in the local environment, so a version could not be "
1091-
"determined."
1092-
)
1093-
print(
1094-
f"The pip installation command will not include a version number for {package_name}."
1082+
f"Warning: Package {package_name} was not found in the local environment. Either {package_name} is not "
1083+
f"a valid Python package, or the package is not present in this environment. The requirements.json file"
1084+
f" will include a commented-out version of the pip installation command at the bottom of the file. "
1085+
f"Please review the file, verify that the package exists, and enter the version needed."
10951086
)
10961087
package_versions.append([package_name, None])
10971088
return package_versions
@@ -1234,16 +1225,17 @@ def get_pickle_dependencies(cls, pickle_file):
12341225
obj = pickle.load(open_file)
12351226
dumps = pickle.dumps(obj)
12361227

1237-
modules = {mod.split(".")[0] for mod, _ in cls.get_package_names(dumps)}
1238-
modules.discard("builtins")
1239-
return list(modules)
1228+
modules = cls.get_package_names(dumps)
1229+
return modules
12401230

12411231
@classmethod
12421232
def get_package_names(cls, stream):
12431233
"""
1244-
Generates (module, class_name) tuples from a pickle stream. Extracts all class names referenced
1245-
by GLOBAL and STACK_GLOBAL opcodes.
1234+
Generates a list of `package` names found in a pickle stream. In most cases, the `packages` returned by the
1235+
function will be valid Python packages. A check is made in get_local_package_version to ensure that each package
1236+
is in fact a valid Python package.
12461237
1238+
This code has been adapted from the following Stack Overflow example and utilizes the pickletools package.
12471239
Credit: modified from https://stackoverflow.com/questions/64850179/inspecting-a-pickle-dump-for-dependencies
12481240
More information here: https://github.com/python/cpython/blob/main/Lib/pickletools.py
12491241
@@ -1252,54 +1244,48 @@ def get_package_names(cls, stream):
12521244
stream : bytes or str
12531245
A file like object or string containing the pickle.
12541246
1255-
Yields
1256-
------
1257-
tuple
1258-
Generated (module, class_name) tuples.
1247+
Returns
1248+
-------
1249+
list
1250+
List of package names found as module dependencies in the pickle file.
12591251
"""
1252+
# Collect all the opcodes, arguments, and position values from the pickle stream into three lists
1253+
opcode, arg, pos = [], [], []
1254+
for o, a, p in pickletools.genops(stream):
1255+
opcode.append(o.name)
1256+
arg.append(a)
1257+
pos.append(p)
1258+
1259+
# Convert to a pandas dataframe for ease of conditional filtering
1260+
df_pickle = pd.DataFrame({"opcode": opcode, "arg": arg, "pos": pos})
1261+
1262+
# For all opcodes labelled GLOBAL or STACK_GLOBAL pull out the package names
1263+
global_stack = df_pickle[
1264+
(df_pickle.opcode == "GLOBAL") | (df_pickle.opcode == "STACK_GLOBAL")
1265+
]
1266+
# From the argument column, split strings of the form `X.Y.Z` on `.` and keep only the unique `X` values
1267+
stack_packages = (
1268+
global_stack.arg.str.split().str[0].str.split(".").str[0].unique().tolist()
1269+
)
12601270

1261-
stack, mark_stack, memo = [], [], []
1262-
mark = pickletools.markobject
1263-
1264-
# Step through the pickle stack and retrieve names used by STACK_GLOBAL
1265-
for opcode, arg, pos in pickletools.genops(stream):
1266-
1267-
before, after = opcode.stack_before, opcode.stack_after
1268-
number_to_pop = len(before)
1269-
1270-
if opcode.name == "GLOBAL":
1271-
yield tuple(arg.split(1, None))
1272-
elif opcode.name == "STACK_GLOBAL":
1273-
yield stack[-2], stack[-1]
1274-
elif mark in before or (
1275-
opcode.name == "POP" and stack and stack[-1] is mark
1276-
):
1277-
mark_stack.pop()
1278-
while stack[-1] is not mark:
1279-
stack.pop()
1280-
stack.pop()
1281-
try:
1282-
number_to_pop = before.index(mark)
1283-
except ValueError:
1284-
number_to_pop = 0
1285-
elif opcode.name in {"PUT", "BINPUT", "LONG_BINPUT", "MEMOIZE"}:
1286-
if opcode.name == "MEMOIZE":
1287-
memo.append(stack[-1])
1288-
else:
1289-
memo[arg] = stack[-1]
1290-
number_to_pop, after = 0, [] # memoize and put; do not pop the stack
1291-
elif opcode.name in {"GET", "BINGET", "LONG_BINGET"}:
1292-
arg = memo[arg]
1293-
1294-
if number_to_pop:
1295-
del stack[-number_to_pop:]
1296-
if mark in after:
1297-
mark_stack.append(pos)
1298-
1299-
if len(after) == 1 and opcode.arg is not None:
1300-
stack.append(arg)
1301-
else:
1302-
stack.extend(after)
1271+
# For all opcodes labelled BINUNICODE or SHORT_BINUNICODE pull out the package names
1272+
binunicode = df_pickle[
1273+
(df_pickle.opcode == "BINUNICODE")
1274+
| (df_pickle.opcode == "SHORT_BINUNICODE")
1275+
]
1276+
# From the argument column, split each string on `.`, then keep only the unique values that contain a `.`
1277+
arg_binunicode = binunicode.arg.str.split(".")
1278+
unicode_packages = (
1279+
arg_binunicode.loc[arg_binunicode.str.len() > 1].str[0].unique().tolist()
1280+
)
1281+
# Remove invalid `package` names from the list
1282+
unicode_packages = [x for x in unicode_packages if x.isidentifier()]
1283+
1284+
# Combine the two package lists and remove any duplicates
1285+
packages = list(set(stack_packages + unicode_packages))
1286+
1287+
# Return the package list without any None values
1288+
return [x for x in packages if x]
13031289

13041290
@classmethod
13051291
def remove_standard_library_packages(cls, package_list):

0 commit comments

Comments
 (0)