From 5843e73d71b01b47e22a95c2a4bae84133ee89a3 Mon Sep 17 00:00:00 2001
From: Stephen Carter
Date: Thu, 23 Oct 2025 15:04:54 -0400
Subject: [PATCH 1/3] NEW: @W-18146548@: Add new Flow Scanner rules

---
 package-lock.json | 2 +-
 .../FlowScanner/flow_parser/README.md | 19 -
 .../flow_parser/expression_parser.py | 2 +-
 .../FlowScanner/flow_parser/parse.py | 797 ++++++++-----
 .../{flowtest => flow_scanner}/ESAPI.py | 5 +-
 .../{flowtest => flow_scanner}/__init__.py | 0
 .../{flowtest => flow_scanner}/__main__.py | 283 ++++-
 .../branch_state.py | 325 +++--
 .../FlowScanner/flow_scanner/control_flow.py | 1053 ++++++++++++++++
 .../data/FlowSecurity_preset.txt | 0
 .../data/flow_scanner_query_data.txt} | 0
 .../data/footer.out | 2 +-
 .../data/header.out | 0
 .../{flowtest => flow_scanner}/executor.py | 292 +++--
 .../flow_metrics.py | 192 +--
 .../FlowScanner/flow_scanner/flow_result.py | 497 ++++++++
 .../{flowtest => flow_scanner}/flows.py | 79 +-
 .../FlowScanner/flow_scanner/query_manager.py | 402 +++++++
 .../FlowScanner/flow_scanner/util.py | 445 +++++++
 .../{flowtest => flow_scanner}/version.py | 4 +-
 .../FlowScanner/flow_scanner/wire.py | 446 +++++++
 .../FlowScanner/flowtest/control_flow.py | 723 -----------
 .../FlowScanner/flowtest/flow_result.py | 425 -------
 .../FlowScanner/flowtest/query_manager.py | 207 ----
 .../FlowScanner/flowtest/util.py | 318 -----
 .../FlowScanner/flowtest/wire.py | 238 ----
 .../FlowScanner/public/contracts.py | 323 ++++-
 .../FlowScanner/public/custom_parser.py | 6 +
 .../FlowScanner/public/data_obj.py | 233 +++-
 .../FlowScanner/public/enums.py | 100 +-
 ...ceptions.py => flow_scanner_exceptions.py} | 4 +-
 .../FlowScanner/public/parse_utils.py | 698 ++++++++++-
 .../FlowScanner/queries/debug_query.py | 69 ++
 .../FlowScanner/queries/default_query.py | 69 +-
 .../FlowScanner/queries/optional_query.py | 1055 +++++++++++++++++
 .../code-analyzer-flow-engine/package.json | 2 +-
 .../code-analyzer-flow-engine/src/engine.ts | 49 +-
 .../src/hardcoded-catalog.ts | 342 +++++-
 .../code-analyzer-flow-engine/src/messages.ts | 48 +-
 .../src/python/FlowScannerCommandWrapper.ts | 23 +-
 .../test/engine.test.ts | 116 +-
 .../python/FlowScannerCommandWrapper.test.ts | 16 +-
 .../results.goldfile.json | 404 ++++---
 .../goldfiles/all_rules.goldfile.json | 166 +++
 44 files changed, 7345 insertions(+), 3134 deletions(-)
 delete mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flow_parser/README.md
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/ESAPI.py (93%)
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/__init__.py (100%)
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/__main__.py (57%)
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/branch_state.py (80%)
 create mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/control_flow.py
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/data/FlowSecurity_preset.txt (100%)
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest/data/flowtest_query_data.txt => flow_scanner/data/flow_scanner_query_data.txt} (100%)
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/data/footer.out (99%)
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/data/header.out (100%)
 rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/executor.py (74%)
 rename
packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/flow_metrics.py (85%) create mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flow_result.py rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/flows.py (92%) create mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/query_manager.py create mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/util.py rename packages/code-analyzer-flow-engine/FlowScanner/{flowtest => flow_scanner}/version.py (74%) create mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/wire.py delete mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flowtest/control_flow.py delete mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flowtest/flow_result.py delete mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flowtest/query_manager.py delete mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flowtest/util.py delete mode 100644 packages/code-analyzer-flow-engine/FlowScanner/flowtest/wire.py rename packages/code-analyzer-flow-engine/FlowScanner/public/{flowtest_exceptions.py => flow_scanner_exceptions.py} (84%) create mode 100644 packages/code-analyzer-flow-engine/FlowScanner/queries/debug_query.py create mode 100644 packages/code-analyzer-flow-engine/FlowScanner/queries/optional_query.py create mode 100644 packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json diff --git a/package-lock.json b/package-lock.json index 75385bf4..ff6a8187 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10499,7 +10499,7 @@ }, "packages/code-analyzer-flow-engine": { "name": "@salesforce/code-analyzer-flow-engine", - "version": "0.28.0", + "version": "0.29.0-SNAPSHOT", "license": "BSD-3-Clause", "dependencies": { "@salesforce/code-analyzer-engine-api": "0.31.0", diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/README.md b/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/README.md deleted file mode 100644 index 707019c8..00000000 --- a/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Flow Expression Parsing - -This directory contains code to parse flows. - -`expression_parser.py` handles expressions (as well as templates) -in order to extract relevant variable names for dataflow analysis. - -For example, if an expression named `myExpression` is defined as - -```IF(varA, varB, varC)``` - -Then this should give rise to two dataflows - -```myExpression <-- varB, myExpression <-- varC``` - -Which are passed into the formula map, but we should skip varA, which is control influencing but not data influencing. 
- - - diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/expression_parser.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/expression_parser.py index 6f8b310a..c83e4a85 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/expression_parser.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/expression_parser.py @@ -11,7 +11,7 @@ import traceback from dataclasses import dataclass import logging -import flowtest.util as util +import flow_scanner.util as util logger: logging.Logger = logging.getLogger(__name__) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/parse.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/parse.py index 1e21a5a7..010ff698 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/parse.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_parser/parse.py @@ -5,9 +5,10 @@ from __future__ import annotations import sys +import traceback from flow_parser import expression_parser -from public.flowtest_exceptions import InvalidFlowException +from public.flow_scanner_exceptions import InvalidFlowException sys.modules['_elementtree'] = None import xml.etree.ElementTree as ET @@ -15,12 +16,12 @@ from typing import Optional import logging import public.parse_utils as parse_utils -import flowtest.util as util +import flow_scanner.util as util from public.contracts import FlowParser import public.custom_parser as CP from public.parse_utils import get_by_tag, get_tag, get_name, get_named_elems, STRING_LITERAL_TOKEN, get_conn_target_map -from public.enums import RunMode, FlowType +from public.enums import RunMode, FlowType, TriggerType from public.data_obj import VariableType from public.enums import DataType, ReferenceType @@ -80,27 +81,42 @@ def __init__(self, root): #: is this a screen or auto-launched flow self.flow_type: FlowType | None = None + #: trigger type (None if not a trigger) + self.trigger_type: TriggerType | None = None + + #: trigger object (False if not a trigger or unknown, str if a trigger but unknown) + self.trigger_object: str | bool = False + #: frozen set of all elements that have a child of and are thus flow globals #: useful for setting scopes self.all_named_elems: frozenset[ET.Element] | None = None #: set of all names (names of named elements) - self.all_names: (str,) or None = None + self.all_names: tuple[str,] | None = None #: variables marked 'available for input', as a pair (flow_path, name) - self.input_variables: frozenset[(str, str)] | None = None + self.input_variables: frozenset[tuple[str, str]] | None = None #: variables marked 'available for output', as a tuple (flow_path, name) - self.output_variables: frozenset[(str, str)] | None = None + self.output_variables: frozenset[tuple[str, str]] | None = None #: for marking string literals self.literal_var = VariableType(tag='stringValue', datatype=DataType.Literal) - #: map from (path, (resolved) parent name) --> Variable (cache) - self.__seen_parents: {(str, str): VariableType} = {} + #: cache of name resolutions encountered to this point at runtime: + # (flow_path, raw_name) --> (name, member, Variable) + # is built up as more variable resolutions are performed + self.cached_resolutions: dict[tuple[str, str], tuple[str, str, VariableType]] | None = None + + #: all name resolutions from the entire flow (done by lexical parse of flow) + #: populated when the flow is loaded. Only has parents. 
+ self.var_types: dict[tuple[str, str], VariableType] | None = None + + def get_all_named_elems(self) -> frozenset[tuple[str, str]] | None: + return self.all_named_elems - #: cache of name resolutions: (flow_path, raw_name) --> (name, member, Variable) - self.__seen_resolutions: {(str, str): (str, str, VariableType)} = {} + def get_all_names(self) -> tuple[str] | None: + return self.all_names def get_effective_run_mode(self) -> RunMode: return self.effective_run_mode @@ -117,6 +133,131 @@ def get_root(self) -> ET.Element: def get_literal_var(self) -> VariableType: return self.literal_var + def get_action_call_map(self) -> dict[str, list[tuple[str, str]]] | None: + """Gets all actionCalls in the flow element + Returns: actionCall type -> (element name, action name) + """ + accum = {} + action_calls = parse_utils.get_by_tag(self.root, 'actionCalls') + for action_call in action_calls: + action_name_els = parse_utils.get_by_tag(action_call, 'actionName') + action_type_els = parse_utils.get_by_tag(action_call, 'actionType') + elem_name = parse_utils.get_name(action_call) + if (len(action_name_els) != 1 or len(action_type_els) != 1 or + action_name_els[0].text is None or action_type_els[0].text is None): + logger.error(f"found invalid actionCall {elem_name} in flow {self.flow_path}") + continue + to_add = (action_type_els[0].text, elem_name, action_name_els[0].text) + if to_add[0] not in accum: + accum[to_add[0]] = [(to_add[1], to_add[2])] + else: + accum[to_add[0]].append((to_add[1], to_add[2])) + + if len(accum) == 0: + return None + else: + return accum + + def get_async_scheduled_paths(self) -> list[str]: + accum = [] + start = self.get_start_elem() + if start is None or start.tag != f'{ns}start': + return accum + + sched_els = get_by_tag(start, 'scheduledPaths') + for sched_el in sched_els: + path_types = get_by_tag(sched_el,'pathType') + if (len(path_types) == 1 and + path_types[0].text is not None and + path_types[0].text.startswith('Async')): + target = sched_el.find(f'./{ns}connector/{ns}targetReference') + if target is not None and target.text is not None: + accum.append(target.text) + return accum + + def get_trigger_object(self) -> str | None: + if self.trigger_object is True: + return None + elif isinstance(self.trigger_object, str): + return self.trigger_object + + # object is not set + else: + tt = self.get_trigger_type() + if tt is TriggerType.NotTrigger or tt is TriggerType.Unknown: + self.trigger_object = True + return None + + else: + starts = parse_utils.get_by_tag(self.root, 'start') + if len(starts) != 1: + self.trigger_object = True + return None + else: + start = starts[0] + objs = parse_utils.get_by_tag(start, 'object') + if len(objs) != 1: + self.trigger_object = True + return None + obj_name = objs[0].text + if obj_name is None or len(obj_name) == 0: + self.trigger_object = True + return None + else: + self.trigger_object = obj_name + return obj_name + + def get_trigger_type(self) -> TriggerType: + if self.trigger_type is not None: + return self.trigger_type + + else: + starts = get_by_tag(self.root, 'start') + if len(starts) != 1: + self.trigger_type = TriggerType.NotTrigger + return TriggerType.NotTrigger + start = starts[0] + child_els = get_by_tag(start, 'triggerType') + + if len(child_els) != 1: + self.trigger_type = TriggerType.NotTrigger + return TriggerType.NotTrigger + + t_type = child_els[0].text + if t_type is None: + self.trigger_type = TriggerType.Unknown + return TriggerType.Unknown + else: + t_type = t_type.lower() + + if t_type == 'recordaftersave': + 
trigger_type = TriggerType.RecordAfterSave + + elif t_type == 'capability': + trigger_type = TriggerType.Capability + + elif t_type == 'scheduled': + trigger_type = TriggerType.Scheduled + + elif t_type == 'recordbeforesave': + trigger_type = TriggerType.RecordBeforeSave + + elif t_type == 'recordbeforedelete': + trigger_type = TriggerType.RecordBeforeDelete + + elif t_type == 'platformevent': + trigger_type = TriggerType.PlatformEvent + + elif t_type == 'segment': + trigger_type = TriggerType.Segment + + else: + trigger_type = TriggerType.Unknown + + self.trigger_type = trigger_type + return trigger_type + + def get_flow_type(self) -> FlowType: """Returns type of flow @@ -125,162 +266,170 @@ def get_flow_type(self) -> FlowType: Returns: FlowType - """ if self.flow_type is not None: return self.flow_type flow_type = None - # Process Builder - # no but - res = get_by_tag(self.root, 'startElementReference') - if len(res) == 0: - res = get_by_tag(self.root, 'start') - if len(res) == 0: - # this is an old format record trigger flow - self.flow_type = FlowType.RecordTrigger - return FlowType.RecordTrigger - start = res[0] - - # Trigger, record - # has a child - child = get_by_tag(start, 'triggerType') - if len(child) > 0: - flow_type = FlowType.RecordTrigger - - elif len(get_by_tag(start, 'schedule')) > 0: - flow_type = FlowType.Scheduled + tt = self.get_trigger_type() + if tt is not TriggerType.NotTrigger: + flow_type = FlowType.Trigger + self.flow_type = flow_type + return flow_type - else: - # We couldn't determine flow type by looking at - # elem, so now look at processType elem - pt = get_by_tag(self.root, 'processType') - if len(pt) > 0: - pt = pt[0].text - - # Screen - # Flow and start does not have trigger or schedule - if pt == 'Flow' or len(get_by_tag(self.root, 'screens')) > 0: - flow_type = FlowType.Screen - - elif pt.lower() == 'workflow': - flow_type = FlowType.Workflow - - elif pt.lower() == 'invocableprocess': - flow_type = FlowType.InvocableProcess - - # AutoLaunched - # Some teams have their own names, e.g. FooAutolaunchedFlow - # Notice this messes up capitalization from normal 'AutoLaunchedFlow' - # there are also recommendation strategies, etc. - else: - flow_type = FlowType.AutoLaunched + pt = get_by_tag(self.root, 'processType') + + if len(pt) > 0: + pt = pt[0].text.lower() - elif self.flow_path.endswith(".flow"): + # Screen + # Flow and start does not have trigger or schedule + if pt == 'flow' or len(get_by_tag(self.root, 'screens')) > 0: + flow_type = FlowType.Screen + + elif pt == 'workflow': + flow_type = FlowType.Workflow + + elif pt == 'invocableprocess': + flow_type = FlowType.InvocableProcess + + # AutoLaunched + # Some teams have their own names, e.g. FooAutolaunchedFlow + # Notice this messes up capitalization from normal 'AutoLaunchedFlow' + # there are also recommendation strategies, etc. + elif pt.endswith('autolaunchedflow'): flow_type = FlowType.AutoLaunched + elif pt.lower() == 'orchestrator': + flow_type = FlowType.Orchestrator + if flow_type is not None: self.flow_type = flow_type return flow_type else: - raise RuntimeError(f"Could not determine flow type for {self.flow_path}") + logger.critical(f"Could not determine flow type for {self.flow_path}, setting to autolaunched") + self.flow_type = FlowType.AutoLaunched + return FlowType.AutoLaunched def resolve_by_name(self, name: str, path: str | None = None, strict: bool = False) -> Optional[(str, str, VariableType)]: - """Resolves name to variable, property, VariableType. 
+ """Resolves name to variable, property, VariableType. Does not store anything. Examples:: - "Account_var.Name" --> ("Account_var", "Name", VariableType) - "account_var" --> (account_var, None, VariableType). - (my_subflow.account.Name) --> (my_subflow.account, Name, VariableType) - (my_action_call.account) --> (my_action_call.account, None, VariableType) - Args: name: raw name as it is used in the flow xml file (e.g. foo.bar.baz) path: filename in which to resolve - strict: whether to fail hard or guess + strict: whether to resolve unknown variables to None (which can cause program execution + to terminate) or to create a best effort 'unknown' variable type resolution. Returns: ``None`` if the name cannot be resolved, else the triple (parent name, member, type) """ + + """ + First, deal with case of string literal, + already seen variable, or global variable + """ + if name == STRING_LITERAL_TOKEN: + return name, None, self.literal_var + if path is None: path = self.flow_path - # do this first, as this method will be called all the time. - # Look in cache of already resolved names, plus string literals and globals are handled here - res = self.__get_type(path=path, name=name) - if res is not None: - # the variable is already in our map - return name, None, res - - # second cache, contains properties already seen as well as names of flow elements - seen = dict.get(self.__seen_resolutions, (path, name)) + + seen = self.get_cached_resolution(name=name, path=path) if seen is not None: return seen + + """ + We have a real flow variable that needs to be resolved. + + tst1 = 'subflow_name.my_obj.Account.Name.my_obj.Account.Name' + -> ('subflow_name.my_obj', 'Account.Name.my_obj.Account.Name', VT) + + tst2 = 'my_var.Account.Name.my_obj.Account.Name' + -> ('my_var', 'Account.Name.my_obj.Account.Name', VT) + + tst3 = 'my_var.Name' + -> ('my_var', 'Name', VT) + + tst4 = 'my_var' + -> ('my_var', None, VT) + + + we start with the first element 'subflow_name' which + should have been initialized as a variable (in this case, a subflow) + and we look if this is a variable or an indirect reference. + - if it's an indirect reference, we take the parent as + 'subflow_name.my_obj' and 'Account.Name.my_obj.Account.Name' is a property. + - otherwise, we take first part, 'subflow_name' as parent and everything + else as a property. + + if the parent is not in the list of initialized variables, this means that a new + element type has been added to flows that flow scanner does not know about. + In this case, behavior is determined by the 'strict' flag. In strict usage, we return None. + Otherwise, we create a dummy 'unknown' variable type and assume it only applies + to the top level, logging the error. + + """ + # now do more complex logic splits = name.split('.') spl_len = len(splits) - # we've already checked for the name directly - if spl_len == 1: - logger.warning(f"RESOLUTION ERROR {name}") - if strict is False: - # 'strict' = False means that any unknown variable name - # is assumed to be a string literal that is hardcoded into - # the flows runtime and so not declared in flow xml file - return name, None, self.literal_var - else: - return None - - for i in range(1, spl_len): - # as index goes up, we get less specific. But no need to check the whole name, - # so start at -1 go to -len(splits)+1, e.g. -range(1, len(splits)). 
- # This means we check splits[0] last - tst = '.'.join(splits[0:-i]) - var_type = self.__get_type(name=tst, path=path, strict=strict) - - if var_type is not None: - # check if this is an automatic storage reference - # e.g. my_recordLookup.output or my_subflow.output - # if it is, we treat the variable name as elem.name - # - # - # NOTE: there are a number of ways to do this, all with downsides - # but we choose to look for anything other than a direct reference as indirect - if i == 1 and (var_type.reference is not ReferenceType.CollectionReference) and ( - var_type.reference is not ReferenceType.Direct): - # We have an indirect reference, e.g. subflow.foo or subflow.foo.bar - # add this to the cache of parsed variables - ref_name = tst + '.' + splits[1] - self.__seen_parents[(path, ref_name)] = var_type # TODO: try to get more precise type - - if spl_len == 2: - # while we could just continue, we can handle this now directly to save lookup - to_return = (ref_name, None, var_type) + for i in range(0, spl_len): + tst = '.'.join(splits[0:spl_len-i]) + if (path, tst) in self.var_types: + var_type = self.var_types[(path, tst)] + + if var_type.tag.startswith("$") and var_type.tag in parse_utils.GLOBALS_SINGLETON: + # These elements should not be viewed as properties of vectors + res = name, None, var_type + + elif var_type.tag in ['subflows', 'actions', 'actionCalls', 'apexPluginCalls', 'stageSteps', + "$Input", "$Output", "$Setup"]: + # these elements do not a single variable but the return of a set of variables that must each be + # referenced by name. + # If the function itself is used, then this + # could be that it returned successfully and will be a boolean, but to get the variables + # we need another param. e.g. subflow_name.varname.Id + if len(tst) == len(name): + res = tst, None, var_type else: - # We have subflow.foo.bar.. - to_return = (ref_name, name[len(tst) + 1:], var_type) + parent = tst + '.' 
+ splits[spl_len-i] + if len(parent) == len(name): + member = None + else: + member = name[len(parent)+1:] + res = parent, member, var_type - # add this resolution to the cache - self.__seen_resolutions[(path, name)] = to_return - return to_return + else: + if len(tst) == len(name): + res = tst, None, var_type + else: + res = tst, name[len(tst)+1:], var_type - # name is foo.bar.baz so if foo.bar is best (most specific) match, - # we need to skip the period to get baz + # add to cache for future lookups + self.cached_resolutions[(path, name)] = res + return res + else: + # keep looking for parents + continue - to_return = (tst, name[len(tst) + 1:], var_type) + # If we fall through here, then we could not find a resolution + logger.critical(f"Could not resolve name {name} in path {path}") - # add to cache of resolutions, so we don't need to go through this again - self.__seen_resolutions[(path, name)] = to_return + if strict: + return None + else: + # treat as a literal value + return name, None, self.literal_var - # return - return to_return - return None @classmethod def from_file(cls, filepath: str, old_parser: Parser = None) -> Parser: @@ -319,19 +468,30 @@ def update(self, old_parser: Parser = None, is_return=False) -> Parser: """ all_named, all_names, vars_, inputs, outputs = _get_global_flow_data(self.flow_path, self.root) + # all_named_elems are ET elements that have a tag as a child self.all_named_elems = all_named + # there are the names above self.all_names = all_names - self.__seen_parents = vars_ + + # all resolutions of all variables (parents) defined in the flow + self.var_types = vars_ self.input_variables = inputs self.output_variables = outputs self.get_flow_type() # will populate flow type self.declared_run_mode = self.get_run_mode() + self.flow_type = self.get_flow_type() + self.trigger_type = self.get_trigger_type() + self.trigger_object = self.get_trigger_object() + self.cached_resolutions = dict() if old_parser is None: self.effective_run_mode = self.declared_run_mode + self.flow_type = self.get_flow_type() else: - if is_return is False: + self.flow_type = old_parser.flow_type + + if not is_return: # if returning from a function call, don't inherit sharing from the child! 
self.effective_run_mode = util.get_effective_run_mode( parent_sharing=old_parser.get_effective_run_mode(), @@ -339,16 +499,16 @@ def update(self, old_parser: Parser = None, is_return=False) -> Parser: ) # we always update parsed variables, so we have full resolutions available - self.__seen_parents.update(old_parser.__seen_parents) - + self.cached_resolutions.update(old_parser.cached_resolutions) + self.var_types.update(old_parser.var_types) return self - def get_output_variables(self, path: str | None = None) -> {(str, str)}: + def get_output_variables(self, path: str | None = None) -> set[tuple[str, str]]: if path is None: path = self.flow_path return {(x, y) for (x, y) in self.output_variables if x == path} - def get_input_variables(self, path: str | None = None) -> {(str, str)}: + def get_input_variables(self, path: str | None = None) -> set[tuple[str, str]]: if path is None: path = self.flow_path return {(x, y) for (x, y) in self.input_variables if x == path} @@ -356,7 +516,7 @@ def get_input_variables(self, path: str | None = None) -> {(str, str)}: def get_input_field_elems(self) -> set[ET.Element] | None: return parse_utils.get_input_fields(self.root) - def get_input_output_elems(self) -> {str: set[ET.Element]}: + def get_input_output_elems(self) -> dict[str, set[ET.Element]]: """ Returns:: {"input": input variable elements, @@ -399,7 +559,11 @@ def get_by_name(self, name_to_match: str, scope: ET.Element | None = None) -> ET def get_flow_name(self) -> str: """we assume there is always a flow label.""" - return get_by_tag(self.root, 'label')[0].text + res = get_by_tag(self.root, 'label') + if len(res) == 0: + raise InvalidFlowException(f"Flow {self.flow_path} has no name, skipping..") + else: + return res[0].text def get_run_mode(self) -> RunMode: """Get effective context of flow @@ -414,7 +578,7 @@ def get_run_mode(self) -> RunMode: # always runs in user mode return RunMode.DefaultMode - if flow_type in [FlowType.Workflow, FlowType.RecordTrigger, FlowType.Scheduled, FlowType.ProcessBuilder]: + if flow_type in [FlowType.Workflow, FlowType.Trigger, FlowType.ProcessBuilder]: # always runs in system mode return RunMode.SystemModeWithoutSharing @@ -429,107 +593,56 @@ def get_run_mode(self) -> RunMode: def get_api_version(self) -> str: return get_by_tag(self.root, 'apiVersion')[0].text - def get_all_traversable_flow_elements(self) -> [ET.Element]: + def get_all_traversable_flow_elements(self) -> list[ET.Element]: """ ignore start""" return [child for child in self.root if get_tag(child) in parse_utils.CTRL_FLOW_ELEM] - def get_all_variable_elems(self) -> [ET.Element] or None: + def get_all_variable_elems(self) -> list[ET.Element] | None: elems = get_by_tag(self.root, 'variables') if len(elems) == 0: return None else: return elems - def get_templates(self) -> [ET.Element]: + def get_templates(self) -> list[ET.Element]: """Grabs all template elements. Returns empty list if none found """ templates = get_by_tag(self.root, 'textTemplates') return templates - def get_formulas(self) -> [ET.Element]: + def get_formulas(self) -> list[ET.Element]: """Grabs all formula elements. 
Returns empty list if none found """ formulas = get_by_tag(self.root, 'formulas') return formulas - def get_choices(self) -> [ET.Element]: + def get_choices(self) -> list[ET.Element]: choices = get_by_tag(self.root, 'choices') return choices - def get_dynamic_choice_sets(self) -> [ET.Element]: + def get_dynamic_choice_sets(self) -> list[ET.Element]: dcc = get_by_tag(self.root, 'dynamicChoiceSets') return dcc - def get_constants(self) -> [ET.Element]: + def get_constants(self) -> list[ET.Element]: constants = get_by_tag(self.root, 'constants') return constants - def get_start_elem(self) -> ET.Element: + def get_start_elem(self) -> ET.Element | None: """Get first element of flow Returns: element or element pointed to in """ - start_elements = parse_utils.START_ELEMS - # These are used in obsolete flow versions that exist only in unit tests in core - blacklisted_elements = parse_utils.BANNED_ELEMS - - blacklist_max = max([len(get_by_tag(self.root, x)) for x in blacklisted_elements]) - if blacklist_max > 0: - msg = "Cannot process a flow with blacklisted elements, skipping flow" - logger.warning(msg) - raise InvalidFlowException(msg, flow=self.flow_path) - - else: - start_res = {x: get_by_tag(self.root, x) for x in start_elements} - - for key in start_res: - if len(start_res[key]) == 1: - return start_res[key][0] - - # no start element, so guess - possible = self._guess_start_element() - - if possible is None: - msg = "Cannot find start element for this flow, skipping flow." - logger.warning(msg) - raise InvalidFlowException(msg, flow=self.flow_path) - else: - return possible - - def _guess_start_element(self) -> ET.Element | None: - """ - When the flow has no start or startElementReference we try to guess by looking for a - traversable flow element that is not pointed to by a connector. - - Returns: - Start Element or None if none can be found - """ - traversables = self.get_all_traversable_flow_elements() - if len(traversables) == 0: - return None - - # Only one option and no connector can point to it - if len(traversables) == 1: - return traversables[0] - - traversable_map = {get_name(x): x for x in traversables} - connector_names = set() - for x in traversables: - targets = get_conn_target_map(x) - if targets is not None and len(targets) > 0: - connector_names.update({x[0] for x in targets.values()}) - - candidates = [traversable_map[x] for x in traversable_map if x not in connector_names] - - if len(candidates) == 1: - return candidates[0] + res = parse_utils.get_start_element(self.root) + if res is None: + raise InvalidFlowException(f"No start element found in {self.flow_path}") else: - return None + return res def get_all_indirect_tuples(self) -> list[tuple[str, ET.Element]]: """returns a list of tuples of all indirect references, e.g. @@ -554,11 +667,12 @@ def get_all_indirect_tuples(self) -> list[tuple[str, ET.Element]]: influencers = expression_parser.parse_expression(expr) [accum.append((var, elem)) for var in influencers] if expr is None: - raise RuntimeError(f"could not find expression for {elem}") + # we have seen empty expressions in flows + continue return accum - def __get_type(self, name: str, path: str | None = None, strict: bool = False) -> VariableType | None: + def get_cached_resolution(self, name: str, path: str | None = None) -> tuple[str, str | None, VariableType] | None: """Gets the VariableType for the named Flow Element Only looks in cache. 
@@ -571,30 +685,37 @@ def __get_type(self, name: str, path: str | None = None, strict: bool = False) - VariableType or None if not present in cache """ if name == STRING_LITERAL_TOKEN: - return self.literal_var - - if name == 'User': - return self.literal_var + return name, None, self.literal_var if path is None: path = self.flow_path - if (path, name) in self.__seen_parents: - return self.__seen_parents[(path, name)] - - if name.startswith('$'): - global_type = _resolve_globals(name) - if global_type is not None: - # add to cache - self.__seen_parents[(path, name)] = global_type - return global_type - + if (path, name) in self.cached_resolutions: + return self.cached_resolutions[(path, name)] else: - logger.info(f"Auto-resolving {name} in file {self.flow_path}") - if strict is True: - return self.literal_var + return None + + def get_called_descendents(self, elem_name: str) -> list[str]: + """Returns empty list if no descendents + """ + el = self.get_by_name(elem_name) + return [x[0] for x in get_conn_target_map(el).values()] + + def get_traversable_descendents_of_elem(self, elem_name: str) -> list[str]: + """includes the original elem name""" + visited = [] + worklist = [] + curr_name = elem_name + while True: + visited.append(curr_name) + to_add = [x for x in self.get_called_descendents(curr_name) if + (x not in visited and x not in worklist)] + worklist = worklist + to_add + + if worklist: + curr_name = worklist.pop(0) else: - return None + return visited def build_vartype_from_elem(elem: ET.Element) -> VariableType | None: @@ -628,94 +749,140 @@ def build_vartype_from_elem(elem: ET.Element) -> VariableType | None: try: if tag == 'actionCalls': + # needs wiring is_ = parse_utils.is_auto_store(elem) - if is_ is True: + if is_: reference = ReferenceType.ActionCallReference - # TODO: see if we can get datatype info from return value - return VariableType(tag=tag, datatype=DataType.StringValue, - reference=reference, is_optional=False) + return VariableType(tag=tag, + reference=reference) + else: + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'actions': + # needs wiring + # These are actions associated to screen flows + # and can have an actionType of "flow" so that they + # behave also as subflows, with a return value of ".Results" + # Also have input parameters + return VariableType(tag=tag, reference=ReferenceType.ElementReference) - if tag == 'actions': - pass + elif tag == 'apexPluginCalls': + # needs wiring + # old action type but needs output parameters + # defined, however it evaluates to 'true' if called. 
+ return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'apexPluginCalls': - pass + elif tag == 'assignments': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'capabilityTypes': - pass + elif tag == 'capabilityTypes': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'choices': + elif tag == 'choices': # TODO: handle this better, now put in a stub datatype = parse_utils.get_datatype(elem) - return VariableType(tag=tag, datatype=datatype) + return VariableType(tag=tag, datatype=datatype, reference=ReferenceType.ElementReference) - if tag == 'collectionProcessors': - if elem.find(f'{ns}elementSubtype').text == 'FilterCollectionProcessor': + elif tag == 'collectionProcessors': + subtype = elem.find(f'{ns}elementSubtype') + if subtype is not None and subtype.text == 'FilterCollectionProcessor': # These always store automatically # TODO: Better type inferences needed. Defer this for now. return VariableType(tag=tag, reference=ReferenceType.CollectionReference, is_collection=True) + else: + return VariableType(tag=tag,reference=ReferenceType.NodeReference) - if tag == 'constants': + elif tag == 'constants': datatype = parse_utils.get_datatype(elem) return VariableType(tag=tag, datatype=datatype, reference=ReferenceType.Constant) - if tag == 'customErrors': - pass + elif tag == 'customErrors': + # Displays an error message + # Does not hold value + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'customProperties': + return VariableType(tag='stringValue', datatype=DataType.Literal) - if tag == 'customProperties': - pass + elif tag == 'decisions': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'dynamicChoiceSets': + elif tag == 'dynamicChoiceSets': # These are effectively record lookups # TODO: handle this better, right now we just have a stub datatype = parse_utils.get_datatype(elem) obj_type = parse_utils.get_obj_name(elem) return VariableType(tag=tag, datatype=datatype, object_name=obj_type) - if tag == 'fields': + elif tag == 'exitActionInputParameters': + #todo: check + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'entryActionInputParameters': + #todo: check + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'fields': # TODO: support more vars as time allows. Screens have many possible components. # every field should have a field type # TODO: decide on nullable policy -- say declare nullable if no default? - res = elem.find(f'{ns}fieldType').text - if res == 'InputField': - is_not_required = elem.find(f'{ns}isRequired').text == 'false' - return VariableType(tag=tag, datatype=DataType.StringValue, - reference=ReferenceType.ElementReference, - is_collection=False, - is_optional=is_not_required) - else: - # put in a stub - # TODO: revisit this against corpus (e.g. 
componentInstance fields) - return VariableType(tag=tag, datatype=DataType.StringValue, - reference=ReferenceType.ElementReference, - is_collection=False) + field_type = elem.find(f'{ns}fieldType') + if field_type is not None: + fld_type = elem.find(f'{ns}fieldType').text + if fld_type == 'InputField' or fld_type == 'ComponentInstance': + is_not_required_t = elem.find(f'{ns}isRequired') + if is_not_required_t is not None and is_not_required_t.text == 'true': + is_not_required = True + else: + is_not_required = False + return VariableType(tag=tag, + reference=ReferenceType.ElementReference, + is_collection=False, + is_optional=is_not_required) + + # put in a stub + # TODO: revisit this against corpus + return VariableType(tag=tag, datatype=DataType.StringValue, + reference=ReferenceType.ElementReference, + is_collection=False) - if tag == 'formulas' or tag == 'textTemplates': + elif tag == 'formulas' or tag == 'textTemplates': return VariableType(tag=tag, datatype=DataType.StringValue, reference=ReferenceType.Formula, is_collection=False) - if tag == 'inputs': - pass + elif tag == 'inputAssignments': + #todo: check + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'loops': + elif tag == 'inputs': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'inputParameters': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'loops': return VariableType(tag=tag, reference=ReferenceType.CollectionReference, is_optional=False, is_collection=True) - if tag == 'orchestratedStages': - pass + elif tag == 'orchestratedStages': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'outputAssignments': - pass + elif tag == 'outputAssignments': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'outputParameters': - pass + elif tag == 'outputParameters': + # use in action calls to assign outputs + # to variables. The subelement + # does not refer to the output parameter + # element + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'recordCreates': + elif tag == 'recordCreates': # Todo: get collection parsing correct, look if record being created is itself # a collection element - do examples of bulkified versions of commands. 
is_ = parse_utils.is_auto_store(elem) @@ -723,21 +890,21 @@ def build_vartype_from_elem(elem: ET.Element) -> VariableType | None: if is_ is True and obj_ is not None: reference = ReferenceType.ElementReference else: - reference = None + reference = ReferenceType.NodeReference return VariableType(tag=tag, datatype=DataType.StringValue, reference=reference, object_name=obj_, is_optional=False) - if tag == 'recordDeletes': - pass + elif tag == 'recordDeletes': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'recordLookups': + elif tag == 'recordLookups': type_ = DataType.Object nulls_provided = parse_utils.is_assign_null(elem) is_ = not parse_utils.is_get_first_record_only(elem) if is_ is None: - logger.warning("Error parsing recordLookups") - return + logger.critical(f"Error parsing recordLookups {parse_utils.get_name(elem)}") + return None # Todo: once we support second order flows, we'll need to add all of recordLookups if parse_utils.is_auto_store(elem) is True: # this is a valid element reference to the return value of the lookups @@ -749,34 +916,52 @@ def build_vartype_from_elem(elem: ET.Element) -> VariableType | None: # put in a stub else: return VariableType(tag=tag, datatype=type_, is_collection=is_, + reference=ReferenceType.NodeReference, object_name=parse_utils.get_obj_name(elem), is_optional=nulls_provided is not None and nulls_provided is False) - if tag == 'recordRollbacks': - pass + elif tag == 'recordRollbacks': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'recordUpdates': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'recordUpdates': - pass + elif tag == 'rules': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'stageSteps': + elif tag == 'scheduledPaths': + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'screens': + # the variable is not held by the screen but by elements + # within it, such as 'field' + return VariableType(tag=tag, reference=ReferenceType.NodeReference) + + elif tag == 'stageSteps': + # needs wiring # output as foo.Outputs.output_var_name - pass + return VariableType(tag=tag,reference=ReferenceType.ElementReference) - if tag == 'stages': - pass - if tag == 'subflows': + elif tag == 'stages': + return VariableType(tag=tag,reference=ReferenceType.ElementReference) + + elif tag == 'subflows': if parse_utils.is_auto_store(elem) is True: # todo: we need a None field for booleans we don't know return VariableType(tag=tag, reference=ReferenceType.SubflowReference) else: - return VariableType(tag=tag) + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'transforms': - pass + elif tag == 'transforms': + # needs wiring + return VariableType(tag=tag, + is_collection=parse_utils.is_collection(elem), + datatype=parse_utils.get_datatype(elem), + reference=ReferenceType.ElementReference) - if tag == 'variables': + elif tag == 'variables': # TODO: handle default variable values in wiring module datatype = parse_utils.get_datatype(elem) @@ -797,58 +982,68 @@ def build_vartype_from_elem(elem: ET.Element) -> VariableType | None: is_input=input_, is_output=output_, properties=None) - if tag == 'waitEvents': - pass + elif tag == 'waitEvents': + # needs wiring + # return values are explicitly assigned to other variables + return VariableType(tag=tag, reference=ReferenceType.NodeReference) - if tag == 'wait': - pass + elif tag == 'waits': + return VariableType(tag=tag, 
reference=ReferenceType.NodeReference) except Exception as e: # Todo: create flow exception here - logger.error(f"Error parsing variable element {e.args[0]}") + logger.critical(f"Error parsing variable element {traceback.format_exc()}") + # Pass through + logger.critical(f"Variable type cannot find match for elem {parse_utils.get_name(elem)} with tag {tag}") return None -def _get_global_flow_data(flow_path, root: ET.Element) -> ([ET.Element], {str: VariableType}): +def _get_global_flow_data(flow_path, root: ET.Element) \ + -> tuple[list[ET.Element], tuple[str,...], dict[tuple[str, str], VariableType], + frozenset[tuple[str, str]], frozenset[tuple[str, str]]]: + all_named = get_named_elems(root) # all named cannot be None, each flow must have at least one named element. assert all_named is not None - name_dict = {x: get_name(x) for x in all_named} - all_names = tuple(name_dict.values()) - vars_ = {} + name_dict = {x: get_name(x) for x in all_named if x is not None} + all_names = tuple(list(name_dict.values())) + vars_ = dict() inputs = [] outputs = [] + # add in globals + for x in parse_utils.ALL_GLOBALS: + vars_[(flow_path, x)] = VariableType(tag=x, reference=ReferenceType.Global) + + # add in named elements that reference a value for x in all_named: try: var = build_vartype_from_elem(x) except Exception: - logger.error(f"ERROR parsing element {ET.tostring(x, encoding='unicode')}") + logger.error(f"ERROR parsing element {parse_utils.get_elem_string(x)}") continue if var is not None: - vars_[(flow_path, name_dict[x])] = var + if var.tag == 'actions': + vars_[(flow_path, name_dict[x] + ".Results")] = var - if var.is_input is True: + elif var.tag == 'stageSteps': + # e.g. if a stage called 'my_stage' is found in the xml file + # we add the variable my_stage.Outputs. to the list of variables + vars_[(flow_path, name_dict[x] + ".Outputs")] = var + + else: + vars_[(flow_path, name_dict[x])] = var + + if var.is_input: inputs.append((flow_path, name_dict[x])) - if var.is_output is True: + if var.is_output: outputs.append((flow_path, name_dict[x])) return all_named, all_names, vars_, frozenset(inputs), frozenset(outputs) -def _resolve_globals(name: str): - if not name.startswith("$"): - return None - # right now, we allow normal "." 
to proceed and just add all top level - res = name.split(".") - if len(res) == 1: - if res is not None: - var_type = VariableType(reference=ReferenceType.Global, tag=name) - return var_type - else: - return None diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/ESAPI.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/ESAPI.py similarity index 93% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/ESAPI.py rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/ESAPI.py index 8ac1a729..76b99701 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/ESAPI.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/ESAPI.py @@ -2,7 +2,7 @@ """ -def html_encode(msg: str) -> str: +def html_encode(msg: str) -> str | int | None: """Performs html encoding Args: msg: unicode message to encode @@ -10,6 +10,9 @@ def html_encode(msg: str) -> str: Returns: html encoded message """ + if msg is None: + return msg + if isinstance(msg, int): return msg diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/__init__.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/__init__.py similarity index 100% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/__init__.py rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/__init__.py diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/__main__.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/__main__.py similarity index 57% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/__main__.py rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/__main__.py index 4835dca4..21e88df7 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/__main__.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/__main__.py @@ -1,17 +1,21 @@ +import argparse import json +import logging +import math import os import re import sys -import logging -import argparse import traceback -import flowtest.executor as executor -import flowtest.util as util -import flowtest.version as version +import flow_scanner.executor as executor +import flow_scanner.query_manager +import flow_scanner.util as util +import flow_scanner.version as version import queries.default_query as default_query -from flowtest.util import make_id +from flow_scanner.query_manager import validate_qry_list, get_all_optional_queries +from flow_scanner.util import make_id from public.data_obj import PresetEncoder +from public.parse_utils import quick_validate """ Status reporting will be written to stdout and prepended with @@ -23,8 +27,12 @@ STATUS_DISCOVERY = "Discovering flows**" # search for flows in dir # When flows are being scanned, a percentage will be shown via get_status_msg STATUS_REPORT_GEN = "Generating Report**" # generating report +STATUS_CHUNK_GEN = "Generating Report for Chunk**" STATUS_COMPLETE = "Job Complete**" # job stop +MINIMUM_CHUNK_SIZE = 5 +MAXIMUM_CHUNK_SIZE = 1000 + CURR_DIR = os.getcwd() @@ -47,7 +55,7 @@ def check_file_exists(x: str) -> str: return x -def check_dir_exists(x: str) -> str: +def check_dirs_exist(x: str) -> str: """Checks if the argument is a valid directory. Raises ArgumentTypeError if not. 
Args: @@ -56,9 +64,11 @@ def check_dir_exists(x: str) -> str: Returns: None """ - if not os.path.isdir(x): - raise argparse.ArgumentTypeError("{0} is not a directory".format(x)) - return x + dir_list = clean_str_list(x.split(',')) + for entry in dir_list: + if not os.path.isdir(entry): + raise argparse.ArgumentTypeError(f"{entry} is not a directory") + return ','.join(dir_list) def check_dir_exists_or_create(x: str) -> str: @@ -73,7 +83,6 @@ def check_dir_exists_or_create(x: str) -> str: os.makedirs(x, exist_ok=True) return x - def check_not_exist(x: str) -> str: """lambda that checks if this path exists or not. Raises an error if it does exist. @@ -88,12 +97,12 @@ def check_not_exist(x: str) -> str: return x -def get_flow_paths_from_file(file_path: str): +def get_flow_paths_from_file(file_path: str) -> list[str]: try: with open(file_path, 'r', encoding='utf-8') as fp: data = fp.read() except UnicodeDecodeError: - # CP1252 is used on older windows systems + # CP1252 is used on older Windows systems try: with open(file_path, 'r', encoding='cp1252') as fp: data = fp.read() @@ -101,13 +110,50 @@ def get_flow_paths_from_file(file_path: str): print("Unable to input file") raise - splits = re.split(r',|\n', data) + splits = re.split(r'[,\n]', data) trimmed = [x.strip() for x in splits] - cleaned = [os.path.abspath(check_file_exists(x)) for x in trimmed if x is not None and len(x) > 0] + cleaned = [os.path.abspath(check_file_exists(x)) for x in trimmed if + x is not None and len(x) > 0 and quick_validate(x) is True] return cleaned -def get_flow_paths(args: argparse.Namespace) -> (list[str], {str: str}): +def get_tokens_from_csv_file(file_path: str) -> list[str]: + try: + with open(file_path, 'r', encoding='utf-8') as fp: + data = fp.read() + except UnicodeDecodeError: + # CP1252 is used on older Windows systems + try: + with open(file_path, 'r', encoding='cp1252') as fp: + data = fp.read() + except UnicodeDecodeError: + print("Unable to input file") + raise argparse.ArgumentTypeError("Unable to read file %s" % file_path) + except: + raise + + return get_validated_queries(unsplit(data)) + + +def get_validated_queries(data: list[str]) -> list[str]: + cleaned_data = de_kebab_list(clean_str_list(data)) + validation = validate_qry_list(cleaned_data) + if validation is True: + return cleaned_data + else: + if len(validation) == 1: + raise argparse.ArgumentTypeError("Unrecognized query requested: %s" % validation[0]) + else: + raise argparse.ArgumentTypeError("Unrecognized queries requested: %s" % + ",".join(validation)) + + +def unsplit(msg: str) -> list[str]: + splits = re.split(r'[,\n]', msg) + return [x.strip() for x in splits] + + +def get_flow_paths(args: argparse.Namespace) -> tuple[list[str], util.Resolver]: """Given the arguments parsed by argparse, returns the flow filenames and names that should be processed. 
@@ -122,8 +168,6 @@ def get_flow_paths(args: argparse.Namespace) -> (list[str], {str: str}): arg_workspace = args.workspace arg_flow = args.flow arg_dir = args.dir - count = 0 - resolver_map = None flow_workspace = None flow_paths = None @@ -136,38 +180,30 @@ def get_flow_paths(args: argparse.Namespace) -> (list[str], {str: str}): # next are the flows passed as a csv list in an argument if arg_flow is not None and arg_file is None: - flow_paths = [os.path.abspath(x) for x in arg_flow.split(",")] + flow_paths_raw = [os.path.abspath(x) for x in arg_flow.split(",")] + flow_paths = [x for x in flow_paths_raw if quick_validate(x) is True] # finally we scan an entire directory (including subdirectories) if flow_paths is None: if arg_dir is None: arg_dir = CURR_DIR - resolver_map = util.get_flows_in_dir(arg_dir) - flows = list(resolver_map.values()) - flow_paths = [os.path.abspath(x) for x in flows] + flow_paths_raw = util.get_flows_in_dirs(arg_dir) + flow_paths = [x for x in flow_paths_raw if quick_validate(x) is True] if flow_workspace is None: flow_workspace = flow_paths - # Once we have flow paths, we determine labels and namespaces - if resolver_map is None: - resolver_map = {} - for a_flow in flow_workspace: - label = util.get_label(os.path.dirname(a_flow), os.path.basename(a_flow)) - if label is not None: - if label in resolver_map: - print(f"alert, label {label} in map already") - else: - resolver_map[label] = a_flow + # Instantiate resolver for subflow lookup + resolver = util.Resolver(flow_workspace) count = len(flow_paths) if count > 0: print(f"found {count} flows to scan") - return flow_paths, resolver_map + return flow_paths, resolver else: print("No flow files found to scan. Exiting...") - sys.exit(1) + sys.exit(0) def parse_args(my_args: list[str], default: str = None) -> argparse.Namespace: @@ -196,6 +232,8 @@ def parse_args(my_args: list[str], default: str = None) -> argparse.Namespace: version='%(prog)s ' + version.__version__) parser.add_argument("-p", "--preset_info", action='store_true', help="return information on default preset and exit") + parser.add_argument("--optional_query_info", action='store_true', + help="display which optional queries are supported and exit") """ Options for which flows to scan @@ -205,11 +243,11 @@ def parse_args(my_args: list[str], default: str = None) -> argparse.Namespace: paths.add_argument("-f", "--flow", help=("path of flow files scan, csv separated. " "If provided, only these will be scanned or used for resolution."), required=False) - paths.add_argument("-d", "--dir", help=("directory containing flow-meta.xml files " + paths.add_argument("-d", "--dir", help=("csv list of directories containing flow-meta.xml files " "subdirectories are also searched. Defaults to working directory. If no" "flows specified all flows in directory will be scanned, otherwise only" "subflows in this directory will be scanned"), - type=check_dir_exists) + type=check_dirs_exist) paths.add_argument("--target", help="path of file containing csv separated lists of flows to scan." "No other flows will be processed", type=check_file_exists) @@ -218,15 +256,15 @@ def parse_args(my_args: list[str], default: str = None) -> argparse.Namespace: """ parser.add_argument("--workspace", help=("path of file containing csv separated lists of " - "flows in workspace that may be resolved as subflow targets. 
" - "If empty this defaults to flows target csv file, the specified directory, " - "or contents of flow directory or flows listed in commandline."), - type=check_file_exists) + "flows in workspace that may be resolved as subflow targets. " + "If empty this defaults to flows target csv file, the specified directory, " + "or contents of flow directory or flows listed in commandline."), + type=check_file_exists) """ Options for debug/log handling """ - parser.add_argument("--log_file", default=f".flowtest_log_{default}.log", + parser.add_argument("--log_file", default=f".flow_scanner_log_{default}.log", help="path to store logs. If missing, one will be generated", type=check_not_exist) @@ -238,7 +276,7 @@ def parse_args(my_args: list[str], default: str = None) -> argparse.Namespace: """ parser.add_argument("--crawl_dir", default=None, help="where to store crawl specification", - type=check_dir_exists) + type=check_dirs_exist) """ Options for storing reports """ @@ -250,6 +288,10 @@ def parse_args(my_args: list[str], default: str = None) -> argparse.Namespace: type=check_not_exist) parser.add_argument("-t", "--html", required=False, help="Path to store html report", type=check_not_exist) + parser.add_argument("-c", "--chunk", required=False, + help=(f"chunk scan into groups of files, with one report generated for each group. " + "Reports will be appended with the chunk number. Useful for processing " + "large numbers of files."), type=int) """ Options for labeling reports @@ -260,13 +302,23 @@ def parse_args(my_args: list[str], default: str = None) -> argparse.Namespace: parser.add_argument("-u", "--url", required=False, help="URL to put into report for more information.") parser.add_argument("-l", "--label", required=False, help="human readable label to put in report.") - parser.add_argument("--service_version", default=version.__version__, help="version of system running the command") + parser.add_argument("--service_version", default=version.__version__, + help="version of system running the command") """ - Options for custom query loads + Options for specifying queries and custom query loads """ parser.add_argument("--query_path", required=False, help="path of custom query python file") parser.add_argument("--query_class", required=False, help="name of class to instantiate in query_path") parser.add_argument("--preset", required=False, help="name of preset to use (consumed by query code)") + parser.add_argument("--optional_queries", required=False, + help="comma separated list of optional queries to execute in addition to the preset.") + parser.add_argument("--optional_queries_path", required=False, + help="path of file containing a comma separated list of optional queries to " + "execute in addition to the preset.", type=check_file_exists) + parser.add_argument("--all_optional", required=False, action='store_true', help=("run all optional queries. " + "WARNING: this is noisy.")) + parser.add_argument("--debug_flow", required=False, help=("For expert use only. 
Run a debug flow with" + "the supplied parameter.")) return parser.parse_args(my_args[1:]) @@ -303,6 +355,12 @@ def main(argv: list[str] = None) -> str | None: return + # Check if user wants list of optional queries + if args.optional_query_info is True: + desc = flow_scanner.query_manager.get_all_optional_descriptions() + print(desc) + return + # logging if args.no_log is True: logging.getLogger().setLevel(logging.CRITICAL + 1) @@ -321,9 +379,32 @@ def main(argv: list[str] = None) -> str | None: elif args.query_path is None and args.query_class is not None: raise argparse.ArgumentTypeError("A query_path must be provided if a query_class is set") + + """ + Handle Optional Queries + """ + if args.all_optional is True: + optional_qry_l = get_all_optional_queries() + + elif args.optional_queries_path is not None: + optional_qry_l = get_tokens_from_csv_file(args.optional_queries_path) + + elif args.optional_queries is not None: + optional_qry_l = get_validated_queries(unsplit(args.optional_queries)) + else: + optional_qry_l = None + + """ + Handle chunking + """ + if args.chunk is not None: + chunk = args.chunk + else: + chunk = None + print(f"{STATUS_LABEL} {STATUS_DISCOVERY}") - flow_paths, all_flows = get_flow_paths(args) + flow_paths, resolver = get_flow_paths(args) if args.label is None: if len(flow_paths) == 1: label = f"scan of {flow_paths[0]}" @@ -340,8 +421,28 @@ def main(argv: list[str] = None) -> str | None: if args.html is None and args.xml is None and args.json is None: raise argparse.ArgumentTypeError("No report format chosen") + chunk_counter = 0 + total_paths = len(flow_paths) + + if chunk is None or chunk > total_paths: + chunk = total_paths + + elif chunk < MINIMUM_CHUNK_SIZE: + print(f"requested chunk size {chunk} is too small, changing to {MINIMUM_CHUNK_SIZE}") + chunk = MINIMUM_CHUNK_SIZE + + number_chunks = math.ceil(total_paths / chunk) + + if chunk < total_paths: + print(f"Scans will be broken into {number_chunks} chunks, with one report per chunk") + + if total_paths> MAXIMUM_CHUNK_SIZE: + print(f"CAUTION: You have requested a scan of {total_paths} flows. It is strongly recommended that" + f"you use the `--chunk` switch to break this scan up into smaller pieces to avoid excessively large" + f"reports and to reduce scan memory usage. Chunked scans have no reduction in scan accuracy.") + for (index, flow_path) in enumerate(flow_paths): - total_paths = len(flow_paths) + status_message = get_status_msg(index, total_paths) print(f"{status_message} scanning {flow_path}...") try: @@ -358,39 +459,85 @@ def main(argv: list[str] = None) -> str | None: query_module_path=args.query_path, query_class_name=args.query_class, query_preset=args.preset, + optional_queries=optional_qry_l, crawl_dir=args.crawl_dir, - all_flows=all_flows) + resolver=resolver) + + except KeyboardInterrupt: + # Program could be long-running and should be interruptible by the user + return + except: - print(f"error processing flow {flow_path}") - print(traceback.format_exc()) - print("...continuing to next flow..") + msg = (f"error processing flow {flow_path}" + f"{traceback.format_exc()}" + "...continuing to next flow..") + print(msg) - if query_manager is None: - print("No flow could be scanned. 
Exiting.") - sys.exit(-1) + if (index % chunk == 0 and index > 0) or index == total_paths-1: + chunk_counter += 1 + try: + gen_reports(args, query_manager, chunk_counter, number_chunks) + + except KeyboardInterrupt: + return + + except: + print("error generating reports") + print(traceback.format_exc()) + + query_manager = None print("scanning complete.") - print(f"{STATUS_LABEL} {STATUS_REPORT_GEN}") + print(f"{STATUS_LABEL} {STATUS_COMPLETE}") + +def gen_reports(args, query_manager, chunk_counter, number_chunks): + + # we are not chunking, we are generating a single report for everything + if query_manager is None: + print("No flow could be scanned in this chunk") + return + + if number_chunks > 1: + print(f"{STATUS_LABEL} {STATUS_CHUNK_GEN}") + to_insert = str(chunk_counter) + else: + # fall back to old messages so as not to break SCA + print(f"{STATUS_LABEL} {STATUS_REPORT_GEN}") + to_insert = '' + if args.xml is not None: + rep_path = add_chunk_to_path(args.xml, to_insert) xml_rep = query_manager.results.get_cx_xml_str() - with open(args.xml, 'w') as fp: + if os.path.exists(rep_path): + os.remove(rep_path) + with open(rep_path, 'w') as fp: fp.write(xml_rep) - print(f"xml result file written to {args.xml}") + print(f"xml result file written to {rep_path}") if args.html is not None: - query_manager.results.write_html(args.html) + rep_path = add_chunk_to_path(args.html, to_insert) + if os.path.exists(rep_path): + os.remove(rep_path) - print(f"html result file written to {args.html}") + query_manager.results.write_html(rep_path) + print(f"html result file written to {rep_path}") if args.json is not None: - with open(args.json, 'w') as fp: + rep_path = add_chunk_to_path(args.json, to_insert) + if os.path.exists(rep_path): + os.remove(rep_path) + with open(rep_path, 'w') as fp: query_manager.results.dump_json(fp) - print(f"json result file written to {args.json}") - - print(f"{STATUS_LABEL} {STATUS_COMPLETE}") + print(f"json result file written to {rep_path}") +def add_chunk_to_path(old_path: str, to_insert)-> str: + if to_insert == '': + return old_path + new_l = old_path.split('.') + new_l[0] = f"{new_l[0]}-{to_insert}" + return '.'.join(new_l) def setup_logger(level, log_file: str): """Setup logger for scan run @@ -425,6 +572,18 @@ def setup_logger(level, log_file: str): print(f"logfile is {log_file}") +def kebab_to_camel_case(msg: str)-> str: + if "_" in msg: + return msg.replace('_', ' ').replace('-', ' ').title().replace(' ', '') + else: + return msg + +def de_kebab_list(str_l: list[str])-> list[str]: + return [kebab_to_camel_case(x) for x in str_l] + + +def clean_str_list(data: list[str])->list[str]: + return [x for x in data if x.strip() != ''] if __name__ == "__main__": main() diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/branch_state.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/branch_state.py similarity index 80% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/branch_state.py rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/branch_state.py index 72506e36..46de0e8b 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/branch_state.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/branch_state.py @@ -12,18 +12,72 @@ import copy import logging import traceback -from operator import ifloordiv +from typing import TypeAlias from flow_parser import parse -import public.custom_parser as CP -from public.custom_parser import ET -from flowtest.control_flow import Crawler 
-from flowtest.flows import FlowVector -from flowtest.util import propagate +from flow_scanner.control_flow import Crawler +from flow_scanner.flows import FlowVector +from flow_scanner.util import propagate from public.contracts import State -from public.data_obj import DataInfluencePath, DataInfluenceStatement, CrawlStep +from public.custom_parser import ET +from public.data_obj import InfluencePath, InfluenceStatement, CrawlStep from public.parse_utils import get_name, get_elem_string, get_line_no +""" + * * Important Type Aliases * * +""" + +""" +A unique flow vector exists for each variable, and variables are globally +define by a tuple (flow path, variable name) to support dataflow +analysis across subflows. Therefore the full state of the system at any +stage of execution can be defined by a "flow_map" which is a map +from (str, str) -> FlowVector + +""" +flow_map_t: TypeAlias = dict[tuple[str, str], FlowVector] + + +""" +An influence path is also defined only for each variable. Moreover, +each variable can have a set of influence paths, due to different +execution flow histories. The difference between an influence map +and a flow map is that the influence map is never vectorized, e.g. +it applies only to scalars or the common case, and so is suitable +for things like formulas, not vectors. + +The flow map is built from influence maps by having a general case +and then property overrides. See the flows module for how this is done. +""" +infl_map_t: TypeAlias = dict[tuple[str, str], set[InfluencePath]] + + +""" +A local influence map drops the flow path and only considers lists, + +variable name --> list of influence paths influencing this variable. + +this is when there is no need for looking at cross flow dataflows. This +is only used as a building block for flow maps and influence maps. +""" + +local_infl_map_t: TypeAlias = dict[str, list[InfluenceStatement]] + +""" +Flow variables are a set of tuples (flow path, variable name) as two flows +can have variables of the same name. +""" + +flow_vars_t: TypeAlias = set[tuple[str, str]] + +""" +a variable in a flow is tracked by a tuple (flow path, variable name) as two flows +can have variables of the same name. +""" + +var_t: TypeAlias = tuple[str, str] + + #: string to use in :class:`public.DataInfluenceStatement` comments SUBFLOW_WIRE_COMMENT = "output via subflow assignment" @@ -45,9 +99,8 @@ class BranchState(State): All interaction with influence flows must be done via public APIs exposed by BranchState. Instantiate only with a builder method. - A clone of the state should be made whenever there is a branch within - a flow, so that when we exit the branch, we recover the state - before branching. + A shallow copy of the influence map is made at each crawl + step (the influence map contains only immutable elements). Prior to exiting a subflow, all branches must be consolidated so that all execution paths are available as return values. @@ -64,20 +117,20 @@ def __init__(self, parser: parse.Parser): self.current_crawl_step: CrawlStep | None = None #: CrawlStep -> map[(flow_path, variable_name)--> FlowVectors] - self.__influence_map: {CrawlStep: {(str, str): FlowVector}} = {} + self.__influence_map: dict[CrawlStep, flow_map_t] = {} #: default map populated with globals available to the flow - self.__default_map: {(str, str): FlowVector} = {} + self.__default_map: flow_map_t = {} #: Name of Element being currently processed (for convenience) #: (The first element in a flow is start and has no name.) 
self.current_elem_name: str = '*' #: current Flow element (xml) - self.current_elem: ET.Element = None + self.current_elem: ET.Element | None = None #: resolves formulas and templates (late binding indirect references) - self.formula_map: {(str, str): {DataInfluencePath}} = {} + self.formula_map: infl_map_t = {} #: flow label (unique within a given package) self.flow_name: str | None = None @@ -133,7 +186,7 @@ def get_current_elem_name(self) -> str: """ return self.current_elem_name - def filter_maps(self, steps: [CrawlStep]): + def filter_maps(self, steps: list[CrawlStep]): """Removes all influence maps except those in `steps` .. WARNING:: Destructive operation, only call after flow @@ -151,7 +204,7 @@ def filter_maps(self, steps: [CrawlStep]): for step in to_delete: del self.__influence_map[step] - def get_all_output_vectors(self) -> [((str, str), FlowVector)]: + def get_all_output_vectors(self) -> list[tuple[var_t, FlowVector]]: """Return output variable FlowVectors Returns: @@ -165,7 +218,7 @@ def get_all_output_vectors(self) -> [((str, str), FlowVector)]: return [(var_tuple, self.get_or_make_vector(name=var_tuple[1], flow_path=var_tuple[0])) for var_tuple in self.parser.output_variables] - def propagate_flows(self, statement: DataInfluenceStatement, + def propagate_flows(self, statement: InfluenceStatement, assign: bool = True, store: bool = True, **type_replacements @@ -228,7 +281,7 @@ def propagate_flows(self, statement: DataInfluenceStatement, if accum is None: accum = self.get_or_make_vector(statement_flow.influenced_name, store=False) - if store is True: + if store: # add to influence map self._get_influence_map()[(self.flow_path, statement_flow.influenced_name)] = accum @@ -246,39 +299,57 @@ def load_crawl_step(self, crawler: Crawler, crawl_step: CrawlStep = None) -> Cra """ if crawl_step is None: - cs = crawler.get_crawl_step() + next_cs = crawler.load_crawl_step() else: - cs = crawl_step + next_cs = crawl_step - if cs is None: + if next_cs is None: # nothing left to crawl return None # find the appropriate parent map to clone: if self.current_crawl_step is None: + # we are just starting the crawl old_map = self.__default_map - elif cs.visitor == self.current_crawl_step.visitor: + + elif next_cs.visitor == self.current_crawl_step.visitor: + # the next element is on the same branch as current elem old_map = self.__influence_map[self.current_crawl_step] + else: - old_cs = crawler.get_last_ancestor(cs) - if old_cs is None: - # no predecessor, so we use default - old_map = self.__default_map + old_history = self.current_crawl_step.visitor.history + new_history = next_cs.visitor.history + + if old_history == (): + # we are on the first branch, so no backtracking + old_map = self.__influence_map[self.current_crawl_step] + + elif len(new_history) >= len(old_history) and new_history[0:len(old_history)] == old_history: + # the new branch is a continuation of old branch so no backtracking + old_map = self.__influence_map[self.current_crawl_step] + else: - old_map = self.__influence_map[old_cs] + # the new history is a different branch, and we need to backtrack + old_cs = crawler.get_last_ancestor(next_cs) + if old_cs is None: + # no predecessor, so we use default + old_map = self.__default_map + else: + old_map = self.__influence_map[old_cs] - # make shallow copy - self.__influence_map[cs] = copy.copy(old_map) + # make shallow copy because flow vectors are immutable + self.__influence_map[next_cs] = copy.copy(old_map) # load current element and step info - self.current_crawl_step = cs - 
self.current_elem = self.parser.get_by_name(cs.element_name) - self.current_elem_name = cs.element_name + self.current_crawl_step = next_cs + self.current_elem = self.parser.get_by_name(next_cs.element_name) + self.current_elem_name = next_cs.element_name - return cs + return next_cs def get_flows_from_sources(self, influenced_var: str, - source_vars: {(str, str)}, all_steps=False) -> set[DataInfluencePath] | None: + source_vars: set[var_t], + all_steps=False, restrict: str | None=None) -> set[InfluencePath] | None: """Finds which flows originate in the source variables. returns all flows into influencer_var that originate in the @@ -292,7 +363,8 @@ def get_flows_from_sources(self, influenced_var: str, but with a path tuple. all_steps: whether flows should be loaded from all crawl steps or just the current step - + restrict: only consider flows that originate with the variable restricted to the + specified property of the source vars. Notes: * Only queries for names in the current flow-path (This should be handled automatically as sources @@ -306,14 +378,17 @@ def get_flows_from_sources(self, influenced_var: str, entire flow is provided to assist in type analysis if needed. + """ if source_vars is None or len(source_vars) == 0: return None if influenced_var is None: return None + # we assume this is being called in the same flow as the influenced variable + path = self.flow_path (parent, member, type_info) = self.parser.resolve_by_name(influenced_var) - var_tuple = (self.flow_path, parent) + var_tuple = (path, parent) if var_tuple in self.formula_map: # (The issue is that influencer_var may be a formula field, @@ -322,15 +397,15 @@ def get_flows_from_sources(self, influenced_var: str, else: formula_flows = None - if all_steps is False: + if not all_steps: steps_to_check = [None] else: steps_to_check = [self.__influence_map.keys()] to_return = set() for step in steps_to_check: - tgt_vec = self._get_or_make_from_type(parent, member, type_info, - store=False, step=step) + tgt_vec = self._get_or_make_FlowVector(parent=parent, type_info=type_info, path=path, + store=False, step=step) if formula_flows is not None: vec_influencers = self._propagate_flows_to_vec(flows=formula_flows, vec=tgt_vec, assign=True, @@ -346,7 +421,11 @@ def get_flows_from_sources(self, influenced_var: str, for path in candidate_flows: if (path.history[0].flow_path, path.history[0].influencer_var) in source_vars: - to_return.add(path) + # now filter if requested + if restrict is not None and path.influencer_property != restrict: + continue + else: + to_return.add(path) # end of for-loop return all results if len(to_return) == 0: @@ -393,7 +472,7 @@ def get_or_make_vector(self, name: str, flow_path: str = None, return res else: # TODO: this is ugly, we need to rework this - return self._get_or_make_from_type(parent=parent, type_info=type_info, store=store, step=step) + return self._get_or_make_FlowVector(parent=parent, type_info=type_info, store=store, step=step) def is_in_map(self, var_name: str) -> bool: """checks if the name is in map @@ -411,10 +490,10 @@ def is_in_map(self, var_name: str) -> bool: else: return (self.flow_path, var_name) in influence_map - def add_vectors_from_other_flow(self, src_flow_path: str, output_vector_map: {(str, str): FlowVector}, - src2tgt_variable_map: {str: str}, transition_elem: ET.Element, + def add_vectors_from_other_flow(self, src_flow_path: str, output_vector_map: flow_map_t, + src2tgt_variable_map: dict[str, str], transition_elem: ET.Element, is_return=False - ) -> {(str, str): 
FlowVector} or None: + ) -> flow_map_t | None: """Pushes vectors in the source to vectors in the target, by wiring a flow across flow boundaries:: old path: src A (=terminal in src) @@ -458,6 +537,7 @@ def add_vectors_from_other_flow(self, src_flow_path: str, output_vector_map: {(s added = {} subflow_name = get_name(transition_elem) subflow_src = get_elem_string(transition_elem) + # noinspection PyUnresolvedReferences subflow_line_no = transition_elem.sourceline out_path = self.flow_path @@ -480,8 +560,8 @@ def add_vectors_from_other_flow(self, src_flow_path: str, output_vector_map: {(s target_var = src2tgt_variable_map[src_name] (tgt_parent, tgt_member, tgt_type) = self.parser.resolve_by_name(target_var) - connect_path = DataInfluencePath( - history=(DataInfluenceStatement( + connect_path = InfluencePath( + history=(InfluenceStatement( influenced_var=target_var, influencer_var=src_name, element_name=subflow_name, @@ -501,7 +581,7 @@ def add_vectors_from_other_flow(self, src_flow_path: str, output_vector_map: {(s ) # grab a reference to the target vector: - tgt_vec = self._get_or_make_from_type(parent=target_var, type_info=tgt_type, path=out_path) + tgt_vec = self._get_or_make_FlowVector(parent=target_var, type_info=tgt_type, path=out_path) # now push the src vector into the target via the connecting flow: tgt_vec_new = src_vec.push_via_flow(extension_path=connect_path, @@ -517,13 +597,35 @@ def add_vectors_from_other_flow(self, src_flow_path: str, output_vector_map: {(s return added + + def initialize_variables_from_elems(self, elems: list[ET.Element] | None) -> None: + """Adds variables to the influence map (if not already present) + + .. WARNING:: Expert use. Only to initialize from named elements that are actually + Initialized. API users should use the :meth:`get_or_make` method + + Args: + elems: XML elements to initialize + + Returns: + None + + Raises: + ValueError if element cannot be resolved + """ + if elems is None: + return None + + [self._init_vec_from_elem(x, store=True) for x in elems] + return None + # # End of BranchState Public API # # - def _get_or_make_from_type(self, parent: str, type_info: parse.VariableType, path: str = None, - store=False, step: CrawlStep = None): + def _get_or_make_FlowVector(self, parent: str, type_info: parse.VariableType, path: str = None, + store=False, step: CrawlStep = None)-> FlowVector: """Retrieve or make vector based on Variable Type Args: @@ -538,26 +640,31 @@ def _get_or_make_from_type(self, parent: str, type_info: parse.VariableType, pat Flow Vector """ - infl_map = self._get_influence_map(crawl_step=step) if path is None: path = self.flow_path - var_t = (path, parent) + infl_map = self._get_influence_map(crawl_step=step) + - if var_t in infl_map: - return infl_map[var_t] + var_ = (path, parent) + if var_ in infl_map: + return infl_map[var_] + + logger.info(f"variable {var_} not found in influence map at step {step} in flow {path}, " + f"creating new flow vector for it.") # try to get the element for better reporting: var_elem = self.parser.get_by_name(parent) if var_elem is not None: + # noinspection PyUnresolvedReferences line_no = var_elem.sourceline source_text = get_elem_string(var_elem) else: line_no = 0 source_text = "[builtin]" - dfr = DataInfluenceStatement( + dfr = InfluenceStatement( influenced_var=parent, influencer_var=parent, element_name=parent, @@ -568,17 +675,18 @@ def _get_or_make_from_type(self, parent: str, type_info: parse.VariableType, pat comment=INITIALIZATION_COMMENT ) - flow_path = 
DataInfluencePath(history=(dfr,), influenced_name=parent, influenced_filepath=path, - influencer_name=parent, influencer_filepath=path, influencer_property=None, - influenced_property=None, influenced_type_info=type_info - ) + flow_path = InfluencePath(history=(dfr,), influenced_name=parent, influenced_filepath=path, + influencer_name=parent, influencer_filepath=path, influencer_property=None, + influenced_property=None, influenced_type_info=type_info + ) flow_vector = FlowVector.from_flows(default={flow_path}) - if store is True: + if store: # add to influence map - infl_map[var_t] = flow_vector + infl_map[var_] = flow_vector + logger.info(f"Added {var_} to influence map at step {step} in flow {path}") return flow_vector - def _get_influence_map(self, crawl_step: CrawlStep = None) -> {(str, str): FlowVector} or None: + def _get_influence_map(self, crawl_step: CrawlStep = None) -> flow_map_t | None: """retrieves current influence map instance for the given crawl step Args: @@ -598,28 +706,8 @@ def _get_influence_map(self, crawl_step: CrawlStep = None) -> {(str, str): FlowV else: return dict.get(self.__influence_map, cs, None) - def _initialize_variables_from_elems(self, elems: set[ET.Element] | None) -> None: - """Adds variables to the influence map (if not already present) - - .. WARNING:: Expert use. Only to initialize from named elements that are actually - Initialized. API users should use the :meth:`get_or_make` method - - Args: - elems: XML elements to initialize - - Returns: - None - Raises: - ValueError if element cannot be resolved - """ - if elems is None: - return None - - [self._init_vec_from_elem(x, store=True) for x in elems] - return None - - def _propagate_flows_to_vec(self, flows: {DataInfluencePath}, + def _propagate_flows_to_vec(self, flows: set[InfluencePath], vec: FlowVector, assign: bool = True, step: CrawlStep = None) -> FlowVector: """Pushes influencers into vec @@ -644,8 +732,8 @@ def _propagate_flows_to_vec(self, flows: {DataInfluencePath}, logger.warning(f"variable used in assignment {end_parent} is not initialized.") # Initialization happens here, as the tail must be in the map - tail_vec = self._get_or_make_from_type(parent=end_parent, - type_info=head_flow.influenced_type_info, store=True, step=step) + tail_vec = self._get_or_make_FlowVector(parent=end_parent, + type_info=head_flow.influenced_type_info, store=True, step=step) # Push all flows to the head of the statement for each element of head-flows new_vec = tail_vec.push_via_flow(influenced_vec=vec, extension_path=head_flow, assign=assign, @@ -715,7 +803,7 @@ def _init_vec_from_elem(self, elem: ET.Element, store=True) -> FlowVector | None if el_tuple in influence_map: return influence_map[el_tuple] - dfr = DataInfluenceStatement( + dfr = InfluenceStatement( influenced_var=parent, influencer_var=parent, element_name=parent, @@ -728,7 +816,7 @@ def _init_vec_from_elem(self, elem: ET.Element, store=True) -> FlowVector | None flow_path = _build_path_from_history(history=(dfr,), parser=self.parser) flow_vector = FlowVector.from_flows(default={flow_path}) - if store is True: + if store: # add to influence map influence_map[el_tuple] = flow_vector return flow_vector @@ -737,7 +825,7 @@ def _init_vec_from_elem(self, elem: ET.Element, store=True) -> FlowVector | None Utility methods for unit tests """ - def _test_only_set_influence_map(self, another_map: {CrawlStep: {(str, str): FlowVector}}) -> None: + def _test_only_set_influence_map(self, another_map: dict[CrawlStep, flow_map_t]) -> None: """set influence map 
for state .. DANGER:: Test only @@ -750,7 +838,7 @@ def _test_only_set_influence_map(self, another_map: {CrawlStep: {(str, str): Flo """ self.__influence_map = another_map - def _test_only_get_influence_map(self) -> {CrawlStep: {(str, str): FlowVector}}: + def _test_only_get_influence_map(self) -> dict[CrawlStep, flow_map_t]: """get influence map .. DANGER:: Test only function @@ -761,7 +849,7 @@ def _test_only_get_influence_map(self) -> {CrawlStep: {(str, str): FlowVector}}: # only for testing return self.__influence_map - def filter_input_variables(self, output_vars: {(str, str): FlowVector}) -> {(str, str): FlowVector}: + def filter_input_variables(self, output_vars: flow_map_t) -> flow_map_t | None: """filters vectors to remove flows starting in input variables in the current flow Args: @@ -795,6 +883,7 @@ def filter_input_variables(self, output_vars: {(str, str): FlowVector}) -> {(str continue if default.influencer_filepath == flow_path: + if overrides is None: if (flow_path, default.influencer_name) not in self.parser.input_variables: # keep these @@ -811,15 +900,17 @@ def filter_input_variables(self, output_vars: {(str, str): FlowVector}) -> {(str filtered = True continue - if filtered is True: + if filtered: # we skipped some properties, so we need to create a new # flow vector with the purged properties and add to the return accum + if default not in new_maps: + new_maps[default] = dict() if prop in new_maps[default]: new_maps[default][prop].add(influence_path) else: new_maps[default][prop] = {influence_path} - if filtered is True: + if filtered: to_return[var_tuple] = FlowVector(property_maps=new_maps) else: @@ -829,8 +920,8 @@ def filter_input_variables(self, output_vars: {(str, str): FlowVector}) -> {(str return to_return -def _build_path_from_history(parser: parse.Parser, history: tuple[DataInfluenceStatement, ...], - strict=False, **type_replacements) -> DataInfluencePath: +def _build_path_from_history(parser: parse.Parser, history: tuple[InfluenceStatement, ...], + strict=False, **type_replacements) -> InfluencePath: """Creates a Dataflow Influence Path from the tuple of influence statements Args: @@ -860,19 +951,19 @@ def _build_path_from_history(parser: parse.Parser, history: tuple[DataInfluenceS my_type = propagate(src_type=first_type, dest_type=last_type, **type_replacements) - return DataInfluencePath(history=history, - influenced_name=last_parent, - influencer_name=first_parent, - influencer_filepath=first.flow_path, - influenced_filepath=last.flow_path, - # this is critical or else it will go in the wrong slot - influenced_property=last_member, - influencer_property=first_member, - influenced_type_info=my_type - ) + return InfluencePath(history=history, + influenced_name=last_parent, + influencer_name=first_parent, + influencer_filepath=first.flow_path, + influenced_filepath=last.flow_path, + # this is critical or else it will go in the wrong slot + influenced_property=last_member, + influencer_property=first_member, + influenced_type_info=my_type + ) -def _build_formula_map(parser: parse.Parser, flow_path: str) -> dict[(str, str):set[DataInfluencePath]]: +def _build_formula_map(parser: parse.Parser, flow_path: str) -> infl_map_t: """Formulas and Templates need to be resolved at each invocation, so this map returns a ready-made set of dataflows to wire in case a formula appears in a data influence statement. 
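
For illustration, the map returned here has roughly the following shape. The flow file name, formula name, and the `path_from_contactName` placeholder are invented for the example; real entries are `InfluencePath` objects keyed by `(flow path, formula name)`:

    # (flow path, formula name) -> set of InfluencePath objects that flow into the formula
    formula_map = {
        ("Send_Email.flow-meta.xml", "greeting"): {path_from_contactName},
    }

    # When "greeting" later appears as an influencer in a data influence statement,
    # each stored path is stitched onto the downstream flow with
    # InfluencePath.combine(...) (as _extend_formula_map_by_flows does) instead of
    # re-resolving the formula expression.
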
@@ -893,7 +984,7 @@ def _build_formula_map(parser: parse.Parser, flow_path: str) -> dict[(str, str): return to_return -def _get_raw_formula_map(parser: parse.Parser, flow_path: str) -> dict[str:list[DataInfluenceStatement]]: +def _get_raw_formula_map(parser: parse.Parser, flow_path: str) -> local_infl_map_t: """ Args: parser: parser instance @@ -909,7 +1000,7 @@ def _get_raw_formula_map(parser: parse.Parser, flow_path: str) -> dict[str:list[ for (var_name, elem) in tuples: formula_name = parse.get_name(elem) short_tag = elem.tag[ns_len:] - stmt = DataInfluenceStatement( + stmt = InfluenceStatement( influenced_var=formula_name, influencer_var=var_name, element_name=formula_name, @@ -927,9 +1018,9 @@ def _get_raw_formula_map(parser: parse.Parser, flow_path: str) -> dict[str:list[ return accum -def _extend_formula_map_by_flows(start_flows: set[DataInfluencePath], - formula_map: {(str, str): {DataInfluencePath}}, - add_missing: bool = True) -> {DataInfluencePath}: +def _extend_formula_map_by_flows(start_flows: set[InfluencePath], + formula_map: infl_map_t, + add_missing: bool = True) -> set[InfluencePath]: """Resolves a flow if the influencer is a formula in terms of real variables. .. WARNING:: This function does not perform any vectorization! @@ -965,18 +1056,18 @@ def _extend_formula_map_by_flows(start_flows: set[DataInfluencePath], flow_influencer = (flow.influencer_filepath, flow.influencer_name) if flow_influencer in formula_map: map_influencing = formula_map[flow_influencer] - [accum.add(DataInfluencePath.combine(x, flow)) for x in map_influencing] + [accum.add(InfluencePath.combine(x, flow)) for x in map_influencing] - elif add_missing is True: + elif add_missing: # a match was not found, so we add the original flow: accum.add(flow) return accum -def _resolve_influencers(elem_ref_name: ET.Element, - raw_formula_map: {str: [DataInfluenceStatement]}, - parser: parse.Parser) -> {DataInfluencePath}: +def _resolve_influencers(elem_ref_name: str, + raw_formula_map: local_infl_map_t, + parser: parse.Parser) -> set[InfluencePath]: """Resolves indirect references This function exists to handle recursion in formulas/templates:: @@ -1027,7 +1118,7 @@ def _resolve_influencers(elem_ref_name: ET.Element, seen_resolvers.add(curr_flow.influencer_name) - to_resolve = to_resolve + [DataInfluencePath.combine( + to_resolve = to_resolve + [InfluencePath.combine( _build_path_from_history(history=(x,), parser=parser), curr_flow ) for x in raw_formula_map[curr_flow.influencer_name] ] @@ -1097,5 +1188,5 @@ def _populate_defaults(state: BranchState, parser: parse.Parser) -> None: all_formulas = parser.get_formulas() all_choices = parser.get_choices() all_constants = parser.get_constants() - state._initialize_variables_from_elems(all_vars + all_templates + all_formulas - + all_choices + all_constants) + state.initialize_variables_from_elems(all_vars + all_templates + all_formulas + + all_choices + all_constants) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/control_flow.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/control_flow.py new file mode 100644 index 00000000..76ee85b0 --- /dev/null +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/control_flow.py @@ -0,0 +1,1053 @@ +"""Module to generate control flow graphs and crawl schedules + +""" +from __future__ import annotations + +import dataclasses +import json +import logging +import traceback +from collections.abc import Generator +from dataclasses import dataclass, field +from typing import TextIO, 
TypeAlias + +import flow_parser.parse as parse +from flow_parser.parse import Parser +from public.contracts import AbstractSegment, AbstractControlFlowGraph, AbstractCrawler +from public.data_obj import BranchVisitor, CrawlStep, Jump, JSONSerializable +from public.enums import ConnType +from public.parse_utils import (ET, get_name, get_conn_target_map, + is_subflow, is_loop, get_tag) + +MAX_VISITS_PER_SEGMENT = 20 + +logger = logging.getLogger(__name__) + +# a tuple of (flow filename, flow element name) +var_t: TypeAlias=tuple[str, str] + +# (element name, element tag) +el_t: TypeAlias=tuple[str, str] + +@dataclass(frozen=True, eq=True, slots=True) +class Segment(JSONSerializable, AbstractSegment): + # name of element at the start of the segment (jump target) + label: str + + # list of (element names, element tags) (including label) in this segment (in order) + traversed: list[el_t] + + # list of traversal indexes that are subflow elements + subflows: list[int] + + # connectors at the end of this segment + jumps: list[Jump] + + # whether this segment may end execution + is_terminal: bool + + # for tracking whether it has been visited + seen_tokens: list[tuple[tuple[str, str], ...]] = field(default_factory=list) + + + def accept(self, visitor: BranchVisitor, multiple_inbound: bool=False) -> list[BranchVisitor] | None: + """does the node accept the visitor + + Also updates visitor state + + Args: + visitor: Branch Visitor trying to jump into node + multiple_inbound: Whether this segment accepts more than + a single inbound, in which case it will + need to assign tokens for the extra inbounds. + Whether an inbound is 'extra' is decided by + order of visit. + Returns: + list of labels to process or None + + """ + if not self.jumps: + return None + + if visitor.token in self.seen_tokens: + return None + + else: + self.seen_tokens.append(visitor.token) + if multiple_inbound: + return self._send_outbound(visitor, add_token=True) + else: + return self._send_outbound(visitor) + + + def _send_outbound(self, visitor, add_token=False): + jumps = self.jumps + + to_return = [] + loop_context = visitor.loop_context or tuple() + + history = visitor.history + ((visitor.previous_label, visitor.current_label),) + + for jmp in jumps: + current_label = jmp.target + previous_label = self.label + + if jmp.is_loop: + # we are entering a loop context + loop_context = loop_context + ((jmp.src_name,ConnType.Loop),) + + if jmp.is_fault: + loop_context = loop_context + ((jmp.src_name, ConnType.Exception),) + + if jmp.is_no_more_values: + # exiting loop context + z = _right_find(my_iter=loop_context, val_to_find=ConnType.Loop) + if z == -1: + logger.critical("Found a loop exit without loop entrance") + continue + else: + # remove everything before the entrance to the loop + loop_context = loop_context[:z] + + if add_token: + new_token = visitor.token + to_add = (visitor.previous_label, visitor.current_label) + if visitor.token is None: + new_token = (to_add,) + + elif to_add not in visitor.token: + new_token = (to_add,) + visitor.token + + outbound_to_add = dataclasses.replace(visitor, + token=new_token, + current_label=current_label, + previous_label=previous_label, + history=history, + loop_context=loop_context + ) + else: + outbound_to_add = dataclasses.replace(visitor, + current_label=current_label, + previous_label=previous_label, + history=history, + loop_context=loop_context + ) + + to_return.append(outbound_to_add) + return to_return + + @classmethod + def build_from_parser(cls, parser: parse.Parser, elem: 
ET.Element) -> Segment: + """Build a segment starting at this element + + Args: + parser: flow parser instance + elem: first element in this segment + + Returns: + segment + """ + + label = get_name(elem) + start_tag = get_tag(elem) + jumps = [] + + if is_subflow(elem): + subflows = [0] + else: + subflows = [] + + conn_map = _get_connector_map(elem, parser=parser) + optional_values = [x[2] for x in conn_map.values() if x[2] is True] + is_optional = len(optional_values) > 0 + curr_elem = elem + + # elements traversed within this segment, + # so always initialized to zero + traversed = [] + + if len(conn_map) == 0: + return Segment(label=label, + subflows=subflows, + traversed=[(label, start_tag)], + jumps=[], + is_terminal=True) + index = 0 + + while len(conn_map) > 0: + curr_name = get_name(curr_elem) + curr_tag = get_tag(curr_elem) + assert curr_tag is not None + + if (curr_name, curr_tag) in traversed: + # we are looping back in the segment. break here, and + # the element will not be added to this segment. + # It will then appear in some other segment pointing to this segment. + # + # If it points to an element somewhere in the middle of this segment, + # that will be addressed in the `fix_duplicates` function below. + break + else: + traversed.append((curr_name, curr_tag)) + + if is_subflow(curr_elem): + subflows.append(index) + + if is_loop(curr_elem): + # loops always terminate a segment + for conn, val in conn_map.items(): + elem_is_loop = False + no_more_seen = False + + if get_tag(conn) == 'noMoreValuesConnector': + is_optional = False + no_more_seen = True + + if get_tag(conn) == 'nextValueConnector': + elem_is_loop = True + # there may be no values at all, + # in which case this branch may never be taken + is_optional = True + + jumps.append(Jump(src_name=curr_name, + target=val[0], + is_goto=val[1] is ConnType.Goto, + is_loop=elem_is_loop, + is_no_more_values=no_more_seen, + is_fault=False + ) + ) + break + + elif len(conn_map) == 1: + vals = list(conn_map.values()) + is_optional = vals[0][2] + + if (vals[0][1] is not ConnType.Goto and + not is_optional): + + # this is a normal connector that must always be followed + curr_elem = parser.get_by_name(vals[0][0]) + conn_map = _get_connector_map(curr_elem, parser=parser) + continue + + else: + # although there is only one connector, it is optional + # which means the current element terminates the segment + # and *may* terminate the flow + # and the connector is turned into a jump + jumps.append(Jump(src_name=curr_name, + is_goto=vals[0][1] is ConnType.Goto, + target=vals[0][0], + is_loop=False, + is_no_more_values=False, + is_fault=vals[0][1] is ConnType.Exception)) + + break + + elif len(conn_map) > 1: + is_optional = True + # There is more than one connector, so + for val in conn_map.values(): + + # a single non-optional connector makes the segment non-terminal + if not val[2]: + is_optional = False + jumps.append(Jump(src_name=curr_name, + target=val[0], + is_goto=val[1] is ConnType.Goto, + is_loop=False, + is_no_more_values=False, + is_fault=val[1] is ConnType.Exception + ) + ) + break + + # end of conditionals + index += 1 + + # end of while loop + + # Check if the last element in a segment that may also end the flow + if len(conn_map) == 0: + curr_tag = get_tag(curr_elem) + curr_name = get_name(curr_elem) + if (curr_name, curr_tag) not in traversed: + traversed.append((curr_name, curr_tag)) + + if is_subflow(curr_elem): + subflows.append(index) + + if len(jumps) == 0: + # if there are no more jumps, this is a terminal 
element + is_optional = True + else: + # sort jumps so nextValue is taken first + jumps.sort(key=lambda x: x.priority()) + + return Segment(label=label, + subflows=subflows, + jumps=jumps, + traversed=traversed, + is_terminal=is_optional) + + +@dataclass(frozen=True, eq=True, slots=True) +class ControlFlowGraph(JSONSerializable, AbstractControlFlowGraph): + # where to start + start_label: str + + # map from segment label -> inbound jumps + inbound: dict[str, list[Jump]] + + # label -> segment + segment_map: dict[str, Segment] + + @classmethod + def from_parser(cls, parser: parse.Parser): + start_elem = parser.get_start_elem() + start_label = get_name(start_elem) + visited_labels = [] + visited_elems = set() + segment_map = {} + to_visit = [start_elem] + + while len(to_visit) > 0: + + curr_elem = to_visit.pop(0) + curr_segment = Segment.build_from_parser(parser=parser, + elem=curr_elem) + + segment_map[curr_segment.label] = curr_segment + + # add segment label to visited + if curr_segment.label not in visited_labels: + visited_labels.append(curr_segment.label) + + visited_elems.update(curr_segment.traversed) + + # update to_visit with new jumps + for jmp in curr_segment.jumps: + tgt = jmp.target + tgt_elem = parser.get_by_name(tgt) + if tgt not in visited_labels and tgt_elem not in to_visit: + to_visit.append(tgt_elem) + + # The resulting Segments are fine except for + # gotos leading to duplicates. These are fixed here. + _fix_duplicates(segment_map) + + # Now generate inbound: + inbound = {} + + for seg in segment_map.values(): + for jmp in seg.jumps: + if jmp.target in inbound: + inbound[jmp.target].append(jmp) + else: + inbound[jmp.target] = [jmp] + + return ControlFlowGraph(start_label=start_label, + inbound=inbound, + segment_map=segment_map) + +def get_crawl_data(cfg: ControlFlowGraph) -> \ + tuple[tuple[CrawlStep, ...], + tuple[CrawlStep, ...], + dict[str, list[CrawlStep]]]: + """Builds crawl schedule + + Args: + cfg: Control Flow Graph + + Returns: + (tuple of crawl steps, tuple of terminal steps, dict of element to list of crawl steps) + + """ + + generator = _crawl_iter(cfg) + crawl_steps = [] + terminal_steps = [] + el_2_cs = dict() # mapping el_name to list of crawl steps + step = 0 + max_visit = 100 + + for (visitor, segment) in generator: + + if segment.is_terminal: + el = segment.traversed[-1][0] + tag = segment.traversed[-1][1] + cs = CrawlStep( + step=step + len(segment.traversed) - 1, + visitor=visitor, + element_name=el, + element_tag=tag, + local_index= len(segment.traversed) - 1 + ) + + terminal_steps.append(cs) + + for local_index, (el_name, el_tag) in enumerate(segment.traversed, start=0): + cs = CrawlStep( + step=step, + visitor=visitor, + element_name=el_name, + element_tag=el_tag, + local_index=local_index + ) + + crawl_steps.append(cs) + + vals = el_2_cs.get(el_name, None) + if vals is None: + el_2_cs[el_name] = [cs] + else: + vals.append(cs) + + step += 1 + + return tuple(crawl_steps), tuple(terminal_steps), el_2_cs + +def get_visits_statistics(visit_map: dict[str, Jump | None], cfg: ControlFlowGraph): + # first check that every label has been visited: + missed = [] + for label in cfg.segment_map: + if len(visit_map[label]) == 0: + print(f"not visited: {label}") + missed.append(label) + + # check that every jump has been traversed: + missing_inbound = [] + inbound = cfg.inbound + for label in inbound: + label_visits = visit_map[label] + try: + visit_tuples = {(x.current_label, cfg.segment_map[x.previous_label].traversed[-1][0]) for x in label_visits + if 
x.previous_label is not None} + inbound_tuples = {(x.target, x.src_name) for x in inbound[label]} + except: + print(f"{traceback.format_exc()}") + raise + for inbound_t in inbound_tuples: + if inbound_t not in visit_tuples: + missing_inbound.append(inbound_t) + if len(missing_inbound) > 0: + [print(f"missing inbound jumps: {x}") for x in missing_inbound] + + # get total number of visits: + all_visits = 0 + for visit in visit_map.values(): + all_visits = all_visits + len(visit) + + all_inbound = 0 + for x in cfg.inbound.values(): + all_inbound = all_inbound + len(x) + + report_str = (f"total number of visits: {all_visits}\n" + f"total number of visits per node: {all_visits / len(visit_map)}\n" + f"total number of visits per inbound: {all_visits / max(all_inbound, 1)}\n" + f"total number of missed inbound: {len(missing_inbound)}") + + return missed, missing_inbound, report_str + + +def _get_crawl_visits(cfg: ControlFlowGraph) -> dict[str, list[BranchVisitor]]: + """For testing and analysis. + + Args: + cfg: control flow graph + + Returns: + map from label to BranchVisitor + """ + # for testing and analysis + # initialize visits + visits = {label: [] for label in cfg.segment_map.keys()} + visits[cfg.start_label] = [BranchVisitor(cfg.start_label, previous_label=None)] + + for visitor, segment_names in _crawl_iter(cfg=cfg): + visits[visitor.current_label].append(visitor) + + return visits + + +def _crawl_iter(cfg: ControlFlowGraph) -> Generator[tuple[BranchVisitor, Segment], None, None]: + """crawls CFG + + Args: + cfg: control flow graph + + Yields: + current Branch visitor (that points to the current segment), + the segment (list of flow elements to process, and outgoing visitors) + """ + + label = cfg.start_label + visitor = BranchVisitor(label, previous_label=None) + worklist = [] + visitor_counts = {} + visited_jumps = [] + first_seen_inbound = {} #: segment_label -> visitor.previous_label + + while len(worklist) > 0 or visitor is not None: + if visitor is None and len(worklist) > 0: + # nowhere to jump, so pull from worklist, but pull intelligently to prioritize + # unvisited edges. 
If no edge is unvisited, then pick the first element in the list + x = next((x for x in enumerate(worklist) + if (x[1].previous_label, x[1].current_label) not in visited_jumps), + (0, worklist[0]) + ) + + worklist.pop(x[0]) + visitor = x[1] + + curr_label = visitor.current_label + prev_label = visitor.previous_label + visited_jumps.append((prev_label, curr_label)) + + # skip orphaned references + if curr_label not in cfg.segment_map: + visitor = None + continue + + # emergency brake + if curr_label not in visitor_counts: + visitor_counts[curr_label] = 1 + else: + visitor_counts[curr_label] += 1 + if visitor_counts[curr_label] > MAX_VISITS_PER_SEGMENT: + logger.critical(f"Attempting to visit {curr_label} {visitor_counts[curr_label]} " + f"times, stopping this visitor.") + visitor = None + continue + + segment = cfg.segment_map[visitor.current_label] + + yield visitor, segment + + # todo: cache this + if segment.label == '*': + is_multiple = False + else: + inbounds = cfg.inbound[segment.label] + if len(inbounds) <= 1: + is_multiple = False + + elif curr_label not in first_seen_inbound: + is_multiple = False + first_seen_inbound[curr_label] = prev_label + + else: + is_multiple = prev_label != first_seen_inbound[curr_label] + + next_visitors = segment.accept(visitor, multiple_inbound=is_multiple) + + if next_visitors is None or len(next_visitors) == 0: + visitor = None + """ + # no more visitors means current branch is exhausted + # if the current branch was not visited, then yield it now + history = visitor.history + ((visitor.previous_label, visitor.current_label),) + last_visitor = dataclasses.replace(visitor, + previous_label=label, + current_label=None, + history=history + ) + yield last_visitor, segment + visitor = None + """ + else: + # depth-first search so take first branch and assign as current + visitor = next_visitors[0] + + # Add to worklist if not already in worklist + for i in range(1, len(next_visitors)): + if next_visitors[i] not in worklist: + worklist.append(next_visitors[i]) + + + + +def _find_segments_with_elem(val: str, segment_map: dict[str, Segment]) -> list[tuple[str, Segment, int]]: + """Find segments that also contain an element. + + Args: + val: string name of element + segment_map: label -> segment + + Returns: + + * list of segments that have this element along with their label + and the index of the found element in the form + (label, segment, dupe_index) + + * Empty set if no segments found + + """ + if segment_map is None or len(segment_map) == 0: + return [] + + to_return = [] + for label, seg in segment_map.items(): + if seg.traversed is None or len(seg.traversed) == 0: + continue + # Note segment gen. algorithm doesn't allow a value to appear + # more than once in the traversed history + for index, dupe_tuple in enumerate(seg.traversed): + if dupe_tuple == val: + to_return.append((label, seg, index)) + break + + return to_return + + +def _fix_duplicates(segment_map: dict[str, Segment]) -> None: + """segment surgery to merge duplicate paths + + Sometimes we have:: + + segment 1: A->B->C + segment 2: X->A->B->C + + Which should be turned into:: + + segment 1: A->B->C + segment 2': X :jump A + + Or if we have:: + + segment 3: X->Y->A + segment 4: W->B->A + + Then this should be merged into: + + segment 3': X->Y jump A + segment 4': W->B jump A + new segment: A + + Args: + segment_map: label -> Segment + + Returns: + None. 
(Segments updated in place) + """ + crawled = [] + segments = segment_map.values() + for segment in segments: + crawled = crawled + segment.traversed + + dupes = {x for x in crawled if crawled.count(x) > 1} + if len(dupes) == 0: + return + # el: string name of dupe flow element + # val: list (segment, index of traversed in segment) + processed = [] + for val in dupes: + if val in processed: + continue + + dupes = _find_segments_with_elem(val, segment_map) + new_segment = None + + # (segment, index) + for (label, segment, val_index) in dupes: + if val_index == 0: + # the dupe *starts* a segment, so it is the entire segment + new_segment = segment + else: + # the dupe is partway through the segment + subflows = [x for x in segment.subflows if x < val_index] + new_jump = Jump(src_name=segment.traversed[val_index - 1][0], + target=val[0], + is_loop=False, + is_goto=False, + is_no_more_values=False, + is_fault=False + ) + # replace the segment + segment_map[label] = Segment(label=segment.label, + traversed=segment.traversed[:val_index], + subflows=subflows, + jumps=[new_jump], + is_terminal=False) + # now, make the jump target + if new_segment is not None: + # we already have it, no need to add it. + pass + else: + # make it. All dupes of the same value must end in the same way + # so take the first + (seg_index, segment, val_index) = dupes[0] + new_segment = Segment(label=val[0], + traversed=segment.traversed[val_index:], + subflows=[x for x in segment.subflows if x >= val_index], + jumps=segment.jumps, + is_terminal=segment.is_terminal) + + segment_map[val[0]] = new_segment + + # add all the traversed elems to processed + # so we don't make more new segments unnecessarily + processed = processed + new_segment.traversed + +class CrawlEncoder(json.JSONEncoder): + def default(self, obj): + if (isinstance(obj, JSONSerializable) or isinstance(obj, BranchVisitor) + or isinstance(obj, CrawlStep)): + return obj.to_dict() + else: + return json.JSONEncoder.default(self, obj) + + +class Crawler(AbstractCrawler): + """Class representing the crawl of a graph + + """ + + def __init__(self, total_steps: int, cfg: ControlFlowGraph, + crawl_schedule: tuple[CrawlStep,...], + terminal_steps: tuple[CrawlStep,...], + history_maps: dict[tuple[tuple[str, str], ...], CrawlStep] | None, + flow_path: str, + el_2_cs: dict[str, list[CrawlStep]] | None = None): + """Constructor + + .. WARNING:: For module use only + + Args: + total_steps: how many steps in crawl + cfg: control flow graph + crawl_schedule: tuple of :class:`public.data_obj.CrawlStep` in order of execution + terminal_steps: tuple of :class:`public.data_obj.CrawlStep` + that can end program (note, *not* in any specific order) + """ + #: int current step of crawl + self.current_step: int = 0 + + #: int total number of steps + self.total_steps: int = total_steps + + #: control flow graph + self.cfg: ControlFlowGraph = cfg + + #: tuple(:ref:`public.data_obj.CrawlStep`) all crawl steps in order of execution + self.crawl_schedule: tuple[CrawlStep, ...] = crawl_schedule + + #: tuple(:ref:`public.data_obj.CrawlStep`) steps that can terminate the program + self.terminal_steps: tuple[CrawlStep, ...] 
= terminal_steps + + #: crawl_step -> last seen ancestor + self.history_maps: dict[tuple[tuple[str, str], ...], CrawlStep] = history_maps or {} + + #: previous crawlers, None if this is the first + #: if we are 3 frames deep, this is descending order: history = [(crawler 2, int 2), (crawler 1, int 1)] + self.crawler_history: list[tuple[Crawler, int]] | None = None + + #: file path + self.flow_path: str | None = flow_path + + #: + self.el_2_cs: dict[str, list[CrawlStep]] | None = el_2_cs + + @classmethod + def from_parser(cls, parser: parse.Parser): + """Builds a crawl schedule (recommended builder) + + Args: + parser: :obj:`flow_parser.parse.Parser` instance + + Returns: + :obj:`Crawler` instance + + """ + cfg = ControlFlowGraph.from_parser(parser) + crawl_schedule, terminal_steps, el_2_cs = get_crawl_data(cfg) + total_steps = len(crawl_schedule) + + return Crawler( + total_steps=total_steps, + cfg=cfg, + crawl_schedule=crawl_schedule, + terminal_steps=terminal_steps, + history_maps=None, + flow_path=parser.flow_path, + el_2_cs=el_2_cs + ) + + def get_control_influence_from_source(self, influenced_var: str, + source_var: var_t) ->tuple[var_t, ...] | None: + """Both the influenced and source variables are top level flow elements. + The influenced variable is in the current flow path, the source variable may be in + a different flow path (so we need the tuple (path, varname)). If the source variable control + influences the influenced_variable, then a chain of (path, element name) will be returned starting + at the source and leading to the influenced variable. The chain only contains branches + and subflow chains, not every step, but every step could be reconstructed if desired + by adding in the segment traversals. + + This must be run at every frame load, because a subflow may be loaded multiple times, with a larger + set of control influencers each time it is called. + + For example, in frame A, we have start --> branch 1, branch 2, and each branch may call the same subflow. + So a given element in the subflow will be control influenced by branch 1 the first time it is called, + and by both branch 1 and branch 2 the second time it is entered. So to get a global control + influencing answer, you need to call this function on every subflow load. You do not need to wait + until a given subflow is fully crawled, as the full crawl info is generated by the parser when the + flow is loaded. + + Args: + influenced_var (str): top level (traversable) flow element name in the flow crawled by this current crawler. + source_var (str, str): flow_path, element name in either the current flow or in another flow that may or + may not be an ancestor in the call chain. + + Returns: + None if there is no influence, or a set of crawl steps linking the source to the influenced. + Only a single chain of crawl steps is returned, there may be other control influence chains. + + """ + if source_var is None or influenced_var is None: + return None + + # case 1: (Local Analysis) everything is in the same flow + src_path = source_var[0] + if self.flow_path == src_path: + if influenced_var == source_var[1]: + # trivial case + return (src_path, influenced_var) + + res = self._get_local_control(influenced_var, source_var) + if res is None: + return None + else: + return tuple([(src_path, x) for x in res]) + + # case 2: the source is in a subflow descendent. Because all elements + # are connected to the start, we only care about the chain of start elements/subflow + # elems connecting to the source elem. 
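        # Illustrative example of this case-2 lookup (element and file names below are
        # hypothetical): with this crawler positioned in "child.flow-meta.xml",
        #
        #     crawler.get_control_influence_from_source(
        #         influenced_var="Delete_Record",
        #         source_var=("parent.flow-meta.xml", "Check_Access"),
        #     )
        #
        # would return a chain of (flow path, element name) pairs linking the decision
        # in the parent flow to the sink element in this flow, or None if the source
        # never control-influences the sink.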
+ + # set of (call-chain height, (crawler, crawler_step_index)) + candidates = [(index, x) for (index, x) in enumerate(self.crawler_history) if x[0].flow_path == src_path] + + if len(candidates) == 0: + return None + else: + for index, (crawler, step_index) in candidates: + tail = self._get_local_control(crawler.crawl_schedule[step_index].element_name, source_var[1]) + if tail is not None: + result = [(self.flow_path, influenced_var)] + # we have a chain from source -> subflow that exited the frame. + # Now, fill in until we get to the start element of the current frame. + for i in range(index): + c = self.crawler_history[i][0] + step = self.crawler_history[i][1] + elem = c.crawl_schedule[step].element_name + path = c.flow_path + result.append((path, elem)) + + [result.append((src_path, el)) for el in tail] + return tuple(result) + + return None + + def get_crawl_schedule(self)->tuple[CrawlStep, ...]: + return self.crawl_schedule + + def get_flow_path(self) -> str | None: + return self.flow_path + + def get_crawler_history_unsafe(self) -> list[tuple[Crawler, int]]: + """READ ONLY + + Returns: + history of crawlers encountered during crawl, together with the current step (int) + when they entered a child flow. + """ + return self.crawler_history + + def get_cfg(self)-> ControlFlowGraph: + return self.cfg + + def get_current_step_index(self)->int: + """Retrieve current crawl step (read-only)""" + return self.current_step + + + def load_crawl_step(self) -> CrawlStep | None: + """Retrieve the current crawl step and advance counter (irreversible) + + Returns: + :obj:`public.data_obj.BranchVisitor` and flow element name to process + + """ + if self.current_step >= self.total_steps: + return None + else: + to_return = self.crawl_schedule[self.current_step] + self.history_maps[to_return.visitor.history] = to_return + self.current_step += 1 + return to_return + + + def get_last_ancestor(self, crawl_step) -> CrawlStep | None: + """Get latest ancestor branch that was last visited at crawl_step + + Useful for knowing which influence map to clone + + Args: + crawl_step: step whose history is sought + + Returns: + CrawlStep instance or None + + """ + history = crawl_step.visitor.history + + # first check if we are moving forward or if we need to backtrack + + res = None + while res is not None: + res = dict.get(self.history_maps, history, None) + if len(history) == 0: + break + else: + history = history[:-1] + if res is None: + # not present + return None + else: + return res + + def get_elem_to_crawl_step(self, elem_name: str) -> list[CrawlStep]: + """returns a list of all crawlsteps in which this element has been visited + during the crawl of this flow. If not visited, the empty list is returned. + + Args: + elem_name (str): element name (use '*' for the start element) + + Returns: + list of :obj:`CrawlStep` instances that visit this element + + """ + if self.el_2_cs is None: + logger.error(f"requested element to crawlstep but " + f"the map has not been set for the crawler at {self.flow_path}") + return [] + else: + return dict.get(self.el_2_cs, elem_name, list()) + + def _get_local_control(self, influenced_el: str, influencer_el) -> tuple[var_t, ...] 
| None: + sink_crawl_steps = self.el_2_cs.get(influenced_el, []) + source_crawl_steps = self.el_2_cs.get(influencer_el, []) + + if len(sink_crawl_steps) == 0 or len(source_crawl_steps) == 0: + return None + else: + for source in source_crawl_steps: # + # Because of how this info is built, the first element is likely + # to have the smallest branch history, which speeds things up. + sink = sink_crawl_steps[0] + + sin_l = len(sink.visitor.history) + src_l = len(source.visitor.history) + if sin_l > src_l and (sink.visitor.history[0:len(source.visitor.history)] == source.visitor.history): + + # we choose jump target arbitrarily - it doesn't matter + # as long as we are consistent since this is for the auditor's own info + return ((source.element_name,) + tuple([x[1] for x in sink.visitor.history[src_l + 1:]]) + + (sink.element_name,)) + + elif sin_l == src_l and sink.visitor.history == source.visitor.history: + # Both the sink and source are already on the same segment, so + # we only need to check if src is dominant. + if sink.local_index > source.local_index: + # sink is downstream of source, so source dominates + return source.element_name, sink.element_name + else: + return None + + return None + +def dump_cfg(cfg: ControlFlowGraph, fp: TextIO) -> None: + """Writes to file pointer + + Args: + cfg (ControlFlowGraph): graph to serialize (JSON) + fp (TextIO): file pointer: + + Returns: + None + + """ + json.dump(cfg, indent=4, fp=fp, cls=CrawlEncoder) + +def validate_cfg(cfg: ControlFlowGraph, + parser: parse.Parser, missing_only=False) -> list[tuple[str, str]] | bool: + + # check that all elements are covered exactly once: + all_elems = parser.get_all_traversable_flow_elements() + all_elem_tuples = [(get_name(x), get_tag(x)) for x in all_elems] + crawled_elems = [] + + for segment in cfg.segment_map.values(): + crawled_elems = crawled_elems + segment.traversed + + # ..check there are no missing crawlable elements + missing = [x for x in all_elem_tuples if x not in crawled_elems] + + if missing_only: + return missing + + else: + # continue to gather other statistics + counts = {x: crawled_elems.count(x) for x in crawled_elems} + + # ..check there are no duplicates + duplicates = [x for x in crawled_elems if counts[x] > 1] + + if len(duplicates) != 0: + valid = False + print("invalid crawl info") + for x in duplicates: + print(f"duplicate: {x}") + else: + valid = True + for x in missing: + # some flows include disconnected elements that can't be crawled. + print(f"caution missing element found: {x}") + + return valid + +def _get_connector_map(elem: ET.Element, + parser: Parser) -> dict[ET.Element, tuple[str, ConnType, bool]]: + """ + Wrapper for getting connectors that handles start elements and missing + connector targets, which requires a parser. 
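As an editorial aside (not part of the patch): the dominance test that `_get_local_control` applies to branch histories can be illustrated with plain tuples standing in for `CrawlStep.visitor.history`; the helper name below is hypothetical.

```python
def history_dominates(source_history: tuple, sink_history: tuple) -> bool:
    """True when the source's branch history is a strict prefix of the sink's,
    i.e. every crawl path reaching the sink first passed through the source's branch."""
    return (len(sink_history) > len(source_history)
            and sink_history[:len(source_history)] == source_history)

# Example: a source visited under history (("d1", "out1"),) control-influences a sink
# visited under (("d1", "out1"), ("d2", "out2")), but not the other way around.
assert history_dominates((("d1", "out1"),), (("d1", "out1"), ("d2", "out2")))
assert not history_dominates((("d2", "out2"),), (("d1", "out1"),))
```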
+ + Args: + elem: element to search for connectors + parser: parser containing global file data + + Returns: + connector map (connector elem: name of target, type of connector, is_optional) + + """ + raw = get_conn_target_map(elem) + + # make sure the target elem exists + return {x: v for x, v in raw.items() if v[0] in parser.all_names} + +def tuple_trace(x: tuple[tuple[str, str], ...]) -> frozenset[tuple[str, str]]: + return frozenset([t for t in x]) + +def _right_find(my_iter: tuple[str, ConnType], val_to_find) -> int: + """ + returns -1 if val_to_find is not in the second value of my_iter + """ + iter_len = len(my_iter) + if iter_len == 0: + return -1 + else: + for index, x in enumerate(reversed(my_iter)): + if x[1] == val_to_find: + return iter_len - index + return -1 + diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/FlowSecurity_preset.txt b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/FlowSecurity_preset.txt similarity index 100% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/FlowSecurity_preset.txt rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/FlowSecurity_preset.txt diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/flowtest_query_data.txt b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/flow_scanner_query_data.txt similarity index 100% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/flowtest_query_data.txt rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/flow_scanner_query_data.txt diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/footer.out b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/footer.out similarity index 99% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/footer.out rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/footer.out index 9ea09e7d..1e5f496a 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/footer.out +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/footer.out @@ -2,7 +2,7 @@
-The flowtest result was generated by code at https://git.soma.salesforce.com/SecurityTools/FlowSecurityLinter
+The flow scanner result was generated by code at https://git.soma.salesforce.com/SecurityTools/FlowScanner
diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/header.out b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/header.out similarity index 100% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/data/header.out rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/data/header.out diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/executor.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/executor.py similarity index 74% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/executor.py rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/executor.py index 6997e9b3..61c907fc 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/executor.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/executor.py @@ -12,39 +12,60 @@ import logging import os import traceback -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, TypeAlias -import flowtest.control_flow as crawl_spec import flow_parser.parse as parse +import flow_scanner.control_flow as crawl_spec import public.parse_utils -from flowtest.control_flow import Crawler, ControlFlowGraph -from flowtest.branch_state import BranchState -from flowtest.query_manager import QueryManager, QueryAction +from flow_scanner.branch_state import BranchState +from flow_scanner.control_flow import Crawler, ControlFlowGraph +from flow_scanner.flows import FlowVector +from flow_scanner.query_manager import QueryManager, QueryAction +from flow_scanner.util import Resolver from public import parse_utils -from flowtest.util import resolve_name +from public.flow_scanner_exceptions import InvalidFlowException if TYPE_CHECKING: from public.parse_utils import ET from datetime import datetime -import flowtest.flows as flows +import flow_scanner.flows as flows -from flowtest import wire -from flowtest.flow_result import ResultsProcessor as Results +from flow_scanner import wire +from flow_scanner.flow_result import ResultsProcessor as Results -#: for debugging the flow being analyzed +#: Controls whether subflows are followed or not, useful for debugging. FOLLOW_SUBFLOWS: bool = True -#: whether we should rely on stored outputs instead of re-running the subflow +#: When calling subflows, we store the input variables that go into the subflow. +#: If subflows are pure functions, then calling them repeatedly with the same inputs +#: should result in the same output. This is the Carnac prediction. +#: When trust carnac is set to true, we skip running subflows if there is a cached +#: invocation with the same dataflow inputs for the same subflow. +#: +#: To disable this behavior, set to False. TRUST_CARNAC: bool = True -#: store outputs when re-running subflows and compare with predicted +#: For testing purposes, we store outputs when re-running subflows and compare with +#: the carnac prediction to see if there is a match. We have run carnac extensively on +#: on the flow corpus in the try only mode before deciding to trust carnac in production. TRY_CARNAC: bool = True #: logger for current module logger: logging.Logger = logging.getLogger(__name__) +# variables are identified by a tuple (flow_path, var name) +# as two flows can have the same variable. +var_g: TypeAlias = tuple[str, str] + +# At each variable there is a vectorized data structure called +# 'FlowVector' that contains all dataflows influencing that variable +# at any point in program execution. 
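Editorial note, not part of the patch: the subflow call cache described by the `TRUST_CARNAC`/`TRY_CARNAC` comments above (and consumed by `call_carnac` further down) can be sketched as follows; the paths and values are placeholders, and plain dicts stand in for `FlowVector` maps.

```python
# subflow path -> list of [inputs, outputs] pairs from prior invocations
cache: dict[str, list[list[dict]]] = {
    "flows/Child.flow-meta.xml": [
        [  # one prior call: [input dataflows, output dataflows]
            {("flows/Parent.flow-meta.xml", "inVar"): "vector-A"},
            {("flows/Child.flow-meta.xml", "outVar"): "vector-B"},
        ],
    ],
}

def predict(cache: dict, sub_path: str, inputs: dict):
    # mirrors the lookup in call_carnac: reuse cached outputs when the inputs match
    for pair in cache.get(sub_path, []):
        if pair[0] == inputs and len(pair) > 1:
            return pair[1]
    return None
```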
+# Therefore, the global list of all dataflows at any point in program +# execution is dict[tuple[str,str], FlowVector] +flow_vec_g: TypeAlias = dict[var_g, FlowVector] + class Stack(object): """The Stack handles subflow invocation. @@ -52,13 +73,13 @@ class Stack(object): When we pass to a subflow a new frame is pushed on the stack and when we return it is popped.""" - def __init__(self, root_flow_path: str, all_flow_paths: {str: str}, + def __init__(self, root_flow_path: str, resolver: Resolver, query_manager: QueryManager): """Constructor (can be used) Args: root_flow_path: current filename of flow being processed - all_flow_paths: map[flow_name] -> flow_path of all files in scope + resolver: map[flow_name] -> flow_path of all files in scope query_manager: invokes queries and stores results Results: @@ -67,25 +88,27 @@ def __init__(self, root_flow_path: str, all_flow_paths: {str: str}, """ #: tracks list of frames that need to be processed *after* current frame - self.__frame_stack: [Frame] = [] + self.__frame_stack: list[Frame] = [] #: stores frames that have been fully processed - self.__collected_frames: [Frame] = [] + self.__collected_frames: list[Frame] = [] #: subflows that have already been processed - #: subflow name --> Flow Vectors - #: {(str, str): flows.FlowVector}: - self.resolved_subflows: {str: {(str, str): flows.FlowVector}} = {} + #: subflow name --> global Flow Vectors at program exit + self.resolved_subflows: dict[str, flow_vec_g] = {} #: map from flow name to flow filepath (for subflow path lookup) - self.all_flow_paths: {str: str} = all_flow_paths + self.resolver: Resolver = resolver #: current frame being processed + self.current_frame: Frame = Frame.build(current_flow_path=root_flow_path, - all_flow_paths=all_flow_paths, + resolver=resolver, resolved_subflows=self.resolved_subflows, query_manager=query_manager) + + #: pointer to query manager so that it can be returned on exit self.query_manager: QueryManager = query_manager @@ -123,6 +146,7 @@ def run(self) -> QueryManager: """ while True: next_frame = self.current_frame.execute() + if next_frame is not None and not self.is_circular_reference(next_frame): # we have a function call and need to store the current frame on the stack self.push(self.current_frame) @@ -176,31 +200,38 @@ def is_circular_reference(self, next_frame: Frame) -> bool: return False flow_path = next_frame.flow_path seen = False + matching_frame = None # don't allow reference to something on the current stack - if flow_path in [f.flow_path for f in self.__frame_stack]: - seen = True - elif flow_path in [f.flow_path for f in self.__collected_frames]: - seen = True - - if seen is True: + # all_frames = self.__frame_stack + self.__collected_frames + for f in self.__frame_stack: + if flow_path == f.flow_path: + seen = True + matching_frame = f + break + + if seen: logger.critical(f"found circular reference in {next_frame.flow_path}") - + self.query_manager.lexical_accept("CyclicSubflow", + next_flow_path=flow_path, + current_frame=self.current_frame, + matching_frame=matching_frame, + all_frames=self.__frame_stack) return seen -def add_inputs_to_call_cache(cache: {str: [[{(str, str): flows.FlowVector}]]}, +def add_inputs_to_call_cache(cache: dict[str, list[list[flow_vec_g]]], sub_path: str, - val: {(str, str): flows.FlowVector} - ) -> {str: [[{(str, str): flows.FlowVector}]]}: + val: flow_vec_g, + ) -> dict[str, list[list[flow_vec_g]]]: """Store input values to subflow in cache Args: - cache: cache of subflow calls + cache: cached return values 
(subflow name -> flow_vec_g) sub_path: path of subflow val: input values to store Returns: - cache + updated cache """ if cache is None: @@ -210,18 +241,22 @@ def add_inputs_to_call_cache(cache: {str: [[{(str, str): flows.FlowVector}]]}, cache[sub_path] = [[val]] elif call_carnac(cache, val, subflow_path=sub_path, outputs=None) is None: + # we've checked that the inputs are not already in the cache cache[sub_path].append([val]) return cache -def add_outputs_to_call_cache(cache, inputs, added, flow_path) -> [[{(str, str): flows.FlowVector}]]: +def add_outputs_to_call_cache(cache: dict[str, list[list[flow_vec_g]]], + inputs: flow_vec_g, + added: flow_vec_g, + flow_path: str) -> dict[str,list[list[flow_vec_g]]]: """Store return values of subflow in call cache Args: cache: cached subflow inputs and outputs - inputs: inputs whose outputs are being added - added: vars to flow vectors to add + inputs: inputs whose outputs are being added to cache + added: vars to flow vectors to add to cache flow_path: filename of flow Returns: @@ -235,15 +270,16 @@ def add_outputs_to_call_cache(cache, inputs, added, flow_path) -> [[{(str, str): return cache -def call_carnac(input_cache: {str: [[{(str, str): flows.FlowVector}]]} or None, - vector_map: {(str, str): flows.FlowVector}, +def call_carnac(input_cache: dict[str, list[list[flow_vec_g]]] | None, + vector_map: flow_vec_g, subflow_path: str, - outputs: {(str, str): flows.FlowVector} = None) -> {(str, str): flows.FlowVector} or None: + outputs: flow_vec_g = None) -> flow_vec_g | None: """Predicts what the subflow will return Args: input_cache: cache of previous flow inputs - vector_map: subflow inputs + subflow_path ->[[input1, output1], [input2, output2], ] + vector_map: subflow inputs being called now subflow_path: filepath of subflow to be called outputs: outputs to add @@ -252,20 +288,25 @@ def call_carnac(input_cache: {str: [[{(str, str): flows.FlowVector}]]} or None, """ if input_cache is None or subflow_path not in input_cache: - # Carnac not ready + # Carnac not ready as cache is not populated yet return None - to_match = input_cache[subflow_path] + cached_flows = input_cache[subflow_path] # a list of flow_vec_g - for inputs in to_match: - if vector_map == inputs[0]: + for in_out_list in cached_flows: + + if vector_map == in_out_list[0]: + # we are calling the flow with input variables that + # are already in the cache if outputs is not None: - assert len(inputs) == 1 - inputs.append(outputs) + assert len(in_out_list) == 1 + + # add to input cache + in_out_list.append(outputs) return outputs - elif len(inputs) > 1: - return inputs[1] + elif len(in_out_list) > 1: + return in_out_list[1] return None @@ -296,14 +337,14 @@ class Frame(object): """ - def __init__(self, current_flow_path: str | None = None, all_flow_paths: {str: str} = None): + def __init__(self, current_flow_path: str | None = None, resolver: Resolver = None): #: this is a map : `(local flow_name, namespaced flow_name)` -> `flow_path` so #: that when we encounter a subflow we can load the file - self.all_flow_paths: {(str, str): str} = all_flow_paths + self.resolver: Resolver = resolver #: placeholder for fast-forward scans (not currently used) - self.resolved_subflows: {} = {} + self.resolved_subflows: dict[Any, Any] = {} #: path of flow we are working on, needed when labelling inputs/outputs self.flow_path: str = current_flow_path @@ -329,17 +370,25 @@ def __init__(self, current_flow_path: str | None = None, all_flow_paths: {str: s #: current state being processed self.state: 
BranchState | None = None - #: cache of input values of subflows called from this frame. - #: These are the input variables to each subflow, mapping - #: subflow_path -> [[input flow map, output flow map]] - #: where the flow map is the map from tuples to flow vectors - `{(str, str): FlowVector}` - #: corresponding to inputs and outputs each time the subflow is called - #: (hence a list of lists) - self.subflow_input_cache: {str: [[{(str, str): flows.FlowVector}]]} or None = None - - #: cache of output variables in the subflow + #: Caches the results of calling a subflow: + #: subflow_path -> [input flow map, output flow map]. + #: + #: Ideally a cache would be represented as a dict: key -> stored value + #: in this case (flow input dataflows -> flow output dataflows) but python + #: doesn't support dicts of dicts, and we are not certain of the carnac + #: assumption, e.g. that subflows are pure functions for purpose of our flow analysis + #: so in theory we may have (input vars -> output vars1, output vars2, ...) + #: Therefore we use a list, where the first entry is the input and all subsequent + #: entries are output. + #: + #: Thus, a list of lists [[input1, outpu1], [input2, output2], ...] + #: + #: Ideally, there will be only one or two entries in each list entry. + self.subflow_call_cache: dict[str, list[list[flow_vec_g]]] | None = None + + #: cache of output *variables* in the subflow #: subflow path -> [(path, var name)] - self.subflow_output_variable_cache = {str: [(str, str)]} + self.subflow_output_variable_cache: dict[str, list[tuple[str, str]]] | None = None #: store prediction of subflow outputs in child frame (for testing only) self.prediction = None @@ -347,18 +396,20 @@ def __init__(self, current_flow_path: str | None = None, all_flow_paths: {str: s #: store inputs of subflow in child frame (testing only) self.inputs = None + #: whether execution is async or no + self.is_async: bool = False + @classmethod def build(cls, current_flow_path: str | None = None, - all_flow_paths: {str: str} = None, - resolved_subflows: {} = None, + resolver: Resolver = None, + resolved_subflows: dict[Any, Any] = None, parent_subflow: ET.Element = None, query_manager: QueryManager = None) -> Frame: """Call this whenever program analysis starts or a subflow is reached Args: current_flow_path: current path of flow - all_flow_paths: map[(global flow_name, local flow_name): flow_path] for all flows in - scope to be scanned + resolver: Resolves subflows to be scanned resolved_subflows: subflows that have been already processed parent_subflow: current subflow element that spawned this frame @@ -372,7 +423,7 @@ def build(cls, current_flow_path: str | None = None, if current_flow_path is None: raise ValueError("called with null argument") - frame = Frame(current_flow_path=current_flow_path, all_flow_paths=all_flow_paths) + frame = Frame(current_flow_path=current_flow_path, resolver=resolver) # store subflow resolutions frame.resolved_subflows = resolved_subflows @@ -419,12 +470,12 @@ def update_parent_frame(self, parent_frame: Frame, output_vector_map) -> None: # update query_manager so it has the correct parser self.query_manager.parser = parent_frame.parser - subflow_output_vars = self.parser.output_variables + subflow_output_vars = list(self.parser.output_variables) # convert frozenset to list if parent_frame.subflow_output_variable_cache is None: - parent_frame.subflow_output_variable_cache = {self.flow_path: subflow_output_vars} + parent_frame.subflow_output_variable_cache = {self.flow_path: 
list(subflow_output_vars)} elif self.flow_path not in parent_frame.subflow_output_variable_cache: - parent_frame.subflow_output_variable_cache[self.flow_path] = subflow_output_vars + parent_frame.subflow_output_variable_cache[self.flow_path] = list(subflow_output_vars) output_variable_map = get_output_variable_map(subflow_elem=self.parent_subflow, subflow_output_vars=subflow_output_vars) @@ -440,10 +491,10 @@ def update_parent_frame(self, parent_frame: Frame, output_vector_map) -> None: prediction = self.prediction if prediction is None: logger.info("Have not seen these inputs before. Adding to cache.") - parent_frame.subflow_input_cache = add_outputs_to_call_cache(parent_frame.subflow_input_cache, - self.inputs, - output_vector_map, - self.flow_path) + parent_frame.subflow_call_cache = add_outputs_to_call_cache(parent_frame.subflow_call_cache, + self.inputs, + output_vector_map, + self.flow_path) elif prediction is not None and prediction == output_vector_map: logger.info("Carnac is right!") else: @@ -458,7 +509,7 @@ def update_parent_frame(self, parent_frame: Frame, output_vector_map) -> None: are propagated to the parent. """ - def get_consolidated_output_vars(self) -> {(str, str): flows.FlowVector}: + def get_consolidated_output_vars(self) -> dict[tuple[str, str], flows.FlowVector]: """get all output variable vectors from all terminal BranchStates. Call this method after flow processing has completed for a subflow @@ -496,8 +547,8 @@ def get_consolidated_output_vars(self) -> {(str, str): flows.FlowVector}: def spawn_child_frame(self, subflow: ET.Element, sub_path: str, - input_map: {str: str}, - vector_map: {(str, str): flows.FlowVector} + input_map: dict[str, str], + vector_map: dict[tuple[str, str], flows.FlowVector] ) -> Frame: """Spawn a child frame when entering subflow. 
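A minimal sketch (editorial; the helper name is hypothetical) of how a parent frame packages dataflow vectors for a child frame when it reaches a subflow element, mirroring the comprehension used in `process_subflow` below:

```python
def build_subflow_vector_map(state, flow_path: str, input_map: dict[str, str]) -> dict:
    """Collect the parent's FlowVector for each variable wired into the subflow,
    keyed by (flow_path, variable name) as the child frame expects.

    input_map maps parent variable name -> child input variable name."""
    return {(flow_path, parent_var): state.get_or_make_vector(parent_var)
            for parent_var in input_map}
```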
@@ -526,7 +577,7 @@ def spawn_child_frame(self, subflow: ET.Element, Args: sub_path: filepath of subflow being called - input_map: map of output variables in child to input variables of subflow + input_map: map of output variables in child to input variables of subflow elem in parent vector_map: map from tuple to the flow vectors that will be pushed into the child subflow: subflow xml element @@ -541,7 +592,7 @@ def spawn_child_frame(self, subflow: ET.Element, self.query_manager.parser = new_parser new_frame = Frame.build(current_flow_path=sub_path, - all_flow_paths=self.all_flow_paths, + resolver=self.resolver, parent_subflow=subflow, query_manager=self.query_manager ) @@ -550,6 +601,16 @@ def spawn_child_frame(self, subflow: ET.Element, output_vector_map=vector_map, src2tgt_variable_map=input_map, transition_elem=subflow) + # propagate crawl history to child + history = self.crawler.get_crawler_history_unsafe() + last_index = self.crawler.get_current_step_index()-1 # index always points to *next* step + + if history is None: + new_history = [(self.crawler, last_index)] + else: + new_history = history.insert(0, (self.crawler, last_index)) + + new_frame.crawler.crawler_history = new_history self.child_spawned = True return new_frame @@ -568,11 +629,11 @@ def handle_subflows(self, current_elem: ET.Element) -> Frame | None: """ - if FOLLOW_SUBFLOWS is False: + if not FOLLOW_SUBFLOWS: # For testing/debugging, turn off FOLLOW_SUBFLOWS return None - if self.child_spawned is True: + if self.child_spawned: # we are re-entering from a function call so update info and return: self.child_spawned = False @@ -593,7 +654,8 @@ def execute(self) -> Frame | None: """ # once, we run queries at flow start: - self.query_manager.query(action=QueryAction.flow_enter, state=self.state) + self.query_manager.lexical_query(parser=self.parser, crawler=self.crawler) + self.query_manager.query(action=QueryAction.flow_enter, state=self.state, crawler=self.crawler) while True: @@ -603,12 +665,6 @@ def execute(self) -> Frame | None: # we are done processing this flow return None - child_frame = self.handle_subflows(self.state.current_elem) - - if child_frame is not None: - # child frame only returned if handling a subflow element - return child_frame - # Now we have an element loaded and can proceed report(self.state, self.crawler.current_step, self.crawler.total_steps) @@ -618,6 +674,14 @@ def execute(self) -> Frame | None: # must be done *after* wiring. self.query_manager.query(action=QueryAction.process_elem, state=self.state) + # follow subflows if necessary + child_frame = self.handle_subflows(self.state.current_elem) + + if child_frame is not None: + # child frame only returned if handling a subflow element + return child_frame + + def process_subflow(self, current_elem): # If there is a problem, we return None and the parent @@ -625,14 +689,14 @@ def process_subflow(self, current_elem): try: sub_name = parse_utils.get_subflow_name(current_elem) - sub_path = resolve_name(self.all_flow_paths, sub_name=sub_name) - if sub_path == self.flow_path: - # Don't follow subflows that point to the same flow - return None + sub_path = self.resolver.get_subflow_path(sub_name=sub_name, flow_path=self.flow_path) + + assert sub_path != self.flow_path if sub_path is None: # We can't find the path of the sub flow, so don't process - # A log was already filed. 
+ logger.critical(f"No subflow path found for subflow {sub_name} " + f"called in flow {self.flow_path}") return None # parent variable name --> child input variable name @@ -641,16 +705,16 @@ def process_subflow(self, current_elem): # this is the vector map we want to push into the child: vector_map = {(self.flow_path, x): self.state.get_or_make_vector(x) for x in input_map} - prediction = call_carnac(input_cache=self.subflow_input_cache, + prediction = call_carnac(input_cache=self.subflow_call_cache, vector_map=vector_map, subflow_path=sub_path, outputs=None) - if TRY_CARNAC is True: + if TRY_CARNAC: # add inputs to cache: - self.subflow_input_cache = add_inputs_to_call_cache(self.subflow_input_cache, - sub_path, - vector_map) + self.subflow_call_cache = add_inputs_to_call_cache(self.subflow_call_cache, + sub_path, + vector_map) if TRUST_CARNAC is True and prediction is not None: output_variable_map = get_output_variable_map( subflow_elem=current_elem, @@ -681,7 +745,7 @@ def process_subflow(self, current_elem): return child_frame - except Exception as e: + except Exception: logger.critical("Error processing subflow:\n" + traceback.format_exc()) return None @@ -695,9 +759,11 @@ def parse_flow(flow_path: str, query_module_path: str = None, query_class_name: str = None, query_preset: str = None, + optional_queries: list[str] | None = None, query_manager: QueryManager | None = None, crawl_dir: str = None, - all_flows: {str: str} = None) -> QueryManager: + resolver: Resolver = None, + debug_query: str | None = None) -> QueryManager: """Main loop that performs control and dataflow analysis Args: @@ -710,10 +776,12 @@ def parse_flow(flow_path: str, query_module_path: path of module where custom queries are stored query_class_name: name of query class to instantiate query_preset: name of preset to run + optional_queries: list of optional queries to run query_manager: existing instance that invokes queries across entire run. Start with None and one will be created. 
crawl_dir: directory of where to store crawl specifications - all_flows: map flow name -> path of flow (used for looking up flow paths of subflows) + resolver: used for looking up flow paths of subflows + debug_query (str): pass this string to the query_manager constructor Returns: instance of ger_report.Result class that can be used to generate reports @@ -725,7 +793,7 @@ def parse_flow(flow_path: str, if crawl_dir is not None: cfg = ControlFlowGraph.from_parser(parser) - schedule = crawl_spec.get_crawl_schedule(cfg) + schedule = crawl_spec.get_crawl_data(cfg) cleaned_path = flow_path.replace(os.sep, "_") with open(os.path.join(crawl_dir, f"{cleaned_path}__crawl_schedule.json"), @@ -749,16 +817,22 @@ def parse_flow(flow_path: str, query_manager = QueryManager.build(results=results, parser=parser, requested_preset=query_preset, + additional_queries=optional_queries, module_path=query_module_path, - class_name=query_class_name) + class_name=query_class_name, + debug_query=debug_query) else: # we are continuing a run, so update parser to work on new file query_manager.parser = parser # build stack - stack = Stack(root_flow_path=flow_path, - all_flow_paths=all_flows, - query_manager=query_manager) + try: + stack = Stack(root_flow_path=flow_path, + resolver=resolver, + query_manager=query_manager) + except InvalidFlowException: + logger.error(f"Error parsing flow {flow_path}, skipping") + return query_manager # run program query_manager = stack.run() @@ -780,10 +854,10 @@ def report(state: BranchState, current_step: int, total_steps: int) -> None: logger.debug(msg) -def get_output_variable_map(subflow_elem: ET.Element, subflow_output_vars: [(str, str)]) -> {str: str}: +def get_output_variable_map(subflow_elem: ET.Element, subflow_output_vars: list[var_g]) -> dict[str, str]: # output_variable_map: child name --> parent name the child influences auto, output_variable_map = public.parse_utils.get_subflow_output_map(subflow_elem) - if auto is True: + if auto: # the output variable map will not be populated if auto is True, # so populate it now with output_var_name (in source) -> subflow_name.name (in parent) subflow_name = parse_utils.get_name(subflow_elem) @@ -793,7 +867,7 @@ def get_output_variable_map(subflow_elem: ET.Element, subflow_output_vars: [(str return output_variable_map -def _consolidate_collected_frames(old_frames: [Frame]) -> (BranchState,): +def _consolidate_collected_frames(old_frames: list[Frame]) -> tuple[BranchState,]: to_return = [] for frame in old_frames: to_keep = list(frame.crawler.terminal_steps) @@ -802,5 +876,5 @@ def _consolidate_collected_frames(old_frames: [Frame]) -> (BranchState,): return tuple(to_return) -def report_map(vec_map: {(str, str): flows.FlowVector}) -> str: +def report_map(vec_map: flow_vec_g) -> str: return '\n'.join([x.short_report() for x in vec_map.values()]) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/flow_metrics.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flow_metrics.py similarity index 85% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/flow_metrics.py rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flow_metrics.py index 0fae3238..058c793b 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/flow_metrics.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flow_metrics.py @@ -8,25 +8,22 @@ from __future__ import annotations +import configparser +import datetime import io +import logging +import os import pathlib - import 
pkgutil - +import shutil +import traceback from typing import TYPE_CHECKING # noinspection PyUnresolvedReferences import public.custom_parser as CP -import logging -import traceback +from flow_scanner import version from . import ESAPI -import os -import shutil -import codecs -import datetime -import configparser -from flowtest import version if TYPE_CHECKING: from public.data_obj import QueryDescription # Compatability: @@ -44,11 +41,11 @@ QUERY_FAILED = 'Query Failed to Complete' QUERY_TRUNCATED = 'Query Results Truncated' DEFAULT_PRIORITY = -1 -FLOWTEST_HOME = pathlib.Path(__file__).parent.resolve() +FLOW_SCANNER_HOME = pathlib.Path(__file__).parent.resolve() MAX_RESULTS = 500 QUERY_DESC = configparser.ConfigParser() -DEFAULT_DESC_CONFIG_PATH = "flowtest_query_data.txt" +DEFAULT_DESC_CONFIG_PATH = "flow_scanner_query_data.txt" SOFTWARE_PRESETS = {} # Query Sort dictionary @@ -67,7 +64,7 @@ } -def add_to_query_config(list_of_desc: [QueryDescription]) -> None: +def add_to_query_config(list_of_desc: list[QueryDescription]) -> None: """Adds query descriptions to module-level config file if not present Call after loading any queries from disk. Must pass @@ -127,13 +124,13 @@ def load_query_desc_from_config(path: str | None): ) -def add_to_presets(presets: [str], preset_name: str) -> None: +def add_to_presets(presets: list[str], preset_name: str) -> None: global SOFTWARE_PRESETS SOFTWARE_PRESETS[preset_name] = presets pass -def get_software_presets(preset_name: str) -> [str]: +def get_software_presets(preset_name: str) -> list[str]: """Returns empty list if no software presets found with this name Args: @@ -298,8 +295,8 @@ def _safe_prepend(filepath, data): # TODO: email exception? try: temp_file = filepath + "_tmp" - with codecs.open(filepath, mode='r', encoding='utf-8') as fp: - with codecs.open(temp_file, mode='w', encoding='utf-8') as tmp_fp: + with open(filepath, mode='r', encoding='utf-8') as fp: + with open(temp_file, mode='w', encoding='utf-8') as tmp_fp: tmp_fp.write('%s\n' % data) for line in fp: tmp_fp.write(line) @@ -425,7 +422,7 @@ def make_html(self, scan_results): '
                 '[HTML table markup elided in extraction]'
                 '[markup]'  # panel contains cols and rows
-                '[markup] Flowtest Results [markup]'
+                '[markup] Flow Scanner Results [markup]'
                 '[markup]'
                 '[markup]'
                 '[markup]
Job Type: ' @@ -547,6 +544,7 @@ def _report_append(element, report_fp, source_dir='None', tallies=None): source = snippet.find('Line').find('Code').text.strip() filename = element.find('FileName').text + flow_type = element.find('FlowType').text node_id = str(element.find('NodeId').text) name = element.find('Name').text column = str(element.find('Column').text) @@ -565,9 +563,9 @@ def _report_append(element, report_fp, source_dir='None', tallies=None): else: data = ('
                     '[HTML markup elided in extraction]'
                     '[markup] Object: ' + ESAPI.html_encode(truncate(name)) + '[markup]'
-                    ' in file: ' + ESAPI.html_encode(filename) +
+                    ' in ' + flow_type + ' flow at: ' + ESAPI.html_encode(filename) +
                     '[markup] ' + ESAPI.html_encode(source) +
                     '[markup]
\n') _safe_append(report_fp, data) @@ -611,7 +609,7 @@ def _add_source(source_dir, filename, target_line_no, obj_name): curr_source = prev_source = None - with codecs.open(normalized_path, mode='r', encoding="utf-8") as source_fp: + with open(normalized_path, mode='r', encoding="utf-8") as source_fp: for line_no, source_line in enumerate(source_fp): if line_no == (target_line_no - 1): prev_source = source_line @@ -666,7 +664,7 @@ def _update_results(scan_results, failed_scans, preset): # Get all queries all_d = [] - # with codecs.open(os.path.join(FLOWTEST_HOME, 'data', preset + '_preset.txt'), encoding='utf-8') as fp: + # with codecs.open(os.path.join(FLOW_SCANNER_HOME, 'data', preset + '_preset.txt'), encoding='utf-8') as fp: # all_d = [query_path.strip() for query_path in fp] disk_preset = os.path.join('data', preset + "_preset.txt") if os.path.exists(disk_preset): @@ -719,7 +717,7 @@ def _make_query_desc(query_path): def _make_header(scan_results, jobinfo): """TODO: change to file builder""" logger.debug("_make_header invoked with scan_results of length:" + str(len(scan_results))) - # with open(os.path.join(FLOWTEST_HOME, 'data', 'header.out'), mode='r', encoding="utf-8") as fp: + # with open(os.path.join(FLOW_SCANNER_HOME, 'data', 'header.out'), mode='r', encoding="utf-8") as fp: # data = fp.read() data = pkgutil.get_data(__name__, os.path.join('data', 'header.out')).decode() data += jobinfo.make_html(scan_results) @@ -772,7 +770,7 @@ def _present_query_results(scan_results): def _make_footer(report_fp): - report_path = os.path.join(FLOWTEST_HOME, 'data', 'footer.out') + report_path = os.path.join(FLOW_SCANNER_HOME, 'data', 'footer.out') # with codecs.open(report_path, 'r') as fp: # data = fp.read() data = pkgutil.get_data(__name__, os.path.join('data', 'footer.out')).decode() @@ -883,7 +881,7 @@ def parse_results(xml_file=None, scan_end = normalize_time(scan_end) if report_path is not None: - report_fp = codecs.open(report_path, mode='a', encoding='utf-8') + report_fp = open(report_path, mode='a', encoding='utf-8') logger.info("opening " + report_path) if xml_file is None and xml_report_str is not None: @@ -972,150 +970,6 @@ def parse_results(xml_file=None, return jobinfo, scan_results -def _pre_parse(xml_file, - out_path, - throttle=True, - code_dir=None, - min_api_version=40.0 - ): - """Parses Cx xml results and generates new result file with pruned paths. - - Purging policy (performed in order) - =================================== - 1. collapse multiple paths with same similarity id - 2. remove consecutively repeated nodes within a path - 3. remove portion of path that revisits start point - 4. 
(after above) collapse paths with same start and end point - - Returns: - src_data - - """ - - context = None - root = None # reference - purged_nodes = 0 - purged_paths = 0 - src_data = dict() - - if os.path.exists(out_path): - os.remove(out_path) - - out_fp = codecs.open(out_path, mode='a', encoding='utf-8') - - context = CP.ET.iterparse(xml_file, events=('end', 'start')) - - # for deduplication of consecutive pathnodes that are the same line in a given result - curr_pathnode_sig = None - prev_pathnode_sig = None - - # for deduplication of any duplicate paths within a given query - curr_path_sig = None # path sig = signature meant to identify path - known_path_sig = set() # need to remember all results - skip_path = False # to avoid processing path nodes if we know we wont process paths - - # for deduplication of paths with same start and endpoint in a given query - curr_start_node = None - curr_end_node = None - known_path_ends = set() # need to remember [start, end] for paths - - out_fp.write('\n') - - event, root = next(context) - parent = root - # render root - out_fp.write(serialize('start', root)) - - for event, element in context: - - if event == 'start': - element.getparent = lambda p=parent: p - parent = element - - if element.tag == 'Query': - # we have a new query - # reset path sig: - curr_path_sig = None - known_path_sig = set() # Flush all known paths - skip_path = False - - curr_start_node = None - curr_end_node = None - known_path_ends = set() - - total = 0 # reset total results per query - - out_fp.write(serialize('start', element)) - - elif element.tag == 'Path': - # We have a new path - # reset pathnode sig - curr_pathnode_sig = None - prev_pathnode_sig = None - - # careful, we are at start of elem, but can see attributes - curr_path_sig = _get_signature(element) - - if curr_path_sig not in known_path_sig: - skip_path = False # display this path - else: - skip_path = True - - if event == 'end': - parent = element.getparent() - - if element.tag == 'Query': - out_fp.write(serialize('end', element)) - - elif element.tag == 'Path': - """ - At the end of each path, we write the entire - result to file, which means if there are two paths - with the same result, the result is written twice - - """ - if skip_path or total >= MAX_RESULTS: - # remove result from query - purged_paths += 1 - - else: - # calculate first and last - curr_start_node = _get_signature(element.getchildren()[0]) - curr_end_node = _get_signature(element.getchildren()[-1]) - - if (curr_start_node, curr_end_node) in known_path_ends: - # remove result from query - purged_paths += 1 - else: - known_path_ends.add((curr_start_node, curr_end_node)) - known_path_sig.add(curr_path_sig) - - # print entire result (inc all paths) to file - out_fp.write(serialize(None, parent)) - total += 1 - - elif element.tag == 'PathNode': - prev_pathnode_sig = curr_pathnode_sig - curr_pathnode_sig = _get_signature(element) - - if curr_pathnode_sig == prev_pathnode_sig or curr_pathnode_sig == (None, None): - element.getparent().remove(element) - purged_nodes += 1 - - elif element.tag == RESULT_ROOT_TAG: - out_fp.write(serialize('end', root)) - - if out_fp is not None: - logger.info("closing report file pointer") - # close file handle since we will open at beginning - out_fp.close() - - if context is not None: - del context - - return src_data - - def get_issues_for_org(scan_results, vuln_map): """Counts findings for each query diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flow_result.py 
b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flow_result.py new file mode 100644 index 00000000..89fcd10e --- /dev/null +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flow_result.py @@ -0,0 +1,497 @@ +"""Serializes results and interacts with + third party report processors + + @author: rsussland@salesforce.com +""" +from __future__ import annotations + +import copy +import json +import logging +import sys +from datetime import datetime +from typing import TextIO + +sys.modules['_elementtree'] = None +from public.custom_parser import ET, clean_string +import public.custom_parser as CP + +from flow_scanner import ESAPI +from flow_scanner import flow_metrics +from flow_scanner.version import __version__ +from public.data_obj import QueryResult, Preset, InfluenceStatementEncoder, InfluenceStatement +from public.enums import FlowType +DEFAULT_HELP_URL = "https://security.secure.force.com/security/tools/forcecom/scannerhelp" +DEFAULT_JOB_TYPE = "FlowSecurityCLI" + +logger = logging.getLogger(__name__) + + +class ResultsProcessor(object): + """Class storing all the information necessary for a report. + + This includes labelling information like the report requested, + scan start time, etc., as well as the results of the findings. + + The class contains methods to take this information and generate + json, xml and html reports. + """ + + def __init__(self, preset: Preset = None, requestor="System", report_label=None, + result_id="default", service_version=__version__, help_url=DEFAULT_HELP_URL): + + self.preset: Preset | None = preset + self.help_url: str = help_url + self.result_id: str = result_id # Id to assign to scan result, appears in reports + self.service_version: str = service_version # Version of job management system running scan jobs + self.email: str = requestor # email address of result recipient + + if report_label is None: + report_label = "flowscan run at %s" % str(datetime.now())[:-7] + # report label is a human-readable label assigned to this scan + self.friendly_name: str = report_label + self.counter: int = 0 + self.scan_start: str = str(datetime.now()) # should be overriden + self.scan_end: str = self.scan_start # should be overridden + + # deduplicated stored query results + self.stored_results: list[QueryResult] = [] + + # dictionary of results sorted by query_name + self.results_dict: dict[str, dict] | None = None + + # map from filepath to root element + self.root_map: dict[str, ET.Element] | None = None + + # xml report string + self.report_xml: str | None = None + + def get_root(self, filepath: str): + if self.root_map is not None and filepath not in self.root_map: + return self.root_map[filepath] + else: + try: + root = CP.get_root(filepath) + if self.root_map is None: + self.root_map = {filepath: root} + else: + self.root_map[filepath] = root + return root + + except: + logger.error("Failed to get root element from %s" % filepath) + return None + + def write_html(self, html_report_path: str): + """Writes html report to disk + + Args: + html_report_path: where to write html report + + Returns: + metrics (results) of issues sorted and counted. 
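A hedged usage sketch for the new `ResultsProcessor` (editorial; `preset` and `query_results` are placeholders for a populated `Preset` and a `list[QueryResult]`, not values from this patch):

```python
from flow_scanner.flow_result import ResultsProcessor

processor = ResultsProcessor(preset=preset, requestor="user@example.com")
processor.add_results(query_results)   # stored results are de-duplicated on insertion
print(processor.get_json_str())        # JSON serialization of the result dictionary
processor.write_html("report.html")    # popcrab-compatible HTML report (requires a valid preset)
```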
+ + """ + if self.report_xml is None: + self.get_cx_xml_str() + + if (self.preset is None or self.preset.preset_name is None + or len(self.preset.queries) == 0): + raise RuntimeError("Cannot generate html as no valid preset is set") + + presets = [x.query_id.strip() for x in self.preset.queries] + + # Notify metrics of which queries were run + flow_metrics.add_to_presets(preset_name=self.preset.preset_name, + presets=presets) + + # Load query descriptions in metrics + flow_metrics.add_to_query_config(list(self.preset.queries)) + + # now generate report + results = flow_metrics.parse_results(xml_report_str=self.report_xml, + failed_queries=None, + throttle=False, + report_path=html_report_path, + source_dir=None, + email_add=self.email, + friendly_name=self.friendly_name, + scan_start=self.scan_start, + scan_end=self.scan_end, + preset=self.preset.preset_name, + job_type=DEFAULT_JOB_TYPE, + service_version=self.service_version or __version__, + debug=False, + result_id=self.result_id, + help_url=self.help_url + ) + return results + + def dump_json(self, fp: TextIO) -> None: + """Write json string of results to file pointer + + Returns: + None + + """ + job_result = self._make_job_result() + json.dump(job_result, indent=4, fp=fp, cls=InfluenceStatementEncoder) + + def get_json_str(self) -> str: + """get json result string + + Returns: + string that serializes list of QueryResult objects + + """ + job_result = self._make_job_result() + + return json.dumps(job_result, indent=4, cls=InfluenceStatementEncoder) + + def get_cx_xml_str(self): + """Converts results to popcrab compatible report format + + Returns: + report xml string + """ + + id2path_dict = self._make_query_id_to_path_dict() + if self.results_dict is None: + self.gen_result_dict() + + result_dict = self.results_dict + + if result_dict is None or len(result_dict) == 0: + self.report_xml = '' + return self.report_xml + + result_str = '' + for query_id in result_dict: + results = result_dict[query_id] + + if len(results) > 0: + query_path = ESAPI.html_encode(id2path_dict[query_id]) + query_name = ESAPI.html_encode(result_dict[query_id][0]['query_name']) + result_str += f'' + for flow_result in results: + statements = flow_result["flow"] + var_name = flow_result["elem_name"] + code = flow_result["elem_code"] + line = flow_result["elem_line_no"] + filename = flow_result["filename"] + flow_type = flow_result["flow_type"] + counter = flow_result["counter"] + field = flow_result["field"] + if field is not None: + var_name = field + + if statements is None: + # this is a lexical query, therefore + # the following are required + assert var_name is not None + assert code is not None + assert line is not None + assert filename is not None + result_str += make_path_node_header(filename=filename, + flow_type=flow_type, + similarity_id=counter) + result_str += render_html_pathnode(filename=filename, + flow_type=flow_type, + influenced_var=var_name, + code=code, + line=line, + node_id=counter) + + elif len(statements) == 1 and code is not None: + # We have a single statement query with code provided in the + # top level, so we want to generate a two node report with the + # first node in the query result top level and the second node + # the influencer of the statement + assert var_name is not None or filename is not None or line is not None + + result_str += make_path_node_header(filename=filename, flow_type=flow_type, similarity_id=counter) + + # make the first node from the top level query result + result_str += 
render_html_pathnode(filename=filename, + flow_type=flow_type, + influenced_var=var_name, + line=line, + node_id=counter, + code=code) + # add second node from statement + result_str += render_normal_dataflow_html(statements, flow_type, start_node_id=1) + + + else: + start_path = statements[0].source_path + + result_str += make_path_node_header(filename=start_path, flow_type=flow_type, similarity_id=counter) + result_str += render_normal_dataflow_html(statements, flow_type) + + # End Loop over histories (nodes within a path) + result_str += "" + result_str += "" + # End loop over results (paths) + result_str += "" + # End all loops + result_str += "" + + self.report_xml = _validate_and_prettify_xml(result_str) + + return self.report_xml + + def add_results(self, query_results: list[QueryResult]) -> None: + """Add results to processor + + Stores results internally for simple de-duplication. + All we do is use datapath equality, so please don't put + unique comment strings containing things like step number + or timestamps into influence statements, as they wont be + de-duped. + + Args: + query_results: list of Query-Result objects + + Returns: + None + """ + query_results = _validate_qr(query_results) + + if query_results is None: + return + if self.stored_results is None: + self.stored_results = list(set(query_results)) + else: + self.stored_results = list(set(self.stored_results + query_results)) + + def gen_result_dict(self) -> dict[str, dict[str, str]]: + """Sorts results into query buckets + + Used internally to generate popcrab compatible + xml and html report formats. + + Also useful for testing + + Returns: + dictionary of the form:: + + query_id -> {flow: tuple of DataInfluenceStatements or None (in case this is a dataflow) + query_name: (human_readable), + counter: (fake similarity id), + elem: source code of element, + elem_name: name of Flow Element, + elem_code: source code of element, + elem_line_no: line number of element, + field: name of influenced variable (if any) within the element, + } + + """ + + query_results = self.stored_results + accum = {} + if query_results is None or len(query_results) == 0: + return {} + + for query_result in query_results: + query_desc = self._get_query_desc_from_id(query_result.query_id) + end_stmt = query_result.influence_statement + + query_path = query_result.query_id + src_code = query_result.elem_code + src_line = query_result.elem_line_no + flow_type = query_result.flow_type.name + + + if end_stmt is not None: + src_code_end = clean_string(end_stmt.source_text) + elem_name_end = end_stmt.element_name + field_end = end_stmt.influenced_var + else: + src_code_end = None + elem_name_end = None + field_end = None + + # Initialize + if query_path not in accum: + accum[query_path] = [] + + to_append = {"query_name": query_desc.query_name, + "severity": str(query_desc.severity), + "description": query_desc.query_description, + "counter": self.counter, + "elem": clean_string(query_result.elem_name), + "elem_name": query_result.elem_name or elem_name_end, + "field": query_result.field or query_result.elem_name or field_end, + "elem_code": src_code or src_code_end, + "elem_line_no": src_line, + "filename": query_result.filename, + "flow_type": flow_type} + + if query_result.paths is None or len(query_result.paths) == 0: + if end_stmt is None: + # if there is no statement, this is a lexical query only and + # the data should be in the query top level structure. 
+ assert (query_result.elem_name is not None and + query_result.elem_code is not None and + query_result.elem_line_no is not None and + query_result.filename is not None) + + to_append["flow"] = None + accum[query_path].append(to_append) + self.counter += 1 + # process the next query result + continue + + else: + # if there are no paths but there is a statement, then + # build the query result from the statement. + statements = [(end_stmt,)] + + + else: + # if there are paths in the query, then there must be + # a query statement that contains the last portion of the path + assert query_result.paths is not None and len(query_result.paths) > 0 + + statements = [] + for path_ in query_result.paths: + pruned_history = tuple([x for x in path_.history if x.source_text != "[builtin]"]) + if path_.history[-1] != end_stmt and end_stmt.source_text != "[builtin]": + statements.append(pruned_history + (end_stmt,)) + else: + statements.append(pruned_history + (end_stmt,)) + + # Now we have our statements normalized and are prepared to render dataflows + for path_ in statements: + new_path = copy.deepcopy(to_append) + new_path["flow"] = path_ + new_path["counter"] = self.counter + accum[query_path].append(new_path) + + # TODO: this is a placeholder for real similarity analysis, if needed. + self.counter += 1 + self.results_dict = accum + return accum + + def _make_query_id_to_path_dict(self) -> dict[str, str]: + """Generate a dictionary from query_id to query_path + + e.g. foo bar -> foo\\bar: Version X + + Returns: + dictionary + """ + return {x.query_id: x.query_id.strip().replace(".", "\\") + f" Version: {x.query_version.strip()}" + for x in self.preset.queries} + + def _make_job_result(self): + if self.results_dict is None: + self.gen_result_dict() + + job_result = {"preset": self.preset.preset_name, + "help_url": self.help_url, + "result_id": self.result_id, + "service_version": self.service_version, + "flow_scanner_version": __version__, + "report_label": self.friendly_name, + "email": self.email, + "scan_start": self.scan_start, + "scan_end": self.scan_end, + "results": self.results_dict or {} + } + return job_result + + def _get_query_desc_from_id(self, query_id: str): + descriptions = self.preset.queries + for x in descriptions: + if x.query_id == query_id: + return x + raise ValueError(f"No query with id {query_id} is in the preset provided") + + +def _validate_and_prettify_xml(xml_str: str) -> str: + """Pretty print and validate generated xml string + + Args: + xml_str: string to validate + + Returns: + validated/beautified xml_string + """ + my_root = CP.get_root_from_string(bytes(xml_str, encoding='utf-8')) + ET.indent(my_root) + return CP.to_string(my_root) + + +def render_normal_dataflow_html(statements: tuple[InfluenceStatement, ...], flow_type: str, start_node_id: int = 0) -> str: + result_str = '' + for index, node in enumerate(statements, start=start_node_id): + filename = node.source_path + line = node.line_no + code = clean_string(node.source_text) + result_str += render_html_pathnode(filename=filename, + flow_type=flow_type, + influenced_var=node.influenced_var, + line=line, + node_id=index, + code=code) + return result_str + + +def render_html_pathnode(filename: str, flow_type: str, influenced_var: str, line: int, node_id: int, code: str) -> str: + result_str = f"{ESAPI.html_encode(filename)}" + result_str += f"{flow_type}" + result_str += f"{line}" + result_str += f"1" + result_str += f"{node_id}" + result_str += f"{ESAPI.html_encode(influenced_var)}" + result_str += f"{line}" + 
result_str += f"{ESAPI.html_encode(code)}" + return result_str + + +def make_path_node_header(filename: str, flow_type: str, similarity_id: int = 0) -> str: + return (f'' + f'') + + +def _validate_qr(qr_list: list[QueryResult]) -> list[QueryResult] | None: + """Checks query result for correctness + + Args: + qr_list: Query Result list to validate + + Returns: + list of valid QueryResults with invalid results removed + None if the list was None + """ + if qr_list is None or len(qr_list) == 0: + return None + + to_skip = set() + for index, qr in enumerate(qr_list): + if qr is None: + logger.critical(f"ERROR: an null query result was included in the result list" + f" {qr_list}") + to_skip.add(index) + if qr.query_id is None: + logger.critical(f"ERROR: received a query result without a query: {qr}") + to_skip.add(index) + if qr.influence_statement is None and qr.elem_code is None and qr.paths is None: + logger.critical(f"ERROR: received a query result without " + f"an influence statement, query code, or paths: {qr}") + to_skip.add(index) + if qr.paths is not None and not isinstance(qr.paths, frozenset): + to_skip.add(index) + logger.critical(f"ERROR: received a query result with a non-frozenset paths field") + + if len(to_skip) == 0: + return qr_list + else: + to_return = [qr_list[i] for i in range(len(qr_list)) if i not in to_skip] + if len(to_return) == 0: + return None + else: + return to_return + + diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/flows.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flows.py similarity index 92% rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/flows.py rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flows.py index 47958b54..d2918876 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/flows.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/flows.py @@ -11,16 +11,17 @@ from collections.abc import Callable from dataclasses import dataclass, replace -import flowtest.util -from flowtest.util import is_non_null, id_, match_all -from public.data_obj import DataInfluencePath +import flow_scanner.util +from flow_scanner.util import is_non_null, id_, match_all +from public.data_obj import InfluencePath +from public.contracts import AbstractFlowVector #: module logger logger = logging.getLogger(__name__) @dataclass(frozen=True, eq=True, slots=True) -class FlowVector: +class FlowVector(AbstractFlowVector): """Common data structure for both vectors and scalars. FlowVector supports vectorization, so that we can accurately @@ -35,13 +36,13 @@ class FlowVector: # For each default path, this list has the overrides. # An override is a map: "property name" --> {DataInfluencePaths} that # influence this property - property_maps: dict[DataInfluencePath: dict[str: set[DataInfluencePath]]] + property_maps: dict[InfluencePath, dict[str, set[InfluencePath]]] # TODO: revisit this later if a property spec is needed # property_spec: set[str] | None @classmethod - def from_flows(cls, default: {DataInfluencePath} = None) -> FlowVector: + def from_flows(cls, default: set[InfluencePath] = None) -> FlowVector: """Builds a vector from the provided flows. Flows must all have the same influencer_name and no flow can have a non-null influencer_property. @@ -64,7 +65,7 @@ def from_flows(cls, default: {DataInfluencePath} = None) -> FlowVector: ValueError if the flows have different influenced_name, or if default is empty. 
""" # we make an exception: - if isinstance(default, DataInfluencePath): + if isinstance(default, InfluencePath): default_ = {default} elif not isinstance(default, set): raise ValueError("Please call with set argument") @@ -120,7 +121,7 @@ def short_report(self, indent=2) -> str: return json.dumps(str_prop_map, indent=indent, sort_keys=True) - def report_dict(self) -> {str: {str: {str}}}: + def report_dict(self) -> dict[str, dict[str, set[str]]]: """get brief object dict with stringified flows flows are replaced with arrow and star notation @@ -145,7 +146,7 @@ def report_dict(self) -> {str: {str: {str}}}: return loaded - def get_flows_by_prop(self, member_name: str | None = None) -> {DataInfluencePath}: + def get_flows_by_prop(self, member_name: str | None = None) -> set[InfluencePath]: """Returns this vector's flows with the requested influenced property name. Args: @@ -160,7 +161,7 @@ def get_flows_by_prop(self, member_name: str | None = None) -> {DataInfluencePat defaults = set(self.property_maps.keys()) # match anything if None, or require an exact name match if one was requested - prop_match = flowtest.util.build_match_on_null(member_name) + prop_match = flow_scanner.util.build_match_on_null(member_name) # make sure everything matches as we need to know if there are no flows for this prop flow_match = match_all @@ -218,7 +219,7 @@ def add_vector(self, vector: FlowVector) -> FlowVector: can create a doubling of flow paths) via set-addition. Care must be taken when the generic case is the same but overrides - differ. Imagine program execution along two branches, followed + differ: Imagine program execution along two branches, followed by combining the branches (say a function return). Then we know that in reality, only one branch can be taken in an execution run, but there is the possibility of cross-contamination. E.g. @@ -235,10 +236,10 @@ def add_vector(self, vector: FlowVector) -> FlowVector: the state branch id in each path - and it is the examination of the overrides where fine-grained exclusion analysis should happen. - This choice forces us to create dummy (induced) overrides because - the nature of overrides is that they are always selected, so if `A.Name` has - an override, `foo` in path `A`, but not in path `B`, then we want to get both - `A.Name` and `foo` when requesting the override of the sum of path A and path B. + This choice forces us to create dummy (induced) overrides: If `A.Name` has + an override, `foo` in path 1, but not in path 2, then we want to get both + `A.Name` and `foo` when requesting the override of the sum. + This is because the sum is the possibility of taking either path. """ @@ -265,7 +266,7 @@ def add_vector(self, vector: FlowVector) -> FlowVector: return FlowVector(property_maps=new_property_map) - def push_via_flow(self, extension_path: DataInfluencePath, influenced_vec: FlowVector, + def push_via_flow(self, extension_path: InfluencePath, influenced_vec: FlowVector, assign: bool = True, cross_flow: bool = False) -> FlowVector: """Build new FlowVector with all influence paths in self pushed into ``vec`` via the extension_path. 
@@ -309,7 +310,7 @@ def push_via_flow(self, extension_path: DataInfluencePath, influenced_vec: FlowV if extension_path.influenced_property is None: # the entire vector is pushed pushed_vec = self._extend_by_path(flow=extension_path, cross_flow=cross_flow) - if assign is False: + if not assign: # we add the pushed values to the present values return influenced_vec.add_vector(pushed_vec) @@ -329,7 +330,7 @@ def push_via_flow(self, extension_path: DataInfluencePath, influenced_vec: FlowV accum = set() for flow_ in to_extend: accum.add( - DataInfluencePath.combine( + InfluencePath.combine( start_flow=flow_, end_flow=extension_path, cross_flow=cross_flow @@ -344,7 +345,7 @@ def push_via_flow(self, extension_path: DataInfluencePath, influenced_vec: FlowV # # - def _extend_by_path(self, flow: DataInfluencePath, cross_flow: bool = False) -> FlowVector: + def _extend_by_path(self, flow: InfluencePath, cross_flow: bool = False) -> FlowVector: """Creates a new flow vector by *pushing forward* this vector's flows. =========================== @@ -422,7 +423,7 @@ def _extend_by_path(self, flow: DataInfluencePath, cross_flow: bool = False) -> # structures are preserved. flow is A --> B # push default forward for curr_default in self.property_maps: - pushed_default = DataInfluencePath.combine( + pushed_default = InfluencePath.combine( start_flow=curr_default, end_flow=flow, cross_flow=cross_flow) # and push all property maps forward *if they exist* @@ -436,7 +437,7 @@ def _extend_by_path(self, flow: DataInfluencePath, cross_flow: bool = False) -> if (self.property_maps[curr_default][prop] is not None and len(self.property_maps[curr_default][prop]) > 0): new_property_maps[pushed_default][prop] = { - DataInfluencePath.combine( + InfluencePath.combine( start_flow=override, end_flow=_restrict(flow, prop), cross_flow=cross_flow @@ -459,7 +460,7 @@ def _extend_by_path(self, flow: DataInfluencePath, cross_flow: bool = False) -> # map: C.x->D # # we restrict: A.x->B.x->C.x, and then combine C.x->D - pushed_default = DataInfluencePath.combine( + pushed_default = InfluencePath.combine( start_flow=_restrict(curr_default, tgt_prop), end_flow=flow, cross_flow=cross_flow @@ -469,7 +470,7 @@ def _extend_by_path(self, flow: DataInfluencePath, cross_flow: bool = False) -> else: # There is an override for target prop, so push all its flows into the property_maps - pushed_defaults = [DataInfluencePath.combine( + pushed_defaults = [InfluencePath.combine( start_flow=x, end_flow=flow, cross_flow=cross_flow @@ -481,10 +482,10 @@ def _extend_by_path(self, flow: DataInfluencePath, cross_flow: bool = False) -> # end of if-statement return FlowVector(property_maps=new_property_maps) - def _search_props(self, defaults_matcher: Callable[[DataInfluencePath], bool] = is_non_null, + def _search_props(self, defaults_matcher: Callable[[InfluencePath], bool] = is_non_null, prop_matcher: Callable[[str | None], bool] = is_non_null, - flow_matcher: Callable[[DataInfluencePath | None], bool] = is_non_null, - action: Callable[[DataInfluencePath, str, DataInfluencePath], typing.Any] = id_ + flow_matcher: Callable[[InfluencePath | None], bool] = is_non_null, + action: Callable[[InfluencePath, str, InfluencePath], typing.Any] = id_ ) -> typing.Any: """Searches through FlowVector based on match conditions. 
@@ -590,7 +591,7 @@ def _search_props(self, defaults_matcher: Callable[[DataInfluencePath], bool] = else: return accum - def _assign_or_add_property_flows(self, flows: {DataInfluencePath}, assign: bool = True + def _assign_or_add_property_flows(self, flows: set[InfluencePath], assign: bool = True ) -> FlowVector: """Injects DataInfluencePaths into vector. @@ -637,7 +638,7 @@ def _assign_or_add_property_flows(self, flows: {DataInfluencePath}, assign: bool if self.property_maps[default_] is None or prop not in self.property_maps[default_]: _safe_add(new_property_maps, default_, flow, assign) - elif assign is True: + elif assign: new_property_maps[default_][prop] = {flow} else: @@ -658,9 +659,9 @@ def _sort_key(x): return x.short_report(arrows=True) -def _merge_override(default: DataInfluencePath, - first: {str: {DataInfluencePath}}, - second: {str: {DataInfluencePath}}) -> {str: {DataInfluencePath}}: +def _merge_override(default: InfluencePath, + first: dict[str, set[InfluencePath]], + second: dict[str, set[InfluencePath]]) -> dict[str, set[InfluencePath]] | None: """Take the property map for a specific default and combine it with another Args: default: default flow for this map @@ -701,8 +702,8 @@ def _merge_override(default: DataInfluencePath, def _build_action_restrict_if_no_prop(wanted_prop: str) -> Callable: - def action(default: DataInfluencePath, curr_prop: str | None, - flow: DataInfluencePath) -> (DataInfluencePath, DataInfluencePath): + def action(default: InfluencePath, curr_prop: str | None, + flow: InfluencePath) -> tuple[InfluencePath, InfluencePath] | None: # The matchers will ensure we have a prop-wanted prop match, # but we still need the wanted prop variable because a wanted prop @@ -739,9 +740,9 @@ def action(default: DataInfluencePath, curr_prop: str | None, """ -def _safe_add(my_prop_map: {DataInfluencePath: {str: {DataInfluencePath}}}, - my_default: DataInfluencePath, - flow: DataInfluencePath, assign: bool = True) -> None: +def _safe_add(my_prop_map: dict[InfluencePath, dict[str, set[InfluencePath]]], + my_default: InfluencePath, + flow: InfluencePath, assign: bool = True) -> None: """add function that provides the induced flow if needed Need to add the induced flow from the default @@ -759,7 +760,7 @@ def _safe_add(my_prop_map: {DataInfluencePath: {str: {DataInfluencePath}}}, """ prop = flow.influenced_property - if assign is True: + if assign: to_add = {flow} else: induced_flow = _restrict(my_default, prop) @@ -776,7 +777,7 @@ def _safe_add(my_prop_map: {DataInfluencePath: {str: {DataInfluencePath}}}, my_prop_map[my_default][prop].update({flow}) -def _safe_update(prop: str, x: set, old_map: {str: set}) -> None: +def _safe_update(prop: str, x: set, old_map: dict[str, set]) -> None: """Merges a set into a map at the specified property Args: @@ -796,7 +797,7 @@ def _safe_update(prop: str, x: set, old_map: {str: set}) -> None: old_map[prop].update(x) -def _restrict(dataflow: DataInfluencePath, prop: str) -> DataInfluencePath: +def _restrict(dataflow: InfluencePath, prop: str) -> InfluencePath: """Restricts path to a member property Args: diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/query_manager.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/query_manager.py new file mode 100644 index 00000000..ddd20272 --- /dev/null +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/query_manager.py @@ -0,0 +1,402 @@ +"""Responsible for loading and invoking query instances. 
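+
+    A QueryManager wraps the built-in QueryProcessor together with any optional
+    or debug queries requested by the caller, and forwards their results to the
+    ResultsProcessor.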
+ + @author: rsussland@salesforce.com + +""" +from __future__ import annotations + +import importlib +import json +import logging +import os +import traceback +import types +from importlib import machinery +from typing import Any + +import queries.default_query +import queries.optional_query +import queries.debug_query + + +from flow_parser.parse import Parser +from flow_scanner.util import case_insensitive_match +from flow_scanner.control_flow import Crawler +from flow_scanner.flow_result import ResultsProcessor +from public.contracts import QueryProcessor, State, AbstractQuery, AbstractCrawler, Query, LexicalQuery +from public.data_obj import Preset, PresetEncoder +from public.enums import QueryAction +logger = logging.getLogger(__name__) + + +# In the future, if other files are added, need to add here +ADDITIONAL_QUERY_MODULES = [ + (queries.optional_query, queries.optional_query.QUERIES), + (queries.debug_query, queries.debug_query.QUERIES) +] + + +class QueryManager: + # instance that performs queries and produces results + query_processor: QueryProcessor = None + + # stand-alone query_map action -> additional query instance + queries: dict[QueryAction, list[Query | LexicalQuery]] = None + + # stand-alone query map -> query_id -> query instance + flattened_queries: dict[str, Query | LexicalQuery] | None = None + + # instance that stores results and generates reports + results: ResultsProcessor = None + + # current parser associated to flow-file + parser: Parser = None + + # which preset to request + requested_preset: str = None + + # additional queries to perform + additional_queries: list[str] = None + + # lexical queries only run once per flow visit + visited_flows: list[str] = None + + query_module: Any = None + + class_name: str | None = None + + query_id_to_module_name: dict[str, str] | None = None + + debug_msg: str | None = None + + @classmethod + def build(cls, results: ResultsProcessor, + parser: Parser = None, + requested_preset: str | None = None, + additional_queries: list[str] | None = None, + module_path: str | None = None, + class_name: str | None = None, + debug_query: str | None = None) -> QueryManager: + """Only call this once to build Query Manager at scan start + """ + qm = QueryManager() + + if debug_query is not None: + qm.set_debug_query(debug_query) + + if module_path is not None: + # try to load requested query + # TODO: add better error handling + query_module = create_module(module_path=module_path) + + qm.query_module = query_module + qm.class_name = class_name + preset, instance = get_instance(query_module_=query_module, + class_name_=class_name, + preset_=requested_preset) + qm.requested_preset = requested_preset + + else: + # use default + instance = queries.default_query.DefaultQueryProcessor() + preset = instance.set_preset(preset_name=requested_preset) + + if preset is None: + raise RuntimeError(f"The loaded query module does not support preset: {preset or 'No preset provided'}") + + # store pointer to query processor + qm.query_processor = instance + res = build_query_map(additional_queries=additional_queries, debug_msg=debug_query) + qm.queries, qm.flattened_queries, qm.query_id_to_module_name = res + if qm.flattened_queries is not None: + qm.additional_queries = list(qm.flattened_queries.keys()) + # assign preset to results + results.preset = get_updated_preset(preset, additional_query_map=qm.queries) + + # store pointer to results + qm.results = results + qm.parser = parser + + return qm + + def reload(self): + """Make a new instance of the 
queries after completing one flow + + Returns: + None + """ + + if self.query_module is None or self.class_name is None: + # use default + self.query_processor = queries.default_query.DefaultQueryProcessor() + return + else: + preset, instance = get_instance(self.query_module, + self.class_name, self.requested_preset) + self.query_processor = instance + self.queries, self.flattened_queries, self.query_id_to_module_name = build_query_map( + additional_queries=self.additional_queries, debug_msg=self.debug_msg + ) + + def lexical_query(self, parser: Parser, crawler: AbstractCrawler=None) -> None: + if self.additional_queries is None: + return None + if QueryAction.lexical not in self.queries: + return None + flow_path = parser.flow_path + if self.visited_flows is not None and flow_path in self.visited_flows: + return None + else: + if self.visited_flows is None: + self.visited_flows = [flow_path] + else: + self.visited_flows.append(flow_path) + + to_run = self.queries[QueryAction.lexical] + for qry in to_run: + res = qry.execute(parser=parser, crawler=crawler) + if res is not None: + self.results.add_results(res) + return None + + def lexical_accept(self, query_id, **kwargs) -> None: + + if self.additional_queries is not None and query_id in self.additional_queries: + mod_name = self.query_id_to_module_name[query_id] + qry_class = getattr(mod_name,query_id) + + res = getattr(qry_class, 'accept')(**kwargs) + + if res is not None: + self.results.add_results(res) + else: + logger.info(f"The query id {query_id} is not recognized as a requested lexical query id") + + def query(self, action: QueryAction, state: State, crawler: Crawler = None) -> None: + """Invokes QueryProcessor to execute query and stores results + + Args: + action: type of invocation (flow entrance or element entrance) + state: current state + crawler: flow crawler object which has crawl schedule and cfg + + Returns: + None + """ + # TODO: add exception handling and logging as this is third party code + # when we first enter a state, there is a start elem which is not assigned and so curr elem is None. + # don't look for sinks into these start states. 
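+        # Dispatch order: the built-in QueryProcessor callbacks run first
+        # (handle_crawl_element for element visits, handle_flow_enter on flow entry),
+        # and any requested optional queries then run via _run_additional_queries.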
+ if action is QueryAction.process_elem and state.get_current_elem() is not None: + + res = self.query_processor.handle_crawl_element(state=state, crawler=crawler) + if res is not None: + self.results.add_results(res) + + elif action is QueryAction.flow_enter: + res = self.query_processor.handle_flow_enter(state=state, crawler=crawler) + # TODO: better validation of result + if res is not None: + self.results.add_results(res) + + self._run_additional_queries(action=action, state=state, + crawler=crawler, all_states=None) + + + + + def final_query(self, all_states: tuple[State]=None) -> None: + res = self.query_processor.handle_final(all_states=all_states) + # TODO: better validation of result + if res is not None: + self.results.add_results(res) + self._run_additional_queries(action=QueryAction.scan_exit, + all_states=all_states) + + # delete old query instance and reload for next flow to process + self.reload() + # delete old states + + def accept(self, query_id: str, **kwargs) -> None: + if query_id not in self.additional_queries: + return None + qry = self.flattened_queries[query_id] + + res = qry.accept(**kwargs) + if res is not None: + self.results.add_results(res) + return None + + def debug_query(self, msg: str): + self.debug_msg = msg + + def _run_additional_queries(self, action: QueryAction, state: State=None, + crawler: AbstractCrawler=None, all_states: tuple[State]=None) -> None: + if self.additional_queries is None: + return None + if action not in self.queries: + return None + else: + to_run = self.queries[action] + for qry in to_run: + res = qry.execute(state=state, crawler=crawler, all_states=all_states) + if res is not None: + self.results.add_results(res) + return None + + +def create_module(module_path: str) -> Any: + """Loads and Instantiates QueryProcessor + + Args: + module_path: location of module to load + + Returns: + QueryProcessor module + + Raises: + ValueError if module name cannot be parsed or preset not accepted + ImportError if the module cannot be loaded + + """ + if module_path is None: + # we'll build default + return None + + else: + # module should have a class with the same name as the module. 
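+        # The module name is derived from the file name below; the class to
+        # instantiate is looked up separately by get_instance() using class_name.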
+ filename = os.path.basename(module_path) + + if filename is None: + raise ValueError("Could not determine file to load") + + splits = filename.split('.py') + + if len(splits) != 2 or splits[-1] != '': + raise ValueError("File must end in .py") + + mod_name = splits[0] + try: + loader = importlib.machinery.SourceFileLoader(mod_name, module_path) + query_module = types.ModuleType(loader.name) + loader.exec_module(query_module) + return query_module + except Exception as e: + logger.critical(f"ERROR: could not load module {filename}: {traceback.format_exc()}") + raise e + + +def get_instance(query_module_, class_name_, preset_): + if query_module_ is None: + query_instance = queries.default_query.DefaultQueryProcessor() + + else: + try: + query_instance = getattr(query_module_, class_name_)() + + except Exception as e: + logger.critical(f"ERROR: could not instantiate module") + raise e + + try: + accepted_preset = query_instance.set_preset(preset_) + if accepted_preset is None: + raise ValueError("Could not set preset") + + else: + return accepted_preset, query_instance + + except Exception as e: + logger.critical(f"ERROR: could not set preset: {traceback.format_exc()}") + raise e + + +def build_query_map(additional_queries: list[str] | None=None, + debug_msg: str|None = None + ) -> tuple[dict[QueryAction, list[Query | LexicalQuery]], + dict[str, Query | LexicalQuery], dict[str,str]] | tuple[None, None, None]: + if additional_queries is None: + return None, None, None + else: + instance_map = {} + flat_map = {} + qry_to_mod = {} + for q_name in additional_queries: + for (my_module, qry_map) in ADDITIONAL_QUERY_MODULES: + match_ = case_insensitive_match(qry_map.keys(), q_name) + if match_ is not None: + qry_to_mod[match_] = my_module + if my_module is not queries.debug_query: + q_instance = getattr(my_module, match_)() + else: + q_instance = getattr(my_module, match_)(debug_msg) + action = q_instance.when_to_run() + if action not in instance_map: + instance_map[action] = [q_instance] + else: + instance_map[action].append(q_instance) + if match_ in flat_map.keys(): + raise ValueError(f"Duplicate query name: {q_name}") + else: + flat_map[match_] = q_instance + # stop looking in other modules for q_name + break + + if len(instance_map) == 0: + return None, None, None + else: + return instance_map, flat_map, qry_to_mod + + +def get_updated_preset(preset, additional_query_map: dict[QueryAction,list[AbstractQuery]]=None): + if additional_query_map is None: + return preset + else: + old_queries = preset.queries + for q_list in additional_query_map.values(): + for q in q_list: + if q is not None: + old_queries.add(q.get_query_description()) + + return Preset(preset_name=preset.preset_name, + preset_owner=preset.preset_owner, + queries=old_queries) + + +def get_all_optional_descriptions()-> str: + descriptions = [] + for (my_module, qry_map) in ADDITIONAL_QUERY_MODULES: + for q_name in qry_map.keys(): + q_instance = getattr(my_module, q_name)() + descriptions.append(q_instance.get_query_description()) + return (json.dumps(descriptions, indent=4, cls=PresetEncoder) + .replace('\\"', '"').replace('\\n', "\n")) + + +def validate_qry_list(qry_list: list[str]) -> bool | list[str]: + query_keys = [x[1].keys() for x in ADDITIONAL_QUERY_MODULES] + found_tkns = [] + missed_tkns = [] + for tkn in qry_list: + for query_key in query_keys: + match_ = case_insensitive_match(query_key, tkn) + if match_ is not None: + found_tkns.append(match_) + break + # tkn not found in any query key + missed_tkns.append(tkn) + 
valid = len(found_tkns) == len(qry_list) + if valid: + return True + else: + assert len(missed_tkns) != 0 + return missed_tkns + +def get_all_optional_queries() -> list[str]: + """Does not return debug queries + """ + accum = [] + for x in ADDITIONAL_QUERY_MODULES: + if x[0] is not queries.debug_query: + accum = accum + list(x[1].keys()) + + return accum diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/util.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/util.py new file mode 100644 index 00000000..98fa2005 --- /dev/null +++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/util.py @@ -0,0 +1,445 @@ +# +# +# +from __future__ import annotations + +import json +import logging +import os +import traceback +import typing +import uuid +from collections.abc import Callable +from dataclasses import fields +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any as Any + +from public.data_obj import VariableType +from public.enums import RunMode + +if TYPE_CHECKING: + pass + +FLOW_EXTENSION = ".flow-meta.xml" +PACKAGE_FLOW_EXTENSION = ".flow" +PROJECT_JSON_NAME = "sfdx-project.json" +PACKAGE_XML_NAME = "package.xml" + +CURR_DIR = os.getcwd() + +""" + Crawling limits +""" +MAX_WORKLIST_SIZE = 10000 # Emergency brake +MAX_STEP_SIZE = 100000 # Emergency brake + +logger = logging.getLogger(__name__) + + +def get_flows_in_dirs(root_dirs: str) -> list[str]: + """Searches recursively through for flows + + Args: + root_dirs: csv list of directories in which to search + + Returns: + list of all flows (recursively) + """ + flow_paths = [] + for root_dir in root_dirs.split(','): + for root, dir_names, filenames in os.walk(root_dir): + for filename in filenames: + if filename.endswith(".flow") or filename.endswith(".flow-meta.xml"): + f_path = os.path.abspath(os.path.join(root, filename)) + flow_paths.append(f_path) + + return flow_paths + + +def get_local_label(filename: str) -> str: + if filename.endswith(PACKAGE_FLOW_EXTENSION): + short = filename[:-5] + elif filename.endswith(FLOW_EXTENSION): + short = filename[:-14] + else: + short = filename + + local_label = short.split('-')[0] + return local_label + + +""" + Simple Variable Type Propagation +""" + + +def propagate(src_type: VariableType, dest_type: VariableType, **replacements) -> VariableType: + """Propagate attributes across flows. + + For example, if we know that a variable + of type 'Account' is passed into loop, then we want to remember + that the object type of this loop is Account. This works if we leave + all properties none unless we are certain of their values and then + adopt this simple method. Longer term, we may need to put conditional + logic, but now add a replacement field for manual override. + + Args: + src_type: start Variable Type + dest_type: end Variable Type + replacements: property overrides + + Returns: + Variable Type, modified with sources populating empty dest entries. + + """ + + prop_names = [x.name for x in fields(VariableType) if x is not None] + new_props = {x: dict.get(replacements, x) or getattr(dest_type, x) or getattr(src_type, x) for x in prop_names} + + return VariableType(**new_props) + + +""" + Transmission of context in subflows + + A master Flow running in system context will cause actions run in the SubFlow + to be run in system context as well, + regardless of whether the SubFlow was originally created and configured to run in user context. 
+    A master Flow running in user context that has a SubFlow running in system context
+    will proceed to run the actions in the SubFlow in system context.
+"""
+
+
+def make_id() -> str:
+    """Generates unique id strings
+
+    Returns:
+        8 digit unique id as str
+
+    """
+    return str(uuid.uuid4())[:8]
+
+
+def get_effective_run_mode(parent_sharing: RunMode | None, current_sharing: RunMode) -> RunMode:
+    if (parent_sharing is None or current_sharing is RunMode.SystemModeWithoutSharing or
+            current_sharing is RunMode.SystemModeWithSharing):
+        return current_sharing
+    else:
+        return parent_sharing
+
+
+def sane_index(my_tuple: tuple, to_match):
+    try:
+        index = my_tuple.index(to_match)
+    except ValueError:
+        index = -1
+
+    return index
+
+
+"""
+    Callables for dealing with property maps
+"""
+
+
+def is_non_null(entry) -> bool:
+    return entry is not None
+
+
+def is_null(entry) -> bool:
+    return entry is None
+
+
+
+def id_(*entry) -> typing.Any:
+    return entry
+
+
+def build_match_on_null(prop: str = None) -> Callable:
+    def prop_match(prop_to_match: str):
+        if prop is None:
+            return True
+        else:
+            return prop == prop_to_match
+
+    return prop_match
+
+
+def build_action_filter(include_default: bool = True,
+                        include_prop: bool = True,
+                        include_flow: bool = True) -> Callable:
+    def action(default, prop, flow):
+        accum = []
+        if include_default:
+            accum.append(default)
+        if include_prop:
+            accum.append(prop)
+        if include_flow:
+            accum.append(flow)
+        return tuple(accum)
+
+    return action
+
+
+def build_equality_match(to_match) -> Callable:
+    def equ_match(obj_to_match):
+        return obj_to_match == to_match
+
+    return equ_match
+
+
+def match_all(x) -> bool:
+    return True
+
+def safe_dict_list_append(a_dict: dict[Any, Any], key: Any, val: Any) -> dict[Any, list[Any]]:
+    if key not in a_dict:
+        a_dict[key] = [val]
+    else:
+        a_dict[key].append(val)
+    return a_dict
+
+def safe_dict_list_add(a_dict: dict[Any, Any], key: Any, list_val: list[Any]) -> dict[Any, list[Any]]:
+    if key not in a_dict:
+        a_dict[key] = list_val
+    else:
+        a_dict[key] = a_dict[key] + list_val
+    return a_dict
+
+
+def safe_list_add(a_list, b_list) -> list | None:
+    # this should be in standard library
+    if a_list is None and b_list is None:
+        return None
+    elif a_list is None:
+        return b_list
+    elif b_list is None:
+        return a_list
+    else:
+        return a_list + b_list
+
+
+class Resolver(object):
+    def __init__(self, all_flow_paths: list[str]) -> None:
+        self.all_flow_paths = all_flow_paths
+
+        #: folder path -> namespace
+        cached_namespace_lookups = dict()
+
+        resolver_map = dict()
+        for f_path in all_flow_paths:
+            # first check cache
+            try:
+                ns = next((cached_namespace_lookups[x] for x in cached_namespace_lookups.keys() if f_path.startswith(x)), None)
+
+                # then check for sfdx-project.json or package.xml files
+                if ns is None:
+                    ns, cached_namespace_lookups = update_folder_ns(f_path, cached_namespace_lookups)
+
+                # Now check directory structure
+                scope, ns2, label = get_scope_ns_label(f_path, cached_namespace_lookups)
+
+                # pick the best guess (ns2 is always not None)
+                ns = ns or ns2
+
+                if scope not in resolver_map:
+                    resolver_map[scope] = {(ns, label): f_path}
+                else:
+                    # we assume the same directory/namespace does not have two files
+                    # with the same label
+                    resolver_map[scope][(ns, label)] = f_path
+            except:
+                logger.critical(f"Failed to resolve {f_path}, it will be skipped.")
+                continue
+
+        self.resolver_map = resolver_map
+        self.cached_namespace_lookups = cached_namespace_lookups
+
+    def get_subflow_path(self, sub_name: str, flow_path:
str) -> str | None: + # check if there is a namespace in the sub_name + to_match = sub_name.lower() + splits = to_match.split("__") + + if len(splits) > 1: + # the target subflow is referenced by namespace + ns = splits[0] + label = splits[1] + # check all scopes for this namespace + for scope in self.resolver_map: + if (ns, label) in self.resolver_map[scope]: + return self.resolver_map[scope][(ns, label)] + # No match found + return None + + else: + # There is no namespace defined in the target subflow, + # so check in the local scope and local namespace + # for a full name match + self_scope, self_ns, label = get_scope_ns_label(flow_path, self.cached_namespace_lookups) + if (self_ns, to_match) in self.resolver_map[self_scope]: + return self.resolver_map[self_scope][(self_ns, to_match)] + else: + return None + +def get_scope_ns_label(f_path, cached_namespace_lookups): + ns = next((cached_namespace_lookups[x] for x in cached_namespace_lookups.keys() if f_path.startswith(x)), None) + + # then check for sfdx-project.json or package.xml files + if ns is None: + ns, cached_namespace_lookups = update_folder_ns(f_path, cached_namespace_lookups) + + # Now check directory structure + parts = Path(f_path).parts + if len(parts) < 3: + raise RuntimeError("paths must be absolute and contained in a folder: %s" % f_path) + + if parts[-3] == 'flows': + namespace = ns or parts[-2].replace('-', '_').lower() + scope = '' + else: + scope = os.path.dirname(f_path) + namespace = ns or '' + + label = get_local_label(parts[-1]).lower() + + return scope, namespace, label + + +def update_folder_ns(f_path: str, cached_namespace_lookups: dict[str, str]) -> tuple[str | None, dict[str, str]]: + """looks at the filepath and tries to find the namespace definition from either + the package manifest or project-json file. Stores the results in a cache + + We do not infer from folder structure yet, just from the config files. + + package manifest assumes a folder structure like this: + + top -> /flows/file + package.xml + -- xml and can load and look at PT1 + under the xml root, with xmlns: "http://soap.sforce.com/2006/04/metadata" + + but we also support + top -> second -> flows/file + + project-json assumes + sfdx-project.json at the top of the project next to force-app + -- can load and look at loaded["namespace"] + + and assumes a project structure of: + force-app -> first -> second -> flows/file + + Args: + f_path (): + cached_namespace_lookups (): + + Returns: + updated dict string (parent directory containing json/xml file) -> namespace + the parent directory is an absolute path normalized so that + we can tell whether a child flow is in this namespace by looking at + path_of_child.startswith(path_in_dict) and then assign the corresponding namespace to it. + + """ + f = Path(f_path) + # cached_namespace_lookups + if f_path.endswith('.flow-meta.xml'): + """ + this is a code-style layout or disorganized layout + in code we look for sfdx-project.json at root. + + layout can be: + 1) root -> force-app -> main -> default -> flows/myflow.flow + 'main' is for production code, and also replace with 'test' for test code + + 2) root -> pkg_dir -> main -> default -> flows/myflow.flow + 'pkg_dir' can live alongside force-app for multiple packages + 3) 'default' represents where code is pulled from, but packages can work with + other directories. + + so in general we want to look for 'flows' as the immediate directory containing + the code and then look up to 4 levels above where myflow.flow lives. 
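+
+        As an illustration (hypothetical path), for
+        /repo/force-app/main/default/flows/myflow.flow-meta.xml the walk below
+        checks the flows, default, main, force-app and repo directories for
+        sfdx-project.json.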
+        if no package-json is found, we return with no defined namespace.
+
+        """
+        f_parent = f
+        for i in range(5):
+            f_parent = f_parent.parent
+            if PROJECT_JSON_NAME in os.listdir(f_parent):
+                ns = get_ns_from_package_json(os.path.abspath(os.path.join(f_parent, PROJECT_JSON_NAME)))
+                if ns is not None:
+                    cached_namespace_lookups[str(f_parent)] = ns
+                return ns, cached_namespace_lookups
+        return None, cached_namespace_lookups
+
+    elif f_path.endswith('.flow'):
+        """
+        This is either a package-zip layout or repo layout. Only package zip
+        has a package xml.
+
+        We look for package.xml in the parent
+        of the dir (where 'flows') is stored. E.g.
+        root -> flows -> my_flow.flow
+        and search for package.xml in the root
+
+        """
+        f_parent = f
+        for i in range(3):
+            f_parent = f_parent.parent
+            if PACKAGE_XML_NAME in os.listdir(f_parent):
+                ns = get_ns_from_package_xml(os.path.abspath(os.path.join(f_parent, PACKAGE_XML_NAME)))
+                if ns is not None:
+                    cached_namespace_lookups[str(f_parent)] = ns
+                return ns, cached_namespace_lookups
+        return None, cached_namespace_lookups
+
+    else:
+        logger.critical(f"found illegal extension on flow file, skipping {f_path}")
+        return None, cached_namespace_lookups
+
+
+def get_ns_from_package_xml(package_path: str) -> str | None:
+    """returns namespace or None
+
+    Args:
+        package_path (str): path to 'package.xml'
+
+    Returns:
+        namespace or None (if no namespace prefix in package xml)
+
+    """
+    try:
+        with open(package_path, 'r') as package_xml:
+            lines = package_xml.readlines()
+            for line in lines:
+                index = line.find('<namespacePrefix>')
+                if index == -1:
+                    continue
+                index_end = line.find('</namespacePrefix>')
+                namespace_prefix = line[index + 17:index_end]
+                return namespace_prefix
+
+            return None
+    except:
+        logger.error(f"Failed to read package xml {package_path}\n{traceback.format_exc()}")
+        return None
+
+
+def get_ns_from_package_json(package_path: str) -> str | None:
+    try:
+        with open(package_path, 'r') as p:
+            json_data = json.load(p)
+            namespace_prefix = json_data['namespace']
+            if namespace_prefix is None or len(namespace_prefix) == 0:
+                return None
+            else:
+                return namespace_prefix
+    except:
+        logger.error(f"Failed to read package json {package_path}\n{traceback.format_exc()}")
+        return None
+
+
+def case_insensitive_match(list_a: list[str], to_match: str) -> str | None:
+    for item in list_a:
+        if item.lower() == to_match.lower():
+            return item
+    return None
+
diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/version.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/version.py
similarity index 74%
rename from packages/code-analyzer-flow-engine/FlowScanner/flowtest/version.py
rename to packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/version.py
index d5509a4c..694ee88e 100644
--- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/version.py
+++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/version.py
@@ -1,5 +1,5 @@
-1# Store the version here so:
+# Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 # 2) we can import it in setup.py for the same reason
 # 3) we can import it into your module
-__version__ = '0.9.5'
+__version__ = '0.9.9'
diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/wire.py b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/wire.py
new file mode 100644
index 00000000..1ba5bd0a
--- /dev/null
+++ b/packages/code-analyzer-flow-engine/FlowScanner/flow_scanner/wire.py
@@ -0,0 +1,446 @@
+"""performs dataflow wiring for flow elements
+
+ 
------------- + Wiring Policy + ------------- + + Wiring policy for flow elements + + 1. When we encounter any variable defined or initialized in an element, + we add that variable to the influence map + + 2. When dataflows _out_ of an element to another element, we wire the flow. + + We do not presently wire flows _into_ the current element, as we don't support + second order dataflow analysis. + + For example, user data may flow into the filter field + of a Get Records, and the return value of the function may be assigned to another variable. + Only the second dataflow is wired. If both flows were wired, we would have a second order flow, + e.g. assuming that the inputs to a function are part of the same dataflow as the return values, + which is rarely useful for dataflow analysis and generates misleading flows. + + If we are searching for dangerous flows *into* elements, this is done by the query processor, + which does not wire anything, it only searches for the flows. This is why our dataflow results + generally contain one missing step, which must be added by the QueryProcessor. + + This allows us to know, at any point in program execution which variables have been + initialized and also what the dataflow history of each variable is. + + +""" +import logging + +import flow_parser.parse as parse +from flow_scanner.branch_state import BranchState +from public import parse_utils +from public.data_obj import InfluenceStatement +from public.parse_utils import ET +from public.parse_utils import ns +from enum import Enum + +#: module logger +logger = logging.getLogger(__name__) + +class QueryResult(Enum): + IsAutoStore = 10 + OutputReferenceEls = 20 + OutputParametersEls = 30 + OutputAssignmentsEls = 40 + + +def initialize(state: BranchState, elem: ET.Element, elem_name: str) -> dict[QueryResult, bool | str | ET.Element]: + """Add this element name to influence map if it represents its own output data + + (Element name is passed in so we don't need to keep looking it up) + + Args: + state: current branch state + elem: current xml elem + elem_name: element name + + Returns: + None + + """ + tag = parse_utils.get_tag(elem) + auto_store = parse_utils.is_auto_store(elem) + result = { + QueryResult.IsAutoStore: auto_store, + QueryResult.OutputReferenceEls: parse_utils.get_by_tag(elem, 'outputReference'), + QueryResult.OutputParametersEls: parse_utils.get_by_tag(elem, 'outputParameters'), + QueryResult.OutputAssignmentsEls: parse_utils.get_by_tag(elem, 'outputAssignments') + } + + # always store if auto-store + if auto_store: + state.get_or_make_vector(name=elem_name, store=True) + + # also store if this is an element that performs some action, because then there + # will always be a created ref that evaluates to true if the action succeeds. + elif tag not in ['start', 'startElementReference', 'subflows', 'decisions', 'assignments']: + #TODO: need to audit this list for completeness + state.get_or_make_vector(name=elem_name, store=True) + + return result + +def wire(state: BranchState, elem: ET.Element) -> None: + """Wires influence statements and variable initialization. + + When the value of one variable changes based on another. + Once detected, this module extends the influence map by + each statement. 
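+
+    For example, an Assignment element that copies ``var_a`` into ``var_b``
+    results in an InfluenceStatement recording that ``var_b`` is influenced by
+    ``var_a`` at that element (the variable names here are illustrative).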
+ + Args: + state: current instance of Branch State + elem: Flow Element to be wired + + Returns: + None + + """ + if elem is None: + return None + + el_type = parse.get_tag(elem) + el_name = parse.get_name(elem) + + # handle here + stored = initialize(state, elem, elem_name=el_name) + + if el_type == 'actionCalls': + wire_action_calls(state, elem, el_name, stored) + + elif el_type == 'apexPluginCalls': + wire_apex_plugin_calls(state, elem, el_name, stored) + + elif el_type == 'assignments': + wire_assignment(state, elem, el_name, stored) + + # loops and collection processors work with collection references + elif el_type == 'collectionProcessors': + wire_collection_processor(state, elem, el_name, stored) + + elif el_type == 'dynamicChoiceSets': + wire_dynamic_choice_sets(state, elem, el_name, stored) + + elif el_type == 'loops': + wire_loop(state, elem, el_name, stored) + + elif el_type == 'orchestratedStages': + # Inside Orchestrated stages, stageSteps are wired as step_name.Outputs.var + wire_orchestrated_stages(state, elem, el_name, stored) + + elif el_type == 'recordCreates': + # look for passing id to variable in create + wire_record_creates(state, elem, el_name, stored) + + elif el_type == 'recordDeletes': + # these only auto-wired as the standard boolean (e.g. true if success) + pass + + elif el_type == 'recordLookups': + wire_record_lookups(state, elem, el_name, stored) + + elif el_type == 'recordUpdates': + # record do hold values, they evaluate to true if the update + # was successful, but only via the el name, so no wiring needed + # beyond adding the element. + pass + + elif el_type == 'screens': + # TODO: need to handle output from screen to action or vice-versa + # This should wait until we crawl actions separately + wire_screens(state, elem, el_name, stored) + + elif el_type == 'subflows': + # subflow wiring is done in the executor + # TODO: move the creation of the element here and wiring to local outputs + # so the executor can focus on cross-flow wiring and we can handle + # actions and subflows in the same way + pass + + elif el_type == 'transforms': + wire_transforms(state, elem, el_name, stored) + + elif el_type == 'waits': + wire_waits(state, elem, el_name, stored) + + return None + +def wire_waits(state: BranchState, elem: ET.Element, el_name: str, stored): + """Wait events can fire events on exit which is handled via output ref + """ + wait_events = parse_utils.get_by_tag(elem, 'waitEvents') + for event in wait_events: + event_name = parse_utils.get_name(event) + if event_name is None: + continue + params = parse_utils.get_by_tag(event, 'outputParameters') + state.get_or_make_vector(name=event_name, store=True) + for param_el in params: + influenced = parse_utils.get_text_of_tag(param_el,'assignToReference') + influencer = parse_utils.get_text_of_tag(param_el, 'name') + if influencer is not None and influenced is not None: + fixed_influencer = f"{event_name}.{influencer}" + wire_and_store(state, influencer=fixed_influencer, influenced=influenced, + el_name=event_name, elem=event, comment='event fired by wait element') + + + +def wire_assignment(state: BranchState, elem: ET.Element, elem_name: str, stored): + """Wires assignment statements to influence map in `state` + + Args: + state: current Branch State + elem: assignment element to be wired + elem_name: element name passed in for convenience + + Returns: + None + + """ + res = parse_utils.get_assignment_statement_dicts(elem) + if res is None: + logger.error(f"Could not obtain any assignments from element 
{elem_name} in flow {state.flow_path}") + return + + flow_path = state.flow_path + + for (operator, entry) in res: + # we could have just return a boolean, but maybe there will be more operators in the future + is_assign = operator == 'Assign' + + # Be aware there is something sneaky going on here: + # if the parse module detects a string literal, it sets + # the influencer name to parse.STRING_LITERAL_TOKEN + # which then the parser module picks up and assigns + # a hardcoded variable type that is not any of the actual + # flow variables, so that it will not appear as a sink + # + # But if you were to refactor this code and not + # use the parse module, then there would be issues + # as string literals might show up as variables. + # Please keep this in mind when you write other + # parse modules for other flow elements that can + # accept stringLiteral types. + # + # Always assign a variable name equal to parse.STRING_LITERAL_TOKEN + # to signify something is a literal value and not a variable. + entry["flow_path"] = flow_path + entry["source_path"] = flow_path + + stmt = InfluenceStatement(**entry) + state.propagate_flows(statement=stmt, + assign=is_assign, + store=True) + logger.debug(f"Propagated flow for {elem_name}: {entry}") + + +def wire_transforms(state, elem, el_name, stored): + res = parse_utils.get_transform_influencers(elem) + if res is None: + return + + # First, de-dup + seen = set() + for (inf_type, output_var, influencer_var_tuple) in res: + for influencer in influencer_var_tuple: + seen.add((output_var, influencer)) + + # Now handle output var and propagate + for output_var, influencer in seen: + if output_var is None: + influenced_label = el_name + else: + influenced_label = f"{el_name}.{output_var}" + + wire_and_store(state=state, influencer=influencer, + influenced=influenced_label, + elem=elem, el_name=el_name, + comment='influence via transform element') + +def wire_record_creates(state, elem, el_name, stored): + influenced = parse_utils.get_text_of_tag(elem, 'assignRecordIdToReference') + if influenced is not None: + wire_and_store(state=state, influencer=el_name, influenced=influenced, + el_name=el_name, elem=elem, comment='id from record Create') + + +def wire_record_lookups(state, elem, el_name, stored): + assignments = stored[QueryResult.OutputAssignmentsEls] + for assignment in assignments: + influenced = parse_utils.get_text_of_tag(assignment, 'assignToReference') + influencer_field = parse_utils.get_text_of_tag(assignment, 'field') + if influencer_field is not None and influenced is not None: + influencer = f"{el_name}.{influencer_field}" + wire_and_store(state=state, influencer=influencer, influenced=influenced, + el_name=el_name, elem=elem, comment='output of record Lookup') + + out_ref = parse_utils.get_text_of_tag(elem, 'outputReference') + if out_ref is not None: + wire_and_store(state=state, influencer=out_ref, influenced=el_name, + el_name=el_name, elem=elem, comment='output of record Lookup') + + +def wire_action_calls(state, elem, el_name, stored): + if not stored[QueryResult.IsAutoStore]: + # If auto-stored, then this will have already been autowired when initialized + wire_apex_plugin_calls(state, elem, el_name, stored) + + + +def wire_apex_plugin_calls(state, elem, el_name, stored): + output_params = stored[QueryResult.OutputParametersEls] + for output in output_params: + influenced = parse_utils.get_text_of_tag(output, 'assignToReference') + influencer_field = parse_utils.get_text_of_tag(output, 'name') + if influencer_field is not None and 
influenced is not None: + influencer = f"{el_name}.{influencer_field}" + wire_and_store(state=state, influencer=influencer, influenced=influenced, + el_name=el_name, elem=elem, comment="action output value") + + +def wire_dynamic_choice_sets(state, elem, el_name, stored): + #TODO: audit these + pass + + +def wire_orchestrated_stages(state, elem, el_name, stored): + # Todo: update this with additional wiring after audit + steps = parse_utils.get_by_tag(elem, 'stageSteps') + for step in steps: + step_name = parse_utils.get_name(step) + if step_name is not None: + fixed_name = f"{step_name}.Outputs" + state.get_or_make_vector(name=fixed_name, store=True) + + +def wire_loop(state: BranchState, elem: ET.Element, elem_name: str, stored): + """Wires collection loop is over to loop variable. + + Args: + state: current Branch State + elem: assignment element to be wired + elem_name: element name passed in for convenience + + Returns: + None + + """ + collection_ref_els = parse.get_by_tag(elem, tagname='collectionReference') + if len(collection_ref_els) != 1: + logger.warning(f"Found Loop without a collection reference in {elem_name}") + return + else: + collection_ref_el = collection_ref_els[0] + + collection_ref_var = collection_ref_el.text + loop_var = elem_name + wire_and_store(state, influencer=collection_ref_var, influenced=loop_var, + el_name=elem_name, elem=elem,comment='assign to loop variable') + + +def wire_collection_processor(state: BranchState, elem: ET.Element, elem_name: str, stored): + """Wires collection reference in collection processor to collection elem. + + Args: + state: current Branch State + elem: assignment element to be wired + elem_name: element name passed in for convenience + + Returns: + None + + """ + # every collectionProcessor must have a single collection ref + subtype = parse.get_by_tag(elem, tagname='elementSubtype') + if len(subtype) == 1 and subtype[0].text == 'FilterCollectionProcessor': + collection_el = parse.get_by_tag(elem, tagname='collectionReference')[0] + else: + return + collection_ref_var = collection_el.text + collection_var = elem_name + + wire_and_store(state, influencer=collection_ref_var, + influenced=collection_var, + el_name=elem_name, + comment='collection filter', + elem=collection_el) + + +def wire_screens(state, elem, el_name, stored): + parser = state.get_parser() + stored_els = [] + + # wire screen action output as action_name.Results.foo + # Do this first + action_els = parse_utils.get_by_tag(elem, 'actions') + + for action in action_els: + action_name = parse_utils.get_name(action) + assert action_name is not None + fixed_name = f"{action_name}.Results" + + if fixed_name not in stored_els: + state.get_or_make_vector(name=fixed_name, store=True) + stored_els.append(fixed_name) + + + # initialize all named fields + fields = elem.findall(f'.//{ns}fields') # recurse + + for fld in fields: + fld_type = parse_utils.get_text_of_tag(fld, 'fieldType') + fld_name = parse_utils.get_text_of_tag(fld, 'name') + if fld_type is None: + continue + + # wire objectProvided + elif fld_type == 'ObjectProvided': + field_ref = parse_utils.get_text_of_tag(fld, 'objectFieldReference') + + if field_ref is not None: + parent, member, v_type = parser.resolve_by_name(field_ref) + fixed_influencer = f"{el_name}.{member}" + + wire_and_store(state, influencer=fixed_influencer, influenced=field_ref, + el_name=el_name, elem=elem, + comment="assignment to object from screen") + + if fld_name is None: + continue + + # wire assignToReference + output_param = 
parse_utils.get_by_tag(fld, 'outputParameters') + + if output_param: + param_el = output_param[0] + assign_to_ref = parse_utils.get_text_of_tag(param_el, 'assignToReference') + influencer_fld = parse_utils.get_text_of_tag(param_el, 'name') + + if assign_to_ref is not None and influencer_fld is not None: + + if fld_name not in stored_els: + state.get_or_make_vector(name=fld_name, store=True) + stored_els.append(fld_name) + + influencer = f"{fld_name}.{influencer_fld}" + wire_and_store(state, influencer=influencer, influenced=assign_to_ref, + el_name=fld_name, elem=fld, comment="output form screen field") + continue + + +def wire_and_store(state: BranchState, influencer:str, influenced: str, + el_name: str, elem: ET.Element, comment: str) -> None: + stmt = InfluenceStatement( + influenced_var=influenced, + influencer_var=influencer, + element_name=el_name, + source_text=parse_utils.get_elem_string(elem), + line_no=elem.sourceline, + comment=comment, + flow_path=state.flow_path, + source_path=state.flow_path + ) + state.propagate_flows(statement=stmt, assign=True, store=True) \ No newline at end of file diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/control_flow.py b/packages/code-analyzer-flow-engine/FlowScanner/flowtest/control_flow.py deleted file mode 100644 index 7202d429..00000000 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/control_flow.py +++ /dev/null @@ -1,723 +0,0 @@ -"""Module to generate control flow graphs and crawl schedules - -""" -from __future__ import annotations - -import dataclasses -import json -from abc import ABC -from collections.abc import Generator -from dataclasses import dataclass, field -from typing import TextIO - -import flow_parser.parse as parse -from flow_parser.parse import Parser -from public.data_obj import BranchVisitor, CrawlStep -from public.enums import ConnType -from public.flowtest_exceptions import InvalidFlowException -from public.parse_utils import (ET, get_name, get_conn_target_map, - is_subflow, is_loop, get_tag) - - -@dataclass(frozen=True) -class JSONSerializable(ABC): - - def to_dict(self): - return {s: getattr(self, s) for s in self.__slots__} - - -@dataclass(frozen=True, eq=True, slots=True) -class Jump(JSONSerializable): - """Class representing a connector - - """ - # name of element where jump is located - src_name: str - - # where connector points to - target: str - - # true if goto connector - is_goto: bool - - # true if next-value - is_loop: bool - - def priority(self) -> int: - # lower is higher priority - if self.is_loop: - return 0 - else: - return 1 - - -@dataclass(frozen=True, eq=True, slots=True) -class Segment(JSONSerializable): - # name of element at the start of the segment (jump target) - label: str - - # list of elements (including label) in this segment (in order) - traversed: [str] - - # list of traversal indexes that are subflow elements - subflows: [int] - - # connectors at the end of this segment - jumps: [Jump] - - # whether this segment may end execution - is_terminal: bool - - # for tracking whether it has been visited - seen_visitors: [(str, str)] = field(default_factory=list) - - def accept(self, visitor: BranchVisitor) -> [BranchVisitor] or None: - """does the node accept the visitor - - Also updates visitor state - - Args: - visitor: Branch Visitor trying to jump into node - - Returns: - list of labels to process or None - - """ - - prev = visitor.previous_label - if (prev, visitor.token) not in self.seen_visitors: - self.seen_visitors.append((prev, visitor.token)) - return 
self._send_outbound(visitor) - - else: - return None - - def _send_outbound(self, visitor): - # don't send an element right back to where it jumped from! - jumps = [jmp for jmp in self.jumps if (jmp.src_name, jmp.target) not in visitor.history] - jumps.sort(key=lambda x: x.priority()) - - to_return = [] - for jmp in jumps: - current_label = jmp.target - previous_label = self.label - history = visitor.history + ((jmp.src_name, jmp.target),) - if jmp.is_goto is True: - token = (jmp.src_name, jmp.target) - to_add = dataclasses.replace(visitor, - token=token, - current_label=current_label, - previous_label=previous_label, - history=history - ) - else: - to_add = dataclasses.replace(visitor, - current_label=current_label, - previous_label=previous_label, - history=history - ) - to_return.append(to_add) - return to_return - - @classmethod - def build_from_parser(cls, parser: parse.Parser, - elem: ET.Element, - seen_names: [str] = None): - - label = get_name(elem) - jumps = [] - if is_subflow(elem): - subflows = [0] - else: - subflows = [] - conn_map = get_connector_map(elem, parser=parser) - optional_values = [x[2] for x in conn_map.values() if x[2] is False] - is_optional = len(optional_values) == 0 - curr_elem = elem - traversed = [] - - if len(conn_map) == 0: - return Segment(label=label, - subflows=subflows, - traversed=[label], - jumps=[], - is_terminal=True) - index = 0 - while len(conn_map) > 0: - - conn_map = get_connector_map(curr_elem, parser=parser) - curr_name = get_name(curr_elem) - if curr_name in traversed: - # we are looping back in the segment. break here, and - # the element will not be added to this segment. - # It will then appear in some other segment pointing to this segment. - # - # If it points to an element somewhere in the middle of this segment, - # that will be addressed in the `fix_duplicates` function below. 
- break - else: - traversed.append(curr_name) - - if seen_names is not None: - if curr_name not in seen_names: - seen_names.add(curr_name) - - if is_subflow(curr_elem): - subflows.append(index) - - if is_loop(curr_elem): - # A loop can be missing a noMoreValues - # in which case it terminates the program - for conn, val in conn_map.items(): - elem_is_loop = False - no_more_seen = False - - if get_tag(conn) == 'noMoreValuesConnector': - is_optional = False - no_more_seen = True - - if get_tag(conn) == 'nextValueConnector': - elem_is_loop = True - if no_more_seen is False: - is_optional = True - - jumps.append(Jump(src_name=curr_name, - target=val[0], - is_goto=val[1] is ConnType.Goto, - is_loop=elem_is_loop - ) - ) - break - - elif len(conn_map) == 1: - vals = list(conn_map.values()) - - if vals[0][1] is not ConnType.Goto and vals[0][0] not in seen_names and is_optional is False: - curr_elem = parser.get_by_name(vals[0][0]) - continue - else: - jumps.append(Jump(src_name=curr_name, - is_goto=vals[0][1] is ConnType.Goto, - target=vals[0][0], - is_loop=False)) - break - - elif len(conn_map) > 1: - for val in conn_map.values(): - jumps.append(Jump(src_name=curr_name, - target=val[0], - is_goto=val[1] is ConnType.Goto, - is_loop=False - ) - ) - break - - # end of conditionals - index += 1 - - # end of while loop - - if len(jumps) == 0: - # if there are no more jumps, this is a terminal element - is_optional = True - else: - # sort jumps so nextValue is taken first - jumps.sort(key=lambda x: x.priority()) - - return Segment(label=label, - subflows=subflows, - jumps=jumps, - traversed=traversed, - is_terminal=is_optional) - - -@dataclass(frozen=True, eq=True, slots=True) -class ControlFlowGraph(JSONSerializable): - # where to start - start_label: str - - # map from segment label -> inbound jumps - inbound: {str: [Jump]} - - # label -> segment - segment_map: {str: Segment} - - @classmethod - def from_parser(cls, parser: parse.Parser): - start_elem = parser.get_start_elem() - start_label = get_name(start_elem) - visited_labels = [] - visited_elems = set() - segment_map = {} - to_visit = [start_elem] - - while len(to_visit) > 0: - - curr_elem = to_visit.pop(0) - curr_segment = Segment.build_from_parser(parser=parser, - elem=curr_elem, - seen_names=visited_elems) - - segment_map[curr_segment.label] = curr_segment - - # add segment label to visited - if curr_segment.label not in visited_labels: - visited_labels.append(curr_segment.label) - - visited_elems.update(curr_segment.traversed) - - # update to_visit with new jumps - for jmp in curr_segment.jumps: - tgt = jmp.target - tgt_elem = parser.get_by_name(tgt) - if tgt not in visited_labels and tgt_elem not in to_visit: - to_visit.append(tgt_elem) - - # The resulting Segments are fine 99% of the time, but some flows - # have undocumented gotos leading to duplicates. These are fixed here. - _fix_duplicates(segment_map) - - # Now generate inbound: - inbound = {} - - for seg in segment_map.values(): - for jmp in seg.jumps: - if jmp.target in inbound: - inbound[jmp.target].append(jmp) - else: - inbound[jmp.target] = [jmp] - - return ControlFlowGraph(start_label=start_label, - inbound=inbound, - segment_map=segment_map) - - -def _get_crawl_visits(cfg: ControlFlowGraph) -> {str: [BranchVisitor]}: - """For testing and analysis. 
- - Args: - cfg: control flow graph - - Returns: - map from label to BranchVisitor - """ - # for testing and analysis - # initialize visits - visits = {label: [] for label in cfg.segment_map.keys()} - visits[cfg.start_label] = [BranchVisitor(cfg.start_label, previous_label=None)] - - for visitor, segment_names in crawl_iter(cfg=cfg): - visits[visitor.current_label].append(visitor) - - return visits - - -@dataclass(frozen=True, eq=True, slots=True) -class CrawlSchedule(ABC): - total_steps: int - total_branches: int - crawl: [{str: int | str}] - branch_map: {((str, str),): BranchVisitor} - branch_counts: {((str, str),): int} - - -def get_crawl_schedule(cfg: ControlFlowGraph) -> ((CrawlStep,), (CrawlStep,)): - """Builds crawl schedule - - Args: - cfg: Control Flow Graph - - Returns: - (tuple of crawl steps, tuple of terminal steps) - """ - - generator = crawl_iter(cfg) - crawl_steps = [] - terminal_steps = [] - step = 0 - - for (visitor, segment) in generator: - if segment.is_terminal is True: - terminal_steps.append( - CrawlStep( - step=step + len(segment.traversed) - 1, - visitor=visitor, - element_name=segment.traversed[-1] - ) - ) - for el_name in segment.traversed: - crawl_steps.append( - CrawlStep( - step=step, - visitor=visitor, - element_name=el_name - ) - ) - step += 1 - - return tuple(crawl_steps), tuple(terminal_steps) - - -def crawl_iter(cfg: ControlFlowGraph) -> Generator[(BranchVisitor, [Segment]), None, None]: - """crawls CFG - - Args: - cfg: control flow graph - - Yields: - current Branch visitor, list of flow elements to process, outgoing Branch Visitors - """ - - label = cfg.start_label - visitor = BranchVisitor(label, previous_label=None) - worklist = [] - - while len(worklist) > 0 or visitor is not None: - if visitor is None and len(worklist) > 0: - # nowhere to jump, so pull from worklist - visitor = worklist.pop(0) - - # skip orphaned references - if visitor.current_label not in cfg.segment_map: - visitor = None - continue - - segment = cfg.segment_map[visitor.current_label] - next_visitors = segment.accept(visitor) - - yield visitor, segment - - if next_visitors is not None and len(next_visitors) > 0: - # depth-first search so take first branch and assign as current - visitor = next_visitors[0] - - # Add to worklist - [worklist.append(next_visitors[i]) for i in range(1, len(next_visitors)) - if next_visitors[i] not in worklist] - - else: - # no more visitors means current branch is exhausted - visitor = None - - -def get_visits_statistics(visit_map: {str: Jump or None}, cfg: ControlFlowGraph): - # first check that every label has been visited: - missed = [] - for label in cfg.segment_map: - if len(visit_map[label]) == 0: - print(f"not visited: {label}") - missed.append(label) - - # check that every jump has been traversed: - missing_inbound = [] - inbound = cfg.inbound - for label in inbound: - label_visits = visit_map[label] - visit_tuples = {(x.current_label, cfg.segment_map[x.previous_label].traversed[-1]) for x in label_visits - if x.previous_label is not None} - inbound_tuples = {(x.target, x.src_name) for x in inbound[label]} - for inbound_t in inbound_tuples: - if inbound_t not in visit_tuples: - missing_inbound.append(inbound_t) - if len(missing_inbound) > 0: - [print(f"missing inbound jumps: {x}") for x in missing_inbound] - - # get total number of visits: - all_visits = 0 - for visit in visit_map.values(): - all_visits = all_visits + len(visit) - - all_inbound = 0 - for x in cfg.inbound.values(): - all_inbound = all_inbound + len(x) - - report_str = (f"total 
number of visits: {all_visits}\n" - f"total number of visits per node: {all_visits / len(visit_map)}\n" - f"total number of visits per inbound: {all_visits / max(all_inbound, 1)}\n" - f"total number of missed inbound: {len(missing_inbound)}") - - return missed, missing_inbound, report_str - - -def _find_segments_with_elem(val: str, segment_map: {str: Segment}) -> [(str, Segment, int)]: - """Find segments that also contain an element. - - Args: - val: string name of element - segment_map: label -> segment - - Returns: - - * list of segments that have this element along with their label - and the index of the found element in the form - (label, segment, dupe_index) - - * Empty set if no segments found - - """ - if segment_map is None or len(segment_map) == 0: - return [] - - to_return = [] - for label, seg in segment_map.items(): - try: - # Note segment gen. algorithm doesn't allow a value to appear - # more than once in the traversed history - to_return.append((label, seg, seg.traversed.index(val))) - except ValueError: - pass - - return to_return - - -def _fix_duplicates(segment_map: {str: Segment}) -> None: - """segment surgery to merge duplicate paths - - Sometimes we have:: - - segment 1: A->B->C - segment 2: X->A->B->C - - Which should be turned into:: - - segment 1: A->B->C - segment 2': X :jump A - - Or if we have:: - - segment 3: X->Y->A - segment 4: W->B->A - - Then this should be merged into: - - segment 3': X->Y jump A - segment 4': W->B jump A - new segment: A - - Args: - segment_map: label -> Segment - - Returns: - None. (Segments updated in place) - """ - crawled = [] - segments = segment_map.values() - for segment in segments: - crawled = crawled + segment.traversed - - dupes = {x for x in crawled if crawled.count(x) > 1} - if len(dupes) == 0: - return - # el: string name of dupe flow element - # val: list (segment, index of traversed in segment) - processed = [] - for val in dupes: - if val in processed: - continue - - dupes = _find_segments_with_elem(val, segment_map) - new_segment = None - - # (segment, index) - for (label, segment, val_index) in dupes: - if val_index == 0: - # the dupe *starts* a segment, so it is the entire segment - new_segment = segment - else: - # the dupe is partway through the segment - subflows = [x for x in segment.subflows if x < val_index] - new_jump = Jump(src_name=segment.traversed[val_index - 1], - target=val, - is_loop=False, - is_goto=False, - ) - # replace the segment - segment_map[label] = Segment(label=segment.label, - traversed=segment.traversed[:val_index], - subflows=subflows, - jumps=[new_jump], - is_terminal=False) - # now, make the jump target - if new_segment is not None: - # we already have it, no need to add it. - pass - else: - # make it. 
All dupes of the same value must end in the same way - # so take the first - (seg_index, segment, val_index) = dupes[0] - new_segment = Segment(label=val, - traversed=segment.traversed[val_index:], - subflows=[x for x in segment.subflows if x >= val_index], - jumps=segment.jumps, - is_terminal=segment.is_terminal) - - segment_map[val] = new_segment - - # add all the traversed elems to processed - # so we don't make more new segments unnecessarily - processed = processed + new_segment.traversed - - -def validate_cfg(cfg: ControlFlowGraph, parser: parse.Parser) -> bool: - # check that all elements are covered exactly once: - all_elems = parser.get_all_traversable_flow_elements() - all_elem_names = [get_name(x) for x in all_elems] - crawled_elems = [] - - for segment in cfg.segment_map.values(): - crawled_elems = crawled_elems + segment.traversed - - # ..check there are no missing crawlable elements - missing = [x for x in all_elem_names if x not in crawled_elems] - counts = {x: crawled_elems.count(x) for x in crawled_elems} - - # ..check there are no duplicates - duplicates = [x for x in crawled_elems if counts[x] > 1] - - if len(duplicates) != 0: - valid = False - print("invalid crawl info") - for x in duplicates: - print(f"duplicate: {x}") - else: - valid = True - for x in missing: - # some flows include disconnected elements that can't be crawled. - print(f"caution missing element found: {x}") - - return valid - - -class CrawlEncoder(json.JSONEncoder): - def default(self, obj): - if (isinstance(obj, JSONSerializable) or isinstance(obj, BranchVisitor) - or isinstance(obj, CrawlStep)): - return obj.to_dict() - else: - return json.JSONEncoder.default(self, obj) - - -class Crawler: - """Class representing the crawl of a graph - - """ - - def __init__(self, total_steps: int, crawl_schedule: (CrawlStep,), - terminal_steps: (CrawlStep,), - history_maps: {((str, str),): CrawlStep}): - """Constructor - - .. 
WARNING:: For module use only - - Args: - total_steps: how many steps in crawl - crawl_schedule: tuple of :class:`public.data_obj.CrawlStep` in order of execution - terminal_steps: tuple of :class:`public.data_obj.CrawlStep` - that can end program (note, *not* in any specific order) - history_maps: map from history to last seen crawl_step with this history - - """ - #: int current step of crawl - self.current_step = 0 - - #: int total number of steps - self.total_steps = total_steps - - #: crawl_step -> last seen ancestor - self.history_maps = history_maps or {} - - #: tuple(:ref:`public.data_obj.CrawlStep`) all crawl steps in order of execution - self.crawl_schedule = crawl_schedule - - #: tuple(:ref:`public.data_obj.CrawlStep`) steps that can terminate the program - self.terminal_steps = terminal_steps - - @classmethod - def from_parser(cls, parser: parse.Parser): - """Builds a crawl schedule (recommended builder) - - Args: - parser: :obj:`flow_parser.parse.Parser` instance - - Returns: - :obj:`Crawler` instance - - """ - cfg = ControlFlowGraph.from_parser(parser) - crawl_schedule, terminal_steps = get_crawl_schedule(cfg) - total_steps = len(crawl_schedule) - - return Crawler( - total_steps=total_steps, - crawl_schedule=crawl_schedule, - terminal_steps=terminal_steps, - history_maps=None - ) - - def get_crawl_step(self) -> CrawlStep | None: - """Retrieve the next crawl step - - Returns: - :obj:`public.data_obj.BranchVisitor` and flow element name to process - - """ - if self.current_step >= self.total_steps: - return None - else: - to_return = self.crawl_schedule[self.current_step] - self.history_maps[to_return.visitor.history] = to_return - self.current_step += 1 - return to_return - - def set_step(self, step: int) -> None: - self.current_step = step - - def get_last_ancestor(self, crawl_step) -> CrawlStep | None: - """Get latest ancestor branch that was last visited - - Useful for knowing which influence map to clone - - Args: - crawl_step: step whose history is sought - - Returns: - CrawlStep instance or None - - """ - history = crawl_step.visitor.history - res = None - while res is not None: - res = dict.get(self.history_maps, history, None) - if len(history) == 0: - break - else: - history = history[:-1] - if res is None: - # not present - return None - else: - return res - - -def get_connector_map(elem: ET.Element, - parser: Parser) -> {ET.Element: (str, ConnType, bool)}: - """ - Wrapper for getting connectors that handles start elements and missing - connector targets, which requires a parser. 
- - Args: - elem: element to search for connectors - parser: parser containing global file data - - Returns: - connector map - - """ - raw = get_conn_target_map(elem) - - # make sure the target elem exists - return {x: v for x, v in raw.items() if v[0] in parser.all_names} - - -def dump_cfg(x, fp: TextIO): - json.dump(x, indent=4, fp=fp, cls=CrawlEncoder) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/flow_result.py b/packages/code-analyzer-flow-engine/FlowScanner/flowtest/flow_result.py deleted file mode 100644 index dbca1c36..00000000 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/flow_result.py +++ /dev/null @@ -1,425 +0,0 @@ -"""Serializes results and interacts with - third party report processors - - @author: rsussland@salesforce.com -""" -from __future__ import annotations - -import json -import logging -import sys -from datetime import datetime -from typing import TextIO - -sys.modules['_elementtree'] = None -from public.custom_parser import ET, clean_string -import public.custom_parser as CP - -from flowtest import ESAPI -from flowtest import flow_metrics -from flowtest.version import __version__ -from public.data_obj import QueryResult, Preset, InfluenceStatementEncoder - -DEFAULT_HELP_URL = "https://security.secure.force.com/security/tools/forcecom/scannerhelp" -DEFAULT_JOB_TYPE = "FlowSecurityCLI" - -logger = logging.getLogger(__name__) - - -class ResultsProcessor(object): - """Class storing all the information necessary for a report. - - This includes labelling information like the report requested, - scan start time, etc., as well as the results of the findings. - - The class contains methods to take this information and generate - json, xml and html reports. - """ - - def __init__(self, preset: Preset = None, requestor="System", report_label=None, - result_id="default", service_version=__version__, help_url=DEFAULT_HELP_URL): - - self.preset: Preset | None = preset - self.help_url: str = help_url - self.result_id: str = result_id # Id to assign to scan result, appears in reports - self.service_version: str = service_version # Version of job management system running scan jobs - self.email: str = requestor # email address of result recipient - - if report_label is None: - report_label = "flowscan run at %s" % str(datetime.now())[:-7] - # report label is a human-readable label assigned to this scan - self.friendly_name: str = report_label - self.counter: int = 0 - self.scan_start: str = str(datetime.now()) # should be overriden - self.scan_end: str = self.scan_start # should be overridden - - # deduplicated stored query results - self.stored_results: [QueryResult] = [] - - # dictionary of results sorted by query_name - self.results_dict: {str: {}} = None - - # xml report string - self.report_xml: str | None = None - - def write_html(self, html_report_path: str): - """Writes html report to disk - - Args: - html_report_path: where to write html report - - Returns: - metrics (results) of issues sorted and counted. 
- - """ - if self.report_xml is None: - self.get_cx_xml_str() - - if (self.preset is None or self.preset.preset_name is None - or len(self.preset.queries) == 0): - raise RuntimeError("Cannot generate html as no valid preset is set") - - presets = [x.query_id.strip() for x in self.preset.queries] - - # Notify metrics of which queries were run - flow_metrics.add_to_presets(preset_name=self.preset.preset_name, - presets=presets) - - # Load query descriptions in metrics - flow_metrics.add_to_query_config(list(self.preset.queries)) - - # now generate report - results = flow_metrics.parse_results(xml_report_str=self.report_xml, - failed_queries=None, - throttle=False, - report_path=html_report_path, - source_dir=None, - email_add=self.email, - friendly_name=self.friendly_name, - scan_start=self.scan_start, - scan_end=self.scan_end, - preset=self.preset.preset_name, - job_type=DEFAULT_JOB_TYPE, - service_version=self.service_version or __version__, - debug=False, - result_id=self.result_id, - help_url=self.help_url - ) - return results - - def dump_json(self, fp: TextIO) -> None: - """Write json string of results to file pointer - - Returns: - None - - """ - job_result = self._make_job_result() - json.dump(job_result, indent=4, fp=fp, cls=InfluenceStatementEncoder) - - def get_json_str(self) -> str: - """get json result string - - Returns: - string that serializes list of QueryResult objects - - """ - job_result = self._make_job_result() - - return json.dumps(job_result, indent=4, cls=InfluenceStatementEncoder) - - def get_cx_xml_str(self): - """Converts results to popcrab compatible report format - - Returns: - report xml string - """ - - id2path_dict = self._make_query_id_to_path_dict() - if self.results_dict is None: - self.gen_result_dict() - - result_dict = self.results_dict - - if result_dict is None or len(result_dict) == 0: - self.report_xml = '' - return self.report_xml - - result_str = '' - for query_id in result_dict: - results = result_dict[query_id] - if len(results) > 0: - query_path = ESAPI.html_encode(id2path_dict[query_id]) - query_name = ESAPI.html_encode(result_dict[query_id][0]['query_name']) - result_str += f'' - for flow_result in results: - statements = flow_result["flow"] - start_path = statements[0].source_path - counter = flow_result["counter"] - - result_str += (f'' - f'') - for index, node in enumerate(statements): - filename = node.source_path - line = node.line_no - code = ESAPI.html_encode(clean_string(node.source_text)) - result_str += f"{ESAPI.html_encode(filename)}" - result_str += f"{line}" - # TODO: currently we hardcode but should get real columns - result_str += f"1" - result_str += f"{index}" - result_str += f"{ESAPI.html_encode(node.influenced_var)}" - - # Add Snippet - result_str += f"{line}" - result_str += f"{code}" - # End Loop over histories (nodes within a path) - result_str += "" - result_str += "" - # End loop over results (paths) - result_str += "" - # End all loops - result_str += "" - - self.report_xml = _validate_and_prettify_xml(result_str) - - return self.report_xml - - def add_results(self, query_results: [QueryResult]) -> None: - """Add results to processor - - Stores results internally for simple de-duplication. - All we do is use datapath equality, so please don't put - unique comment strings containing things like step number - or timestamps into influence statements, as they wont be - de-duped. 
- - Args: - query_results: list of Query-Result objects - - Returns: - None - """ - query_results = _validate_qr(query_results) - if query_results is None: - return - else: - self.stored_results = _merge_results( - self.stored_results + query_results - ) - - def gen_result_dict(self) -> {str: {str: str}}: - """Sorts results into query buckets - - Used internally to generate popcrab compatible - xml and html report formats. - - Also useful for testing - - Returns: - dictionary of the form:: - - query_id -> {flow: tuple of DataInfluenceStatements, - query_name: (human_readable), - counter: (fake similarity id), - elem: source code of element - elem_name: name of Flow Element - field: name of influenced variable} - - """ - - query_results = self.stored_results - accum = {} - if query_results is None or len(query_results) == 0: - return {} - - for query_result in query_results: - query_desc = self._get_query_desc_from_id(query_result.query_id) - end_stmt = query_result.influence_statement - - query_path = query_result.query_id - - # Initialize - if query_path not in accum: - accum[query_path] = [] - - if query_result.paths is None: - statements = [end_stmt] - - else: - statements = [x.history + (end_stmt,) for x in query_result.paths] - - for stmt in statements: - accum[query_path].append({"flow": stmt, - "query_name": query_desc.query_name, - "severity": str(query_desc.severity), - "description": query_desc.query_description, - "counter": self.counter, - "elem": clean_string(end_stmt.source_text), - "elem_name": end_stmt.element_name, - "field": end_stmt.influenced_var}) - - # TODO: this is a placeholder for real similarity analysis, if needed. - self.counter += 1 - self.results_dict = accum - return accum - - def _make_query_id_to_path_dict(self) -> {str: str}: - """Generate a dictionary from query_id to query_path - - e.g. foo bar -> foo\\bar: Version X - - Returns: - dictionary - """ - return {x.query_id: x.query_id.strip().replace(".", "\\") + f" Version: {x.query_version.strip()}" - for x in self.preset.queries} - - def _make_job_result(self): - if self.results_dict is None: - self.gen_result_dict() - - job_result = {"preset": self.preset.preset_name, - "help_url": self.help_url, - "result_id": self.result_id, - "service_version": self.service_version, - "flowtest_version": __version__, - "report_label": self.friendly_name, - "email": self.email, - "scan_start": self.scan_start, - "scan_end": self.scan_end, - "results": self.results_dict or {} - } - return job_result - - def _get_query_desc_from_id(self, query_id: str): - descriptions = self.preset.queries - for x in descriptions: - if x.query_id == query_id: - return x - raise ValueError(f"No query with id {query_id} is in the preset provided") - - -def _validate_and_prettify_xml(xml_str: str) -> str: - """Pretty print and validate generated xml string - - Args: - xml_str: string to validate - - Returns: - validated/beautified xml_string - """ - my_root = CP.get_root_from_string(bytes(xml_str, encoding='utf-8')) - ET.indent(my_root) - return CP.to_string(my_root) - - -def _merge_results(results: list[QueryResult]) -> [QueryResult]: - """Return new list with consolidated paths - - The crawler necessarily visits the same Flow element - a few times (because of loops, goto statements, etc.) - - This can create duplicate results. We want to remove - these using set addition. - - .. 
Note:: For this to work, we need to avoid putting comments - in influence statements such as "done in step X" as this - will make them be different statements and not consolidated. - - - Args: - results: list of QueryResult objects - - Returns: - deduplicated list. Incoming list is not altered. - """ - # This function is not in the mood for garbage - assert results is not None and len(results) > 0 - - # instead of merging in place, we return a new list - # and mark with skips those elements that have been merged. - - new_list = [] # accumulator for results - r_indices = list(range(len(results))) # indices of result - - while len(r_indices) > 0: - qr = results[r_indices.pop(0)] - - new_paths = set(list(qr.paths)) - - candidates = [x for x in r_indices] # make a copy for iteration - - for i in candidates: - working = results[i] - if _is_match(qr, working) is True: - assert working.paths is not None - new_paths.update(working.paths) - r_indices.remove(i) - - new_list.append(QueryResult( - query_id=qr.query_id, - influence_statement=qr.influence_statement, - paths=frozenset(new_paths) - ) - ) - - return new_list - - -def _validate_qr(qr_list: list[QueryResult]) -> list[QueryResult] | None: - """Checks query result for correctness - - Args: - qr_list: Query Result list to validate - - Returns: - list of valid QueryResults with invalid results removed - None if the list was None - """ - if qr_list is None or len(qr_list) == 0: - return None - - to_skip = set() - for index, qr in enumerate(qr_list): - if qr is None: - logger.error(f"ERROR: an null query result was included in the result list" - f" {qr_list}") - to_skip.add(index) - if qr.query_id is None: - logger.error(f"ERROR: received a query result without a query: {qr}") - to_skip.add(index) - if qr.influence_statement is None: - logger.error(f"ERROR: received a query result without " - f"an influence statement: {qr}") - to_skip.add(index) - if qr.paths is None: - logger.error(f"ERROR: received a query result without paths: {qr}") - to_skip.add(qr) - - if len(to_skip) == 0: - return qr_list - else: - to_return = [qr_list[i] for i in range(len(qr_list)) if i not in to_skip] - if len(to_return) == 0: - return None - else: - return to_return - - -def _is_match(qr_a: QueryResult, qr_b: QueryResult) -> bool: - """ Are these results pointing to the same result? - - Args: - qr_a: QueryResult - qr_b: QueryResult - - Returns: - True if they have paths which can be consolidated - """ - - # both the actual query and the influence statement must match - # ..then the paths can be combined - return (qr_a.query_id == qr_b.query_id and - qr_a.influence_statement == qr_b.influence_statement) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/query_manager.py b/packages/code-analyzer-flow-engine/FlowScanner/flowtest/query_manager.py deleted file mode 100644 index e5d7375c..00000000 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/query_manager.py +++ /dev/null @@ -1,207 +0,0 @@ -"""Responsible for loading and invoking query instances. 
- - @author: rsussland@salesforce.com - -""" -from __future__ import annotations - -import importlib -import logging -import os -import traceback -import types -from enum import Enum -from importlib import machinery -from typing import Any - -import queries.default_query -from flow_parser.parse import Parser -from flowtest.flow_result import ResultsProcessor -from public.contracts import QueryProcessor, State - -logger = logging.getLogger(__name__) - - -class QueryAction(Enum): - process_elem = 0 - flow_enter = 10 - scan_exit = 20 - - -class QueryManager: - # instance that performs queries and produces results - query_processor: QueryProcessor = None - - # instance that stores results and generates reports - results: ResultsProcessor = None - - # current parser associated to flow-file - parser: Parser = None - - # which preset to request - requested_preset: str = None - - query_module: Any = None - - class_name: str | None = None - - @classmethod - def build(cls, results: ResultsProcessor, - parser: Parser = None, - requested_preset: str | None = None, - module_path: str | None = None, - class_name: str | None = None) -> QueryManager: - """Only call this once to build Query Manager at scan start - """ - qm = QueryManager() - if module_path is not None: - # try to load requested query - # TODO: add better error handling - query_module = create_module(module_path=module_path) - - qm.query_module = query_module - qm.class_name = class_name - preset, instance = get_instance(query_module_=query_module, - class_name_=class_name, - preset_=requested_preset) - qm.requested_preset = requested_preset - - else: - # use default - instance = queries.default_query.DefaultQueryProcessor() - preset = instance.set_preset_name(preset_name=requested_preset) - - if preset is None: - raise RuntimeError(f"The loaded query module does not support preset: {preset or 'No preset provided'}") - - # store pointer to query processor - qm.query_processor = instance - - # assign preset to results - results.preset = preset - - # store pointer to results - qm.results = results - qm.parser = parser - - return qm - - def reload(self): - """Make a new instance of the queries after completing one flow - - Returns: - None - """ - - if self.query_module is None or self.class_name is None: - # use default - self.query_processor = queries.default_query.DefaultQueryProcessor() - return - else: - preset, instance = get_instance(self.query_module, - self.class_name, self.requested_preset) - self.query_processor = instance - - def query(self, action: QueryAction, state: State) -> None: - """Invokes QueryProcessor to execute query and stores results - - Args: - action: type of invocation (flow entrance or element entrance) - state: current state - - Returns: - None - """ - # TODO: add exception handling and logging as this is third party code - # when we first enter a state, there is a start elem which is not assigned and so curr elem is None. - # don't look for sinks into these start states. 
- if action is QueryAction.process_elem and state.get_current_elem() is not None: - - res = self.query_processor.handle_crawl_element(state=state) - if res is not None: - self.results.add_results(res) - - elif action is QueryAction.flow_enter: - res = self.query_processor.handle_flow_enter(state=state) - # TODO: better validation of result - if res is not None: - self.results.add_results(res) - - def final_query(self, all_states: (State,)) -> None: - res = self.query_processor.handle_final(all_states=all_states) - # TODO: better validation of result - if res is not None: - self.results.add_results(res) - - # delete old query instance and reload for next flow to process - self.reload() - - # delete old states - - -def create_module(module_path: str) -> Any: - """Loads and Instantiates QueryProcessor - - Args: - module_path: location of module to load - - Returns: - QueryProcessor module - - Raises: - ValueError if module name cannot be parsed or preset not accepted - ImportError if the module cannot be loaded - - """ - if module_path is None: - # we'll build default - return None - - else: - # module should have a class with the same name as the module. - filename = os.path.basename(module_path) - - if filename is None: - raise ValueError("Could not determine file to load") - - splits = filename.split('.py') - - if len(splits) != 2 or splits[-1] != '': - raise ValueError("File must end in .py") - - mod_name = splits[0] - try: - loader = importlib.machinery.SourceFileLoader(mod_name, module_path) - query_module = types.ModuleType(loader.name) - loader.exec_module(query_module) - return query_module - except Exception as e: - logger.critical(f"ERROR: could not load module {filename}: {traceback.format_exc()}") - raise e - - -def get_instance(query_module_, class_name_, preset_): - if query_module_ is None: - query_instance = queries.default_query.QueryProcessor() - - else: - try: - query_instance = getattr(query_module_, class_name_)() - - except Exception as e: - logger.critical(f"ERROR: could not instantiate module") - raise e - - try: - accepted_preset = query_instance.set_preset_name(preset_) - if accepted_preset is None: - raise ValueError("Could not set preset") - - else: - return accepted_preset, query_instance - - except Exception as e: - logger.critical(f"ERROR: could not set preset: {traceback.format_exc()}") - raise e - -# TODO: write up initialization and flow transition for *all* elements (including variables, caches, etc) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/util.py b/packages/code-analyzer-flow-engine/FlowScanner/flowtest/util.py deleted file mode 100644 index 432cd269..00000000 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/util.py +++ /dev/null @@ -1,318 +0,0 @@ -# -# -# -from __future__ import annotations - -import logging -import os -import pathlib -import typing -import uuid -from collections.abc import Callable -from dataclasses import fields -from typing import TYPE_CHECKING -import json -import traceback - -from public.data_obj import VariableType -from public.enums import RunMode - -if TYPE_CHECKING: - pass - -FLOW_EXTENSION = ".flow-meta.xml" -PACKAGE_FLOW_EXTENSION = ".flow" -PROJECT_JSON_NAME = "sfdx-project.json" - -CURR_DIR = os.getcwd() - - -""" - Crawling limits -""" -MAX_WORKLIST_SIZE = 10000 # Emergency brake -MAX_STEP_SIZE = 100000 # Emergency brake - -logger = logging.getLogger(__name__) - - -def get_flows_in_dir(root_dir: str) -> {str: str}: - """Searches recursively through for flows - - Args: - root_dir: 
directory in which to search - - Returns: - Returns a tuple of (T1, T2) where T1 is the list of flows - to be scanned and T2 is a map: - (local label, fully qualified label | None) -> [path of flow with these labels] - """ - flow_paths = dict() - for root, dir_names, filenames in os.walk(root_dir): - for filename in filenames: - if filename.endswith(".flow") or filename.endswith(".flow-meta.xml"): - f_path = os.path.join(root, filename) - flow_paths[get_label(root, filename)] = f_path - - return flow_paths - - -def get_flows_in_dir_with_ns(root_dir: str) -> ([str], {str: str}): - """Searches recursively through for flows - - Args: - root_dir: directory in which to search - - Returns: - Returns a tuple of (T1, T2) where T1 is the list of flows - to be scanned and T2 is a map: - (local label, fully qualified label | None) -> [path of flow with these labels] - """ - flow_paths = dict() - current_ns = None - for root, dir_names, filenames in os.walk(root_dir): - for filename in filenames: - if filename.endswith(".flow") or filename.endswith(".flow-meta.xml"): - flow_paths[get_label(root, filename)] = os.path.join(root, filename) - - return flow_paths - - -def extract_ns_from_project_root(project_root_dir: str) -> str | None: - """Looks in the provided directory for a project json file and tries to extract the namespace - - Args: - project_root_dir: directory in which to search - - Returns: namespace string or none - - """ - candidate_path = os.path.join(project_root_dir, PROJECT_JSON_NAME) - if (not os.path.exists(project_root_dir) or - not os.path.exists(candidate_path)): - - return None - else: - try: - obj = json.loads(project_root_dir) - ns = obj["namespace"] - logger.info(f"found namespace {ns} in project root {project_root_dir}") - return ns - - except Exception: - logger.info(f"Failed to extract namespace from project root " - f"dir when searching in f{project_root_dir} for f{PROJECT_JSON_NAME}:\n" - f"{traceback.format_exc()}") - return None - - -def get_label(root: str, filename: str) -> (str, str): - """get flow label as used in other flows to reference this subflow - - Returns a tuple (namespaced label, local label) - - For the local label, it changes `foo-1` to `foo` and - the namespaced label would be `parent_dir__foo` - - Args: - root: parent directory containing filename - filename: filename of flow - - Returns: - tuple (namespaced_label, local_label) - - """ - if filename.endswith(PACKAGE_FLOW_EXTENSION): - short_fname = filename[:-5] - elif filename.endswith(FLOW_EXTENSION): - short_fname = filename[:-14] - else: - short_fname = filename - - local_label = short_fname.split('-')[0] - full_parent_dirname = os.path.split(os.path.abspath(root))[0] - parent_dirname = os.path.split(full_parent_dirname)[-1] - if parent_dirname is not None and len(parent_dirname) > 0 and parent_dirname != 'flows': - # This works when scanning core flows - namespaced_label = f"{parent_dirname}__{local_label}" - else: - # TODO: get namespace from manifest file - namespaced_label = local_label - return namespaced_label, local_label - - -""" - Simple Variable Type Propagation -""" - - -def propagate(src_type: VariableType, dest_type: VariableType, **replacements) -> VariableType: - """Propagate attributes across flows. - - For example, if we know that a variable - of type 'Account' is passed into loop, then we want to remember - that the object type of this loop is Account. This works if we leave - all properties none unless we are certain of their values and then - adopt this simple method. 
Longer term, we may need to put conditional - logic, but now add a replacement field for manual override. - - Args: - src_type: start Variable Type - dest_type: end Variable Type - replacements: property overrides - - Returns: - Variable Type, modified with sources populating empty dest entries. - - """ - - prop_names = [x.name for x in fields(VariableType) if x is not None] - new_props = {x: dict.get(replacements, x) or getattr(dest_type, x) or getattr(src_type, x) for x in prop_names} - - return VariableType(**new_props) - - -""" - Transmission of context in subflows - - A master Flow running in system context will cause actions run in the SubFlow - to be run in system context as well, - regardless of whether the SubFlow was originally created and configured to run in user context. - A master Flow running in user context that has a SubFlow running in system context - will proceed to run the actions in the SubFlow in system context. -""" - - -def make_id() -> str: - """Generates unique id strings - - Returns: - 8 digit unique id as str - - """ - return str(uuid.uuid4())[:8] - - -def get_effective_run_mode(parent_sharing: RunMode | None, current_sharing: RunMode) -> RunMode: - if (parent_sharing is None or current_sharing is RunMode.SystemModeWithoutSharing or - current_sharing is RunMode.SystemModeWithSharing): - return current_sharing - else: - return parent_sharing - - -def sane_index(my_tuple: tuple, to_match): - try: - index = my_tuple.index(to_match) - except ValueError: - index = -1 - - return index - - -""" - Callables for dealing with property maps -""" - - -def is_non_null(entry) -> bool: - return entry is not None - - -def is_null(entry) -> bool: - return entry is None - - -def id_(*entry) -> typing.Any: - return entry - - -def build_match_on_null(prop: str = None) -> Callable: - def prop_match(prop_to_match: str): - if prop is None: - return True - else: - return prop == prop_to_match - - return prop_match - - -def build_action_filter(include_default: bool = True, - include_prop: bool = True, - include_flow: bool = True) -> Callable: - def action(default, prop, flow): - accum = [] - if include_default is True: - accum.append(default) - if include_prop is True: - accum.append(prop) - if include_flow is True: - accum.append(flow) - return tuple(accum) - - return action - - -def build_equality_match(to_match) -> Callable: - def equ_match(obj_to_match): - return obj_to_match == to_match - - return equ_match - - -def match_all(x) -> bool: - return True - - -def resolve_name(all_flow_paths: {(str, str): str}, sub_name: str) -> str | None: - """return path of subflow to load based on subflow label - - Args: - all_flow_paths: all flow paths in scan scope in the form (abs label, local label) --> abs_flow_path - sub_name: subflow label - - Returns: - subflow path - - """ - targets = [x for x in all_flow_paths.keys() if sub_name in x] - sub_path = None - if len(targets) == 0: - logger.critical(f"Could not find subflow to load with name: {sub_name}. " - f"Please check that all flow files are in the directory to scan. Skipping..") - return None - if len(targets) == 1: - sub_path = all_flow_paths[targets[0]] - - if len(targets) >= 1: - # we have more than one flow with a matching label. - # We choose the local label if there is a unique match - # as local resolution takes precedence. 
- local_targets = [x for x in targets if sub_name == x[1]] - if len(local_targets) == 1: - sub_path = all_flow_paths[local_targets[0]] - if len(local_targets) == 0: - # No local target, let's look for namespaced - namespaced_targets = [x for x in targets if sub_name == x[0]] - if len(namespaced_targets) == 1: - sub_path = all_flow_paths[namespaced_targets[0]] - - if sub_path is None: - logger.critical(f"Could not resolve subflow with name: {sub_name}. " - f"Please check that all flow files are in the directory to scan. Skipping..") - - return None - - return sub_path - - -def safe_list_add(a_list, b_list) -> list: - # this should be in standard library - if a_list is None and b_list is None: - return None - elif a_list is None: - return b_list - elif b_list is None: - return a_list - else: - return a_list + b_list diff --git a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/wire.py b/packages/code-analyzer-flow-engine/FlowScanner/flowtest/wire.py deleted file mode 100644 index 1a8961a7..00000000 --- a/packages/code-analyzer-flow-engine/FlowScanner/flowtest/wire.py +++ /dev/null @@ -1,238 +0,0 @@ -"""performs dataflow wiring for flow elements - - ------------- - Wiring Policy - ------------- - - Wiring policy for flow elements - - * - Not wired. - * - Not wired. - * - TBD. Screens are complex and deserving of a separate study. Presently we wire associated - Objects as well as input variables. But many screen components are not handled (order fulfillment) - * - wire assignment, target variable auto-added - * - (flow element should be added) - * - (flow element should be added) - * - (flow element should be added) - * - (flow element should be added) - * - (flow element name) should be added and collection variable wired to it (for filters) - * - Loop variable (flow element name) should be added and collection variable wired to it - * - Not wired - * - Handled with custom methods in :mod:`flowtest.executor` - - The following flow elements represent data initialized at flow start - and are handled by the parser, they are not wired - - * - * - * - * - * - * - - The following are uncategorized - - * - TBD - * - TBD - -""" -import logging - -import flow_parser.parse as parse -from flowtest.branch_state import BranchState -from public import parse_utils -from public.data_obj import DataInfluenceStatement -from public.parse_utils import ET - -#: module logger -logger = logging.getLogger(__name__) - - -def handle_auto_store(state: BranchState, elem: ET.Element, elem_name: str) -> None: - """Add this element name to influence map if it represents its own output data - - (Element name is passed in so we don't need to keep looking it up) - - Args: - state: current branch state - elem: current xml elem - elem_name: element name - - Returns: - None - - """ - if parse_utils.is_auto_store(elem): - state.get_or_make_vector(name=elem_name, store=True) - - ref = parse_utils.get_output_reference(elem) - if ref is not None: - state.get_or_make_vector(name=ref, store=True) - - -def wire(state: BranchState, elem: ET.Element): - """Wires influence statements and variable initialization. - - When the value of one variable changes based on another. - Once detected, this module extends the influence map by - each statement. 
- - Args: - state: current instance of Branch State - elem: Flow Element to be wired - - Returns: - None - - """ - if elem is None: - return None - - el_type = parse.get_tag(elem) - el_name = parse.get_name(elem) - - # handle here - handle_auto_store(state, elem, elem_name=el_name) - - if el_type == 'assignments': - wire_assignment(state, elem, el_name) - - # CRUD operations - currently we don't support second order flows, - # but the implicit values, e.g. {!recordLookup} will already be - # picked up in assignment statements. What we do *not* do is wire - # the filters/selectors to the output, as this is what the - # query_processor could do if it flips "store" to True as part - # of a policy to propagate taint through object retrieval. - if el_type == 'recordLookups': - pass - if el_type == 'recordCreates': - # look for passing id to variable in create - pass - if el_type == 'recordUpdates': - pass - if el_type == 'recordDeletes': - pass - - # loops and collection processors work with collection references - if el_type == 'collectionProcessors': - wire_collection_processor(state, elem, el_name) - - if el_type == 'loops': - wire_loop(state, elem, el_name) - - if el_type == 'screens': - # add elem to influence map - input_elems = parse_utils.get_input_fields(elem) - if input_elems is not None: - for el in input_elems: - state.get_or_make_vector(name=parse_utils.get_name(el), store=True) - - -def wire_assignment(state: BranchState, elem: ET.Element, elem_name: str): - """Wires assignment statements to influence map in `state` - - Args: - state: current Branch State - elem: assignment element to be wired - elem_name: element name passed in for convenience - - Returns: - None - - """ - res = parse_utils.get_assignment_statement_dicts(elem) - if res is None: - logger.error(f"Could not obtain any assignments from element {elem_name}") - return - flow_path = state.flow_path - for (operator, entry) in res: - # we could have just return a boolean, but maybe there will be more operators in the future - is_assign = operator == 'Assign' - - # Be aware there is something sneaky going on here: - # if the parse module detects a string literal, it sets - # the influencer name to parse.STRING_LITERAL_TOKEN - # which then the parser module picks up and assigns - # a hardcoded variable type that is not any of the actual - # flow variables, so that it will not appear as a sink - # - # But if you were to refactor this code and not - # use the parse module, then there would be issues - # as string literals might show up as variables. - # Please keep this in mind when you write other - # parse modules for other flow elements that can - # accept stringLiteral types. - # - # Always assign a variable name equal to parse.STRING_LITERAL_TOKEN - # to signify something is a literal value and not a variable. - entry["flow_path"] = flow_path - entry["source_path"] = flow_path - - stmt = DataInfluenceStatement(**entry) - state.propagate_flows(statement=stmt, - assign=is_assign, - store=True) - - -def wire_loop(state: BranchState, elem: ET.Element, elem_name: str): - """Wires collection loop is over to loop variable. 
- - Args: - state: current Branch State - elem: assignment element to be wired - elem_name: element name passed in for convenience - - Returns: - None - - """ - # every loop must have a single collection ref - collection_ref_el = parse.get_by_tag(elem, tagname='collectionReference')[0] - collection_ref_var = collection_ref_el.text - loop_var = elem_name - stmt = DataInfluenceStatement( - influenced_var=loop_var, - influencer_var=collection_ref_var, - element_name=elem_name, - source_text=parse.ET.tostring(collection_ref_el, encoding='unicode', - default_namespace='http://soap.sforce.com/2006/04/metadata'), - line_no=collection_ref_el.sourceline, - comment='assign to loop variable', - flow_path=state.flow_path, - source_path=state.flow_path - ) - state.propagate_flows(statement=stmt, assign=True, store=True) - - -def wire_collection_processor(state: BranchState, elem: ET.Element, elem_name: str): - """Wires collection reference in collection processor to collection elem. - - Args: - state: current Branch State - elem: assignment element to be wired - elem_name: element name passed in for convenience - - Returns: - None - - """ - # every collectionProcessor must have a single collection ref - subtype = parse.get_by_tag(elem, tagname='elementSubtype') - if len(subtype) == 1 and subtype[0].text == 'FilterCollectionProcessor': - collection_el = parse.get_by_tag(elem, tagname='collectionReference')[0] - else: - return - collection_ref_var = collection_el.text - collection_var = elem_name - stmt = DataInfluenceStatement( - influenced_var=collection_var, - influencer_var=collection_ref_var, - element_name=elem_name, - source_text=parse.ET.tostring(collection_el, encoding='unicode', - default_namespace='http://soap.sforce.com/2006/04/metadata'), - line_no=collection_el.sourceline, - comment='collection filter', - flow_path=state.flow_path, - source_path=state.flow_path - ) - state.propagate_flows(statement=stmt, assign=True, store=True) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/public/contracts.py b/packages/code-analyzer-flow-engine/FlowScanner/public/contracts.py index 9aeb1238..76d324ea 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/public/contracts.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/public/contracts.py @@ -6,21 +6,24 @@ """ from __future__ import annotations -from typing import TYPE_CHECKING, Optional from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Optional import public.enums if TYPE_CHECKING: - from public.data_obj import DataInfluencePath, VariableType + from public.data_obj import InfluencePath, VariableType, CrawlStep, Jump import xml.etree.ElementTree as ET -from public.enums import RunMode +from public.enums import RunMode, QueryAction # and import other types as needed to process queries -from public.data_obj import QueryResult, Preset +from public.data_obj import QueryResult, Preset, QueryDescription, InfluencePath + +from typing import TypeAlias +var_t: TypeAlias = tuple[str, str] """ To generate custom queries, implement the QueryPresets class and for each query listed in the preset, implement @@ -29,6 +32,139 @@ or programmatically import if invoking as a module. """ +class AbstractCrawler(ABC): + + @abstractmethod + def get_control_influence_from_source(self, influenced_var: str, + source_var: var_t) ->tuple[var_t, ...] | None: + """Get control influence chain from source to influenced var + + Args: + influenced_var (str): top level (traversable) flow element name in the flow crawled by this current crawler. 
+ source_var (str, str): flow_path, element name in either the current flow or in another flow that may or + may not be an ancestor in the call chain. + + Returns: + None if there is no influence, or a set of crawl steps linking the source to the influenced. + Only a single chain of crawl_steps is returned, there may be other control influence chains. + + """ + pass + + @abstractmethod + def get_crawl_schedule(self) -> tuple[CrawlStep]: + pass + + @abstractmethod + def get_flow_path(self) -> str | None: + pass + + @abstractmethod + def get_crawler_history_unsafe(self) -> list[tuple[AbstractCrawler, int]]: + """READ ONLY. Do not perform any crawlstep loads with these crawlers! + + Returns: + history of crawlers encountered during crawl, together with the current step (int) + when they entered a child flow. + """ + pass + + @abstractmethod + def get_cfg(self) -> AbstractControlFlowGraph: + pass + + @abstractmethod + def get_current_step_index(self)->int: + pass + + + @abstractmethod + def load_crawl_step(self) -> CrawlStep | None: + pass + + @abstractmethod + def get_last_ancestor(self, crawl_step) -> CrawlStep | None: + """Get latest ancestor branch that was last visited at :obj:`CrawlStep` + + Useful for knowing which influence map to clone + + Args: + crawl_step: step whose history is sought + + Returns: + CrawlStep instance or None + + """ + pass + + @abstractmethod + def get_elem_to_crawl_step(self, elem_name: str) -> list[CrawlStep]: + """returns a list of all :obj:`CrawlStep` in which this element has been visited + during the crawl of this flow. If not visited, the empty list is returned. + + Args: + elem_name (str): element name (use '*' for the start element) + + Returns: + list of :obj:`CrawlStep` instances that visit this element + + """ + pass + +class AbstractControlFlowGraph(ABC): + # where to start + @property + @abstractmethod + def start_label(self) -> str: + pass + + # map from segment label -> inbound jumps + @property + @abstractmethod + def inbound(self) -> dict[str, list[Jump]]: + pass + + @property + @abstractmethod + def segment_map(self) -> dict[str, AbstractSegment]: + pass + + +class AbstractSegment(ABC): + # name of element at the start of the segment (jump target) + @property + @abstractmethod + def label(self) -> str: + pass + + # list of (element names, element tags) (including label) in this segment (in order) + @property + @abstractmethod + def traversed(self) -> list[tuple[str, str]]: + pass + + # list of traversal indexes that are subflow elements + @property + @abstractmethod + def subflows(self) -> list[int]: + pass + + @property + @abstractmethod + def jumps(self) -> list[Jump]: + pass + + # whether this segment may end execution + @property + @abstractmethod + def is_terminal(self) -> bool: + pass + + # for tracking whether it has been visited + @property + @abstractmethod + def seen_tokens(self) -> list[tuple[tuple[str, str]]]: + pass class QueryProcessor(ABC): """Queries must implement this class. @@ -94,7 +230,7 @@ def __init__(self) -> None: # should be run, and this preset returned. 
# @abstractmethod - def set_preset_name(self, preset_name: str | None) -> Preset | None: + def set_preset(self, preset_name: str | None) -> Preset | None: """ Args: @@ -110,14 +246,16 @@ def set_preset_name(self, preset_name: str | None) -> Preset | None: @abstractmethod def handle_crawl_element(self, state: State, + crawler: AbstractCrawler, ) -> list[QueryResult] | None: """ Args: state: + crawler: cfg and crawl schedule Returns: - + list of query results """ pass @@ -125,21 +263,23 @@ def handle_crawl_element(self, @abstractmethod def handle_flow_enter(self, state: State, # the state has the flow_path variable + crawler: AbstractCrawler, ) -> list[QueryResult] | None: """Invoked when a flow or subflow is first entered. Args: - state: + state: state instance + crawler: crawl schedule and cfg Returns: - + list of QueryResults """ pass # Called when crawling is complete @abstractmethod def handle_final(self, - all_states: ([State],), + all_states: tuple[State], ) -> list[QueryResult] | None: """Invoked when crawl is complete for the flow and all subflows. @@ -151,6 +291,116 @@ def handle_final(self, """ pass +class AbstractQuery(ABC): + """ + Class representing a standalone query. + + Standalone queries are stateless, so there is no need to reload + after passing to a new flow. + + Standalone queries can only be invoked on a single query action + """ + + @classmethod + def accept(cls, **kwargs) -> list[QueryResult] | None: + """ + The accept method is designed for directly reporting issues discovered + in-line during regular flow processing and not run as the result of running + a query. + + For example, the executor needs to check if a subflow creates a circular reference + in order to ensure that symbolic execution terminates, but the user may also be + looking for this information as a query result. + + Therefore, we ensure that every query has an accept method that takes no + action and in case there is a query (e.g. that checks for subflow circular references) + then this query can override the accept method to handle it. + + The query manager ensures that only instantiated queries can have their + accept method called. 
+ + """ + return None + + @abstractmethod + def get_query_description(self) -> QueryDescription: + pass + + @abstractmethod + def when_to_run(self) -> QueryAction: + pass + + @abstractmethod + def execute(self) -> list[QueryResult] | None: + pass + + +class Query(AbstractQuery, ABC): + + @abstractmethod + def get_query_description(self) -> QueryDescription: + pass + + @abstractmethod + def when_to_run(self) -> QueryAction: + pass + + @abstractmethod + def execute(self, + state: State=None, # the state has the flow_path variable + crawler: AbstractCrawler = None, + all_states = None) -> list[QueryResult] | None: + pass + + + +class LexicalQuery(AbstractQuery, ABC): + + @abstractmethod + def get_query_description(self) -> QueryDescription: + pass + + @abstractmethod + def when_to_run(self) -> QueryAction: + pass + + @abstractmethod + def execute(self, + parser: FlowParser = None, + **kwargs + ) -> list[QueryResult] | None: + pass + + + + +class AbstractFlowVector(ABC): + + @property + @abstractmethod + def property_maps(self) -> dict[InfluencePath, dict[str, set[InfluencePath]]]: + pass + + @classmethod + @abstractmethod + def from_flows(cls, default: set[InfluencePath] = None) -> AbstractFlowVector: + pass + + @abstractmethod + def get_flows_by_prop(self, member_name: str | None = None) -> set[InfluencePath]: + pass + + @abstractmethod + def add_vector(self, vector: AbstractFlowVector) -> AbstractFlowVector: + pass + + + @abstractmethod + def push_via_flow(self, extension_path: InfluencePath, influenced_vec: AbstractFlowVector, + assign: bool = True, + cross_flow: bool = False) -> AbstractFlowVector: + pass + class State(ABC): """Stores DataInfluencePaths in the current execution step @@ -169,7 +419,9 @@ def get_current_elem_name(self) -> str: pass @abstractmethod - def get_flows_from_sources(self, influenced_var: str, source_vars: {(str, str)}) -> set[DataInfluencePath] | None: + def get_flows_from_sources(self, influenced_var: str, + source_vars: set[tuple[str, str]], + restrict: str | None = None) -> set[InfluencePath] | None: pass @abstractmethod @@ -181,6 +433,16 @@ class FlowParser(ABC): """Exposes global information about the current flow """ + + @abstractmethod + def get_all_named_elems(self) -> frozenset[ET.Element] | None: + pass + + + @abstractmethod + def get_all_names(self) -> tuple[str,] | None: + pass + @abstractmethod def get_effective_run_mode(self) -> RunMode: pass @@ -193,6 +455,24 @@ def get_declared_run_mode(self) -> RunMode: def get_api_version(self) -> str: pass + @abstractmethod + def get_all_traversable_flow_elements(self) -> list[ET.Element]: + pass + + @abstractmethod + def get_all_variable_elems(self) -> list[ET.Element] | None: + pass + + @abstractmethod + def get_start_elem(self) -> ET.Element: + pass + + @abstractmethod + def get_traversable_descendents_of_elem(self, elem_name: str) -> list[str]: + """Gets elements that are called (connected to) from elem_name. 
+ Includes the original elem_name""" + pass + @abstractmethod def get_filename(self) -> str: pass @@ -205,6 +485,14 @@ def get_flow_name(self) -> str: def get_flow_type(self)-> public.enums.FlowType: pass + @abstractmethod + def get_trigger_object(self) -> str | None: + pass + + @abstractmethod + def get_trigger_type(self)-> public.enums.TriggerType: + pass + @abstractmethod def get_root(self) -> ET.Element: pass @@ -213,16 +501,27 @@ def get_root(self) -> ET.Element: def get_literal_var(self) -> VariableType: pass + @abstractmethod + def get_action_call_map(self) -> dict[str, list[tuple[str, str]]] | None: + """Gets all actionCalls in the flow element + Returns: actionCall type -> (element name, action name) + """ + pass + + @abstractmethod + def get_async_scheduled_paths(self) -> list[str]: + pass + @abstractmethod def resolve_by_name(self, name: str, path: str | None = None) -> Optional[(str, str, VariableType)]: pass @abstractmethod - def get_output_variables(self, path: str | None = None) -> {(str, str)}: + def get_output_variables(self, path: str | None = None) -> set[tuple[str, str]]: pass @abstractmethod - def get_input_variables(self, path: str | None = None) -> {(str, str)}: + def get_input_variables(self, path: str | None = None) -> set[tuple[str, str]]: """Get Flow variables available for input Returns: (filename, element_name) corresponding to all variables available for input diff --git a/packages/code-analyzer-flow-engine/FlowScanner/public/custom_parser.py b/packages/code-analyzer-flow-engine/FlowScanner/public/custom_parser.py index 7c671111..b8084155 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/public/custom_parser.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/public/custom_parser.py @@ -11,6 +11,12 @@ def get_root(path: str) -> ET.Element: return ET.parse(path, parser=LineNumberingParser()).getroot() +def get_parent_map(root: ET.Element) -> dict[ET.Element, ET.Element]: + parent = {} + for elem in root.iter(): + for child in elem: + parent[child] = elem + return parent def get_root_from_string(byte_str) -> ET.Element: return ET.fromstring(byte_str, parser=LineNumberingParser()) diff --git a/packages/code-analyzer-flow-engine/FlowScanner/public/data_obj.py b/packages/code-analyzer-flow-engine/FlowScanner/public/data_obj.py index 56c857fb..ea5ed973 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/public/data_obj.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/public/data_obj.py @@ -5,34 +5,57 @@ from __future__ import annotations import json +from abc import ABC from dataclasses import dataclass, field from typing import TYPE_CHECKING +import public.enums from public.custom_parser import clean_string if TYPE_CHECKING: - from public.enums import DataType, ReferenceType, Severity + from public.enums import DataType, ReferenceType, Severity, ConnType + +@dataclass(frozen=True) +class JSONSerializable(ABC): + + def to_dict(self): + return {s: getattr(self, s) for s in self.__slots__} @dataclass(frozen=True, eq=True, slots=True) -class DataInfluenceStatement: +class InfluenceStatement: """Represents a statement in which one variable influences another, usually as the result of an assignment, formula or template field, or builtin function. These statement are the basic building blocks of dataflows. """ - # Variable being influenced. This is not assumed to be resolved, + # Variable being influenced. + # + # If a variable, then + # this is not assumed to be resolved, # e.g. 
can be 'foo.bar', so we use "_var" to emphasize this # in the code. Queries and influence maps are performed against # the *name*, which would just be 'foo'. + # If an element (for control influence) + # then this is the element name (not type). influenced_var: str - # Variable doing the influencing. Not assumed to be resolved. + # Variable or element doing the influencing. Not assumed to be resolved. # If this is a lexical query, you can omit the influencer + # If this is a control influence, use the element name or variable name + # being controlled. influencer_var: str | None # (Top Level) Flow element containing the influence. + # For control influence of one element influencing another, + # this would be the element containing + # the connector that points to the controlled element. + # + # For control influence of one variable control influencing the + # value of another, then this would be the element in which the control + # is defined. + # # Note this may be a huge element, so we want # more specific information later element_name: str @@ -122,7 +145,7 @@ class Preset: # specify which dataflow queries are run on each Flow Element # with enough information for users to understand the significance of not finding any issues # for that query - queries: {QueryDescription} + queries: set[QueryDescription] def to_dict(self): return {s: str(getattr(self, s)) for s in self.__slots__} @@ -161,7 +184,7 @@ def to_dict(self): return {s: str(getattr(self, s)) for s in self.__slots__} -@dataclass(frozen=True, eq=True) +@dataclass(frozen=True) class QueryResult: """The QueryProcessor performs only local analysis, for example searching for whether variables *within* a given Flow Element are assigned to a @@ -181,20 +204,118 @@ class QueryResult: passed is a pair of objects: a data influence statement and a list of flows. """ - # which query from the preset this is a result for + # which query this is a result for query_id: str - # Created by the QueryProcessor as a result of parsing the Flow Element - influence_statement: DataInfluenceStatement + # Type of flow (screen, trigger, etc) this result applies to. + # Flow type is carried from parent to child, so if a screen flow + # calls an auto-launched flow, issues found in the subflow will still + # inherit a flow_type of screen flow. This simplifies auditing and interpretation + # of results. + flow_type: public.enums.FlowType + + # Created by the QueryProcessor as a result of parsing the Flow Element. + # Is intended to be the final passage into the sink. + # the destination is the sink, and the sink's source code + # is presented. + influence_statement: InfluenceStatement | None = None # Provided by State # Only provide no paths if this is a lexical query or if the sink # and source are in the same (local) element - paths: frozenset[DataInfluencePath] or None + paths: frozenset[InfluencePath] | None = None + + # + # + # The following fields are only needed when there is no + # influence statement + elem_code: str | None = None + + elem_line_no: int | None = None + + # name of element (top level) + elem_name: str | None = None + + # name of field, within element (optional) + field: str | None = None + + # filename (required only for lexical) + filename: str | None = None + + def __hash__(self): + """Prior python 3.12, hash of None was volatile, and we need to support these + versions. 
+ """ + if self.influence_statement is None: + infl = '#' + else: + infl = self.influence_statement + + if self.paths is None: + paths ='#' + else: + paths = self.paths + + if self.elem_code is None: + elem_code = '#' + else: + elem_code = self.elem_code + + if self.elem_line_no is None: + elem_line_no = '#' + else: + elem_line_no = self.elem_line_no + + if self.elem_name is None: + elem_name = '#' + else: + elem_name = self.elem_name + + if self.field is None: + field = '#' + else: + field = self.field + + if self.filename is None: + filename = '#' + else: + filename = self.filename + + return hash((infl, elem_code, elem_line_no, elem_name, field, filename,paths, self.query_id)) + + def __eq__(self, other): + if not isinstance(other, QueryResult): + return NotImplemented + + if self.query_id != other.query_id: + return False + + if self.influence_statement != other.influence_statement: + return False + + if self.paths != other.paths: + return False + + if self.elem_code != other.elem_code: + return False + + if self.elem_line_no != other.elem_line_no: + return False + + if self.elem_name != other.elem_name: + return False + + if self.field != other.field: + return False + + if self.filename != other.filename: + return False + + return True @dataclass(frozen=True, eq=True, slots=True) -class DataInfluencePath: +class InfluencePath: """Represents a data influence between two *named* elements, with a history of influence statements explaining the influence. @@ -226,12 +347,16 @@ class DataInfluencePath: builders to ensure data consistency. TODO: add support for labels. """ - # tuple of DataInfluenceStatements. This is what is sent to the + # tuple of InfluenceStatements. This is what is sent to the # results processor and displayed to end users. - history: (DataInfluenceStatement,) + history: tuple[InfluenceStatement, ...] # influenced name. (see 'property'). This is not the same - # as the variable name in the DataInfluenceStatement + # as the variable name in the InfluenceStatement. + # + # This could be the name of an element (for control) + # or the name of a variable (for data or control). In the case + # of a variable, do not include the property influenced_name: str # If the influence path influences a specific property @@ -264,9 +389,9 @@ class DataInfluencePath: influencer_filepath: str # type info about the influenced element - influenced_type_info: VariableType + influenced_type_info: VariableType | None - def report_influence_tuples(self) -> list[(str, str)]: + def report_influence_tuples(self) -> list[tuple[str, str]]: """Returns simple chain of variables for high level analysis Returns: @@ -301,21 +426,21 @@ def short_report(self, arrows: bool = False, filenames: bool = False) -> str: Returns: string containing summary report. 
""" - if arrows is True: + if arrows: joiner = "->" else: joiner = "," - if filenames is False: + if not filenames: s = joiner.join([s[1] for s in self.report_influence_tuples()]) else: s = joiner.join(f"{s[1]}(path:{s[0]})" for s in self.report_influence_tuples()) return s @classmethod - def combine(cls, start_flow: DataInfluencePath, end_flow: DataInfluencePath, + def combine(cls, start_flow: InfluencePath, end_flow: InfluencePath, cross_flow: bool = False, - type_override: VariableType | None = None) -> DataInfluencePath: + type_override: VariableType | None = None) -> InfluencePath: """Combine two paths Args: @@ -334,7 +459,7 @@ def combine(cls, start_flow: DataInfluencePath, end_flow: DataInfluencePath, dataflows will have different names and filenames. """ - if cross_flow is False: + if not cross_flow: if start_flow.influenced_name != end_flow.influencer_name: raise ValueError("Attempting to append an incompatible dataflow." f"statement influencer: {end_flow.influencer_name} " @@ -347,41 +472,77 @@ def combine(cls, start_flow: DataInfluencePath, end_flow: DataInfluencePath, pass new_history = start_flow.history + end_flow.history - return DataInfluencePath(history=new_history, - influencer_name=start_flow.influencer_name, - influenced_name=end_flow.influenced_name, - influencer_filepath=start_flow.influencer_filepath, - influenced_filepath=end_flow.influenced_filepath, - influenced_type_info=type_override or end_flow.influenced_type_info, - influenced_property=end_flow.influenced_property, - influencer_property=start_flow.influencer_property - ) + return InfluencePath(history=new_history, + influencer_name=start_flow.influencer_name, + influenced_name=end_flow.influenced_name, + influencer_filepath=start_flow.influencer_filepath, + influenced_filepath=end_flow.influenced_filepath, + influenced_type_info=type_override or end_flow.influenced_type_info, + influenced_property=end_flow.influenced_property, + influencer_property=start_flow.influencer_property + ) @dataclass(frozen=True, eq=True, slots=True) class BranchVisitor: current_label: str previous_label: str | None - token: str | None = None - history: ((str, str),) = field(default_factory=tuple) + loop_context: tuple[tuple[str, ConnType],...] = field(default_factory=tuple) + + #: tuple of jumps between segments: ( (src, target), (src, target), ... ) + history: tuple[tuple[str,str], ...] = field(default_factory=tuple) + + #: list of (jmp src, jpm target) when visitor was spawned + token: tuple[tuple[str,str], ...] 
| None = None def to_dict(self): return {s: str(getattr(self, s)) for s in self.__slots__} - @dataclass(frozen=True, eq=True, slots=True) class CrawlStep: step: int visitor: BranchVisitor element_name: str + element_tag: str + local_index: int = 0 # position within segment def to_dict(self): return {s: getattr(self, s) for s in self.__slots__} +@dataclass(frozen=True, eq=True, slots=True) +class Jump(JSONSerializable): + """Class representing a connector + + """ + # name of element where jump is located + src_name: str + + # where connector points to + target: str + + # true if goto connector + is_goto: bool + + # true if next-value + is_loop: bool + + # true if no more values connector + is_no_more_values: bool + + # true if fault connector + is_fault: bool + + def priority(self) -> int: + # lower is higher priority + if self.is_loop: + return 0 + else: + return 1 + class InfluenceStatementEncoder(json.JSONEncoder): def default(self, obj): - if isinstance(obj, DataInfluenceStatement): + if isinstance(obj, InfluenceStatement): raw_dict = obj.to_dict() # For public display, we replace flow_path with source_path # to correctly display transmission elements @@ -402,12 +563,12 @@ def default(self, obj): return json.JSONEncoder.default(self, obj) -def _get_end_vars(df: DataInfluencePath) -> (str, str): +def _get_end_vars(df: InfluencePath) -> tuple[str, str]: return (_recover_var(df.influencer_name, df.influencer_property), _recover_var(df.influenced_name, df.influenced_property)) -def _recover_var(name, prop) -> (str, str): +def _recover_var(name: str, prop: str) -> str: if prop is None: return name else: diff --git a/packages/code-analyzer-flow-engine/FlowScanner/public/enums.py b/packages/code-analyzer-flow-engine/FlowScanner/public/enums.py index c9519add..d944f389 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/public/enums.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/public/enums.py @@ -1,17 +1,92 @@ """Public Enum types """ -from enum import Enum +from enum import Enum, EnumMeta + +class MetaEnum(EnumMeta): + def __contains__(cls, item): + if isinstance(item, str): + try: + cls(item.lower()) + except ValueError: + return False + return True + return super.__contains__(cls, item) + +class BaseEnum(Enum, metaclass=MetaEnum): + pass + +class TransformType(BaseEnum): + Map = "map" + InnerJoin = "innerjoin" + Sum = "sum" + Count = "count" + +class ComplexValueType(BaseEnum): + """Class the identifies JSON structure schema of complexValue + """ + """ + "resourceTemplate": text_template (with merge-fields) + """ + ResourceDescriptor = "resourcedescriptor" + + """ + "name": "myAccVar.Name", + "resourceType": "SObjectField", + "resourceName": "Account", + "resourceField": "Name", + "collection": false + """ + ResourceAnnotationMap = "resourceannotationmap" + + """ + "dataType": "SObject", + "objectType": "MyObj__c", + "fieldReferences": ["FieldA__c", "FieldB__c"], + "elementReference": "Get_Override_Time_Entries" + """ + FieldReference = "fieldreference" + + """ + "dataType": "SObject", + "objectType": "MyObj__c", + "fieldReferences": ["FieldA__c", "FieldB__c"], + "elementReference": "Get_Override_Time_Entries" + """ + ComplexObjectFieldDetails = "complexobjectfielddetails" + + """ + "leftElementReference":"ContentVersions", + "leftJoinKeys":["Id"], + "leftSelectedFields":["ContentDocumentId"], + "rightElementReference":"Deserialize_File_Upload.fileUpload.files", + "rightJoinKeys":["contentVersionId"], + "rightSelectedFields":["name"] + """ + JoinDefinition = 
"joindefinition" class FlowType(Enum): + Screen = 0 AutoLaunched = 1 - RecordTrigger = 2 - Scheduled = 3 + Trigger = 2 ProcessBuilder = 4 Workflow = 5 InvocableProcess = 6 + Orchestrator = 8 + Unknown = 10 + +class TriggerType(Enum): + RecordAfterSave = 1 + Capability = 2 + Scheduled = 3 + RecordBeforeSave = 4 + RecordBeforeDelete = 5 + PlatformEvent = 6 + Segment = 7 + NotTrigger = 9 + Unknown = 10 class FlowValue(Enum): @@ -34,8 +109,16 @@ class DataType(Enum): class ConnType(Enum): Loop = 1 # only for nextValue connectors (for loop unrolling) Goto = 2 # all connectors labelled goto - Other = 3 # everything else (including noMoreValue connectors) + Exception = 3 # Fault and Timeout connectors + TriggeredAction = 4 # event to trigger action + ScreenLoadAction = 5 # action when screen is loaded + Other = 10 # everything else (including noMoreValue connectors) +class QueryAction(Enum): + lexical = 0 # can only access parser + flow_enter = 20 # can access CFG, crawl schedule + process_elem = 10 # called on each flow element + scan_exit = 30 # called when flow crawl complete class ReferenceType(Enum): # this is a variable holding the value @@ -53,11 +136,16 @@ class ReferenceType(Enum): # A reference to a variable passed in from an action call, so foo.var refers to var in apex. ActionCallReference = 4 - # A reference to a named Flow Element that is not a subflow or loop or collection ref. + # A reference to a named Flow Element holding a value + # that is not a subflow or loop or collection ref. ElementReference = 5 + # A reference to an element that does not itself hold + # a value but may be needed for reporting and tracking. + NodeReference = 6 + # A constant - Constant = 6 + Constant = 10 # Global Global = 7 diff --git a/packages/code-analyzer-flow-engine/FlowScanner/public/flowtest_exceptions.py b/packages/code-analyzer-flow-engine/FlowScanner/public/flow_scanner_exceptions.py similarity index 84% rename from packages/code-analyzer-flow-engine/FlowScanner/public/flowtest_exceptions.py rename to packages/code-analyzer-flow-engine/FlowScanner/public/flow_scanner_exceptions.py index 09c84d0b..b17778be 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/public/flowtest_exceptions.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/public/flow_scanner_exceptions.py @@ -1,11 +1,11 @@ -class FlowtestException(Exception): +class FlowScannerException(Exception): """base class for all exceptions raised by Flowtest""" def __init__(self, *args): super().__init__(*args) -class InvalidFlowException(FlowtestException): +class InvalidFlowException(FlowScannerException): """Raised when there is something wrong with the flow file, such as invalid xml, invalid filename, missing structure, etc.""" def __init__(self, *args, **kwargs): diff --git a/packages/code-analyzer-flow-engine/FlowScanner/public/parse_utils.py b/packages/code-analyzer-flow-engine/FlowScanner/public/parse_utils.py index 85a03884..e76ccc45 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/public/parse_utils.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/public/parse_utils.py @@ -15,13 +15,16 @@ """ from __future__ import annotations +import json import logging +import os import re +import traceback +from typing import Callable import public.custom_parser as CP from public.custom_parser import ET -from public.enums import DataType, ConnType -from public.flowtest_exceptions import InvalidFlowException +from public.enums import DataType, ConnType, TransformType, ComplexValueType #: sfdc namespace ns = 
'{http://soap.sforce.com/2006/04/metadata}' @@ -53,32 +56,127 @@ #: list of all known connector tags CONN_LIST = [CONNECTOR, DEFAULT_CONN, NEXT_VALUE_CONNECTOR, FAULT_CONNECTOR, NO_MORE_CONN, TIMEOUT_CONNECTOR] +#: List of global namespaces that correspond to a single record or variable, +#: such as $Record or $User +GLOBALS_RECORD = ["$Record", "$Record__Prior", "$User", "$Event"] + +#: List of global namespaces that correspond to a set of records +#: such as $Input, $Output, $Setup. These must be accessed as $Input.myobj.field or $Input.my_var +GLOBALS_RECORD_GROUP = ["$Input", "$Output", "$Setup", "$CustomMetadata"] + + +#: List of all global namespaces that contain unrelated values that have no object/member relationship with each other. +GLOBALS_SINGLETON = ["$Api", "$Client", "$Flow", "$Label", "$Permission", + "$Organization", "$Profile", "$System", "$UserRole"] + +ALL_GLOBALS = GLOBALS_RECORD + GLOBALS_SINGLETON + GLOBALS_RECORD_GROUP + #: list of all (supported) elements that have a connector except start. #: These are relevant for control flow. - -CTRL_FLOW_ELEM = ["screens", +CTRL_FLOW_ELEM = ["actionCalls", + "apexPluginCalls", "assignments", - "customErrors" - "recordLookups", - "subflows", - "recordUpdates", - "recordDeletes", - "recordCreates", - "loops", - "decisions", "collectionProcessors", - "actionCalls", + "customErrors", + "decisions", + "loops", "orchestratedStages", - "waits", - "apexPluginCalls", + "recordLookups", + "recordCreates", + "recordDeletes", + "recordLookups", + "recordRollbacks", + "recordUpdates", + "screens", + "subflows", "transforms", - "recordRollbacks"] + "waits" + ] #: list of supported start elements START_ELEMS = ['start', 'startElementReference'] +START_ELEMS_TAGGED = ['', ''] #: list of banned elements (we may add support later, but now skip these flows) -BANNED_ELEMS = ['startElement', 'connectors', 'allocators', 'questions', 'experiments', 'statements'] +#: these correspond workflows and older flow grammars +BANNED_ELEMS = ['startElement', + 'connectors', + 'allocators', + 'questions', + 'experiments', + 'source', + 'target', + 'statements'] + +BANNED_ELEMS_TAGGED = ['', + '', + '', + '', + '', + '', + '', + ''] + + +#: List of all resource tags +RESOURCE_TAGS = [ + 'dynamicChoiceSets', + 'choices', + 'variables', + 'constants', + 'formulas', + 'textTemplates' + ] + +#: list of all tags that might hold a reference to a resource as text content +DIRECT_REF_HOLDERS = [ + "assignNextValueToReference", + "assignToReference", + "assignRecordIdToReference", + "choiceReferences", + "collectionReference", + "defaultSelectedChoiceReference", + "elementReference", + "eventSource", + "field", + "inputReference", + "leftValueReference" + "objectFieldReference", + "outputFieldApiName", + "outputReference", + ] + +EXPRESSION_REF_HOLDERS = [ + # values here can be in expressions or templates + # but will appear as merge-fields and must be extracted + "text", # in text template + "fieldText", + "choiceText", + "stringValue" + "expression", # in formula + "formulaExpression" # has no name and so cannot be referenced +] + +SCREEN_FIELD_TYPES = [ + 'ComponentChoice', + 'ComponentInput', + 'ComponentInstance', + 'ComponentMultiChoice' + 'DisplayText', + 'DropdownBox', + 'InputField', + 'LargeTextArea', + 'MultiSelectCheckboxes', + 'MultiSelectPicklist', + 'ObjectProvided', # --> objectFieldReference (e.g. 
Foo.my_field) assigns contents to var + 'PasswordField', + 'RadioButtons', + 'Region', + 'RegionContainer', + 'Repeater' + ] + +# Then need to also check `value`, `rightValue` tags #: module logger logger = logging.getLogger(__name__) @@ -110,6 +208,18 @@ def get_tag(elem: ET.Element) -> str: else: return '' +def get_text_of_tag(elem: ET.Element, tagname: str) -> str | None: + """look for a single child elem (does not recurse) with the specified tagname and return the text. + returns None if there is not exactly one child elem with the specified tagname or if it has no text.""" + res = get_by_tag(elem, tagname) + if len(res) == 1 and res[0] is not None: + r = res[0].text + if r is None or r == '': + return None + else: + return r + return None + def is_subflow(elem: ET.Element) -> bool: if elem is None: @@ -131,20 +241,22 @@ def is_loop(elem: ET.Element) -> bool: return elem.tag.endswith("loops") -def is_goto_connector(elem: ET.Element) -> bool: +def is_goto_connector(elem: ET.Element) -> bool | None: """Is this element a goto? Args: elem: connector element Returns: - whether this is a goto element + whether this is a goto element, + None if child has no tag or no children """ for child in elem: if get_tag(child) == 'isGoTo': return child.text == 'true' else: return False + return None def is_decision(elem: ET.Element) -> bool: @@ -172,7 +284,7 @@ def get_by_tag(elem: ET.Element, tagname: str) -> list[ET.Element]: XML Elements else [] if no matches """ - return elem.findall(f'{ns}{tagname}') + return elem.findall(f'./{ns}{tagname}') def get_named_elems(elem: ET.Element) -> list[ET.Element]: @@ -210,9 +322,20 @@ def get_elem_string(elem: ET.Element) -> str | None: def get_line_no(elem: ET.Element) -> int: + # noinspection PyUnresolvedReferences return elem.sourceline +def get_start_element(root: ET.Element) -> ET.Element | None: + start_elements = START_ELEMS + start_res = {x: get_by_tag(root, x) for x in start_elements} + + for key in start_res: + if len(start_res[key]) == 1: + return start_res[key][0] + return None + + def get_subflow_name(subflow): sub_name_el = get_by_tag(subflow, "flowName") if sub_name_el is None or len(sub_name_el) == 0: @@ -224,8 +347,7 @@ def get_subflow_name(subflow): return sub_name_el[0].text - -def get_assignment_statement_dicts(elem: ET.Element) -> list[(str, {str: str})] | None: +def get_assignment_statement_dicts(elem: ET.Element) -> list[tuple[str, dict[str, str]]] | None: """Returns assignment statement keywords in 'assignments' elems Args: elem: elem to parse, should have a tag of "assignments" @@ -249,7 +371,7 @@ def get_assignment_statement_dicts(elem: ET.Element) -> list[(str, {str: str})] return None -def get_filters(elem: ET.Element) -> [ET.Element]: +def get_filters(elem: ET.Element) -> list[ET.Element]: """Find all filter elements Searches recursively to find all elements that are children @@ -264,8 +386,291 @@ def get_filters(elem: ET.Element) -> [ET.Element]: """ return elem.findall(f'.//{ns}filters') +def get_transform_influencers(transform_elem: ET.Element) -> list[tuple[TransformType,str|None,tuple[str, ...]]] | None: + """Converts transform elem to a list of tuples [(transform_type, outputAPI field (or None), tuple(influencer_names)] + Args: + transform_elem: top level elem to process -def get_input_assignments(elem: ET.Element) -> [ET.Element]: + Returns: + [(transform_type, influenced_name, tuple(influencer_names, ...))] + + """ + if transform_elem is None: + logger.error("called get_transform_influencers will null element") + return None + 
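+    # Illustrative sketch of the return shape (element and field names below are
+    # hypothetical): a Map action writing output field 'Name__c' from
+    # 'Get_Records.Name' comes back as [('Map', 'Name__c', ('Get_Records.Name',))],
+    # i.e. (transform type, output API field or None, tuple of influencer names).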
values = get_by_tag(transform_elem,'transformValues') + if len(values) == 0: + return None + + output = [] + join_name = None + join_def = None + try: + # first look for meta + for t_value in values: + # first look for meta + value_ref = get_text_of_tag(t_value, 'transformValueName') + + if value_ref is not None: + join_name = value_ref + # this is a join definition, so grab it + val_actions = get_by_tag(t_value, 'transformValueActions') + assert len(val_actions) == 1 + assert get_text_of_tag(val_actions[0], 'transformType') == 'InnerJoin' + val_elem = val_actions[0].find(f'./{ns}inputParameters/{ns}value') + join_def = get_vars_from_value(val_elem) + break + + # Now look for other tags + for t_value in values: + val_actions = get_by_tag(t_value, 'transformValueActions') + + for val_action in val_actions: + output_api = get_text_of_tag(val_action, 'outputFieldApiName') # could be None + action_type = get_text_of_tag(val_action, 'transformType') + + if action_type == 'InnerJoin': + # already processed this + continue + + elif action_type == 'Map': + value_el = get_by_tag(val_action, 'value')[0] + res = get_vars_from_value(value_el) + if res is not None: + t_value = dict.get(res, 'transformValueReference', None) + + if res is not None and t_value is not None: + assert join_name is not None and join_def is not None + + # noinspection PyUnresolvedReferences + left_table = join_def['complexValueType.JoinDefinition.leftElementReference'][0] + # noinspection PyUnresolvedReferences + right_table = join_def['complexValueType.JoinDefinition.rightElementReference'][0] + + # noinspection PyTypeChecker + fixed = t_value[0].replace(f"{join_name}.LeftTable", + left_table).replace(f"{join_name}.RightTable", + right_table) + output.append(('Map', output_api, (fixed,))) + + elif res is not None: + accum = () + for val in res.values(): + accum += tuple(val) + output.append(('Map', output_api, accum)) + + elif action_type == 'Sum' or action_type == 'Count': + elem_top = None + field = None + action_to_return = [] + + for input_ in get_by_tag(val_action, 'inputParameters'): + input_name = get_text_of_tag(input_, 'name') + + if input_name == 'aggregationField': + field = input_.find(f'./{ns}value/{ns}stringValue').text + + elif input_name == 'aggregationValues': + elem_top = input_.find(f'./{ns}value/{ns}elementReference').text + + elif input_name == 'aggregationFieldReference': + res = get_vars_from_value(input_.find(f'./{ns}value')) + to_return = (action_type, output_api, tuple(res['complexValue.FieldReference'])) + action_to_return.append(to_return) + break + + if not action_to_return: + assert elem_top is not None + + if field is None: + action_to_return.append( + (action_type, output_api, (elem_top,)) + ) + else: + action_to_return.append( + (action_type, output_api, (f"{elem_top}.{field}",)) + ) + + output += action_to_return + + return output + + except: + logger.critical(f"could not parse transform {get_elem_string(transform_elem)}\n" + f"{traceback.format_exc()}") + return None + +def get_vars_from_value(elem: ET.Element, + expr_parser :Callable[[str], list[str]]=parse_expression) -> dict[str, list[str]] | None: + """accepts , , or element and returns a list + of variables that influence this element. + * The variables are not normalized, e.g. "foo.Name" will appear. 
+ * In the case of inner join complex values, further processing + is needed to resolve the join tables + + Args: + expr_parser (callable): method to parse expressions (default regexp is provided) + elem: (ET.Element): element + + Returns: + a dict tagname: list[variable names] + where tagname is the tag of the child element of value holding the reference unless + this is a complexValue, in which case the tagname contains refined information: + 'ComplexValueType.FieldReference': ['var1', 'var2'] + 'ComplexValueType.FieldReference': ['var1', 'var2'] + 'ComplexValueType.JoinDefinition.leftJoinKeys: ['var1', 'var2'] + 'ComplexValueType.JoinDefinition.rightJoinKeys: ['var1', 'var2'] + 'ComplexValueType.JoinDefinition.leftElementReference': ['var1'] + 'ComplexValueType.JoinDefinition.rightElementReference': ['var1'] + 'ComplexValueType.JoinDefinition.leftSelectedFields': ['var1'] + 'ComplexValueType.JoinDefinition.rightSelectedFields': ['var1'] + + If there are no variable influencers, the None is returned. + """ + if elem is None: + logger.error("called 'get_vars_from_value' with null input") + return None + + for child_el in elem: + child_tag = get_tag(child_el) + + if child_tag == 'collectionElements': + for el in child_el: + el_tag = get_tag(el) + res = _process_val_child(el, el_tag=el_tag, parent_el=child_el, expr_parser=expr_parser) + if res is not None: + return res + return None + + res = _process_val_child(child_el, el_tag=child_tag,parent_el=elem, expr_parser=expr_parser) + if res is not None: + return res + + # fall through + return None + +def _process_val_child(elem: ET.Element, el_tag: str, parent_el: ET.Element, + expr_parser :Callable[[str], list[str]]=parse_expression) -> dict[str, list[str]] | None: + + raw_data = elem.text + if raw_data is None or len(raw_data) == 0: + return None + + data = rid_item(raw_data) + if data is None or len(data) == 0: + return None + + if el_tag == 'elementReference': + return {el_tag:[data]} + + elif el_tag == 'stringValue' or el_tag == 'formulaExpression': + # this may be a formula + vars_ = expr_parser(data) + if len(vars_) > 0: + return {el_tag: vars_} + else: + return None + + elif el_tag == 'complexValue': + # get the complex value type + t_type = get_text_of_tag(parent_el, 'complexValueType') + if t_type is None or len(t_type) == 0 or t_type not in ComplexValueType: + return None + try: + type_dict = json.loads(data) + except: + logger.error(f"could not de-serialize complex value type {data}") + return None + + try: + """ + if t_type == ComplexValueType.ComplexObjectFieldDetails.name: + # these are used to specify labels in datatables flow extension + # in screen flows but do not correspond to actual flow variables. 
+ pass + """ + if t_type == ComplexValueType.FieldReference.name: + # used in aggregation transforms such as sum and count transforms + field_refs = dict.get(type_dict, "fieldReferences", None) + elem_ref = dict.get(type_dict, "elementReference", None) + if (field_refs is None or len(field_refs) == 0) and ( + elem_ref is None or len(elem_ref) == 0): + return None + + elif field_refs is None or len(field_refs)==0: + to_add = [elem_ref] + else: + to_add = [f"{elem_ref}.{f}" for f in field_refs] + + return {'complexValue.FieldReference': to_add} + + elif t_type == ComplexValueType.JoinDefinition.name: + left_el_ref = dict.get(type_dict, "leftElementReference", None) + left_join_keys = dict.get(type_dict, "leftJoinKeys", None) + left_selected_fields = dict.get(type_dict, "leftSelectedFields", []) + right_el_ref = dict.get(type_dict, "rightElementReference", None) + right_join_keys = dict.get(type_dict,"rightJoinKeys", None) + right_selected_keys = dict.get(type_dict,"rightSelectedFields", []) + + to_return = { + "complexValueType.JoinDefinition.leftJoinKeys": [ + f"{left_el_ref}.{x}" for x in left_join_keys], + "complexValueType.JoinDefinition.rightJoinKeys": [ + f"{right_el_ref}.{x}" for x in right_join_keys], + "complexValueType.JoinDefinition.leftSelectedFields": [ + f"{left_el_ref}.{x}" for x in left_selected_fields], + "complexValueType.JoinDefinition.rightSelectedFields": [ + f"{right_el_ref}.{x}" for x in right_selected_keys], + "complexValueType.JoinDefinition.leftElementReference": [ + left_el_ref + ], + "complexValueType.JoinDefinition.rightElementReference": [ + right_el_ref + ] + } + return to_return + + elif t_type == ComplexValueType.ResourceDescriptor.name: + var_str = dict.get(type_dict, "resourceTemplate", '') + vars_ = expr_parser(var_str) + if len(vars_) > 0: + return {'complexValue.ResourceDescriptor': vars_} + else: + return None + + elif t_type == ComplexValueType.ResourceAnnotationMap.name: + var_str = dict.get(type_dict, "name", '') + vars_ = expr_parser(var_str) + if len(vars_) > 0: + return {'complexValue.ResourceAnnotationMap': vars_} + else: + return None + + except: + logger.error(f"could not process complex value type {data}") + return None + + elif el_tag == 'transformValueReference': + return {'transformValueReference': [data]} + + elif el_tag in ['apexValue', 'sobjectValue']: + try: + parsed = json.loads(data) + to_return = [] + recursive_parse(parsed, parse_callable=expr_parser, accum=to_return) + if len(to_return) == 0: + return None + else: + return {el_tag: to_return} + + except: + logger.error(f"could not json parse data {data} in tag {el_tag}") + + # fall through + return None + + +def get_input_assignments(elem: ET.Element) -> list[ET.Element]: """Find all input assignments Searches recursively to find all elements that are children @@ -281,7 +686,7 @@ def get_input_assignments(elem: ET.Element) -> [ET.Element]: return elem.findall(f'.//{ns}inputAssignments') -def get_sinks_from_field_values(elems: ET.Element) -> list[(str, str)]: +def get_sinks_from_field_values(elems: list[ET.Element]) -> list[tuple[str, str | None, str]]: """Find variables that flow into field/value pairs E.g.if a recordLookup field has a filter:: @@ -294,7 +699,7 @@ def get_sinks_from_field_values(elems: ET.Element) -> list[(str, str)]: - then this would return [('Name', 'var3')] + then this would return [('Name', 'Contains', 'var3')] This strategy also works for inputAssignments:: @@ -305,38 +710,96 @@ def get_sinks_from_field_values(elems: ET.Element) -> list[(str, str)]: + then 
this would return [('Company', None, 'Company')] + Notes: - TODO: we are cheating a bit by not checking for op code in the case of filters. This should be added later. Args: elems: inputAssignment or field selection criteria xml elements. Returns: - ``list[(field_name, influencer_name)]`` (an empty list if no sinks are found) + ``list[(field_name, op, influencer_name)]`` (an empty list if no sinks are found) """ accum = [] for a_filter in elems: field_name = None influencer = None + operator = None for child in a_filter: + child_tag = get_tag(child) if child_tag == 'field': field_name = child.text + if child_tag == 'operator': + if child.text is None or len(child.text) == 0: + operator = None + else: + operator = child.text + if child_tag == 'value': for e_ref in child: if get_tag(e_ref) == 'elementReference': influencer = e_ref.text if influencer is not None and field_name is not None: - accum.append((field_name, influencer)) + accum.append((field_name, operator, influencer)) + + return accum + +def process_output_assignments(elem: ET.Element) -> list[tuple[str, str]]: + """Searches elem recursively and pulls out doubles of the form: + + WorkItemID + Id + + + returning a list of doubles [('Id', 'WorkItemID')] + if none found, it returns the empty list [] + + :param elem: to search (recursively) + :return: list of triples (influencer field, (influenced) assignTo field) + + """ + elems = elem.findall(f'.//{ns}outputAssignments') + accum = [] + for elem in elems: + influencer = None + influenced = None + for child in elem: + if child.tag == f'{ns}assignToReference' and child.text is not None: + influenced = child.text + if child.tag == f'{ns}field' and child.text is not None: + influencer = child.text + if influencer is not None and influenced is not None: + accum.append((influencer, influenced)) return accum +def get_field_op_values_from_elem(elem: ET.Element, tag: str) -> list[tuple[str, str | None, str]]: + """ + Searches elem recursively for tag, and the pull-out triples of the form: + + foo + Contains + + bar + + returning a list of triples [('foo', 'Contains', 'bar')] -def get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool)} or None: + if none found, it returns the empty list + + :param elem: to search (recursively) + :param tag: tag that must be a descendent of elem + :return: list of triples (field_name, operator, influencer_name) + """ + + elems = elem.findall(f'.//{ns}{tag}') + return get_sinks_from_field_values(elems) + +def get_conn_target_map(elem: ET.Element) -> dict[ET.Element, tuple[str, ConnType, bool]] | None: """Get a connector map that also works for all possible start elements Args: @@ -361,6 +824,8 @@ def get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool)} elif tag == 'start': standard_connectors = _get_conn_target_map(elem) + + # Now look for scheduled paths scheduled_paths = elem.findall(f'.//{ns}scheduledPaths/{ns}connector') if scheduled_paths is None or len(scheduled_paths) == 0: return standard_connectors @@ -369,6 +834,7 @@ def get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool)} try: conn_name = x.find('.//{ns}targetReference').text standard_connectors[x] = (conn_name, ConnType.Other, False) + # noinspection PyBroadException except: continue return standard_connectors @@ -376,7 +842,7 @@ def get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool)} return _get_conn_target_map(elem) -def _get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool)}: +def 
_get_conn_target_map(elem: ET.Element) -> dict[ET.Element, tuple[str, ConnType, bool]]: """returns map from connectors at elem to where they point Args: @@ -389,32 +855,38 @@ def _get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool) return {} to_return = {} el_tag = get_tag(elem) - + is_optional = False # start with this and then override + missing_connector = False if el_tag == 'decisions': - is_decision_ = True - else: - is_decision_ = False + + rules_els = get_by_tag(elem, 'rules') + for rule in rules_els: + conn = get_by_tag(rule, 'connector') + if not conn: + # if there is a condition with no connector + # then this element can terminate execution + # when the condition is met + missing_connector = True + break for conn_type in CONN_LIST: cons = elem.findall(f'.//{ns}{conn_type}') if cons is not None and len(cons) > 0: for x in cons: - if is_decision_ is True: - # in a decision, only default connectors are not optional - if conn_type == DEFAULT_CONN: - is_optional = False - else: - is_optional = True - else: - if conn_type in [FAULT_CONNECTOR, TIMEOUT_CONNECTOR]: - is_optional = True - else: - is_optional = False + if conn_type in [FAULT_CONNECTOR, TIMEOUT_CONNECTOR, NEXT_VALUE_CONNECTOR]: + is_optional = True + if conn_type == NO_MORE_CONN: + is_optional = False + if (el_tag == 'decisions' and (missing_connector is True or + conn_type != DEFAULT_CONN)): + # connectors are optional if they are not default + # or if they are default and a rule is missing a connector + is_optional = True res = get_by_tag(elem=x, tagname='targetReference') if res is None or len(res) == 0: logger.error(f"ERROR: found a connector without a target reference! " - f"{ET.tostring(elem, encoding='unicode')}") + f"{get_elem_string(elem)}") continue else: # don't overwrite existing value -- each connector should have a single target reference @@ -424,12 +896,14 @@ def _get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool) # classify connector if is_goto_connector(x): # this takes priority - to_return[x] = (target_name, ConnType.Goto, is_optional) elif conn_type == NEXT_VALUE_CONNECTOR: to_return[x] = (res[0].text, ConnType.Loop, is_optional) + elif conn_type == FAULT_CONNECTOR or conn_type == TIMEOUT_CONNECTOR: + to_return[x] = (res[0].text, ConnType.Exception, is_optional) + else: to_return[x] = (res[0].text, ConnType.Other, is_optional) @@ -441,7 +915,6 @@ def _get_conn_target_map(elem: ET.Element) -> {ET.Element: (str, ConnType, bool) # Utilities for parsing variables # - def is_assign_null(elem: ET.Element) -> bool | None: res = elem.find(f'{ns}assignNullValuesIfNoRecordsFound') if res is None: @@ -537,7 +1010,7 @@ def is_output(elem: ET.Element) -> bool: """ -def _process_assignment_item(elem: ET.Element) -> (str, {str: str}): +def _process_assignment_item(elem: CP.ET.Element) -> tuple[str, dict[str, str]] | None: """Returns assignment item dict from assignment element Args: @@ -564,6 +1037,7 @@ def _process_assignment_item(elem: ET.Element) -> (str, {str: str}): for child in elem: if child.tag == f'{ns}assignToReference': entry['influenced_var'] = child.text + # noinspection PyUnresolvedReferences entry['line_no'] = child.sourceline if child.tag == f'{ns}operator': @@ -582,7 +1056,7 @@ def _process_assignment_item(elem: ET.Element) -> (str, {str: str}): return None -def _get_value(el: ET.Element) -> str: +def _get_value(el: ET.Element) -> str | None: for child in el: if get_tag(child) == 'elementReference': return child.text @@ -591,7 +1065,7 @@ def 
_get_value(el: ET.Element) -> str: return None -def get_subflow_output_map(subflow: ET.Element): +def get_subflow_output_map(subflow: ET.Element) -> tuple[bool, dict[str,str]]: """returns a tuple (bool:, map: child name --> parent name) where the first return value is true if outputs are automatically assigned in which case they are flow_name.flow_var @@ -614,7 +1088,7 @@ def get_subflow_output_map(subflow: ET.Element): return auto, mappings -def get_subflow_input_map(subflow: ET.Element) -> {str: str}: +def get_subflow_input_map(subflow: ET.Element) -> dict[str, str]: """Returns a map from caller variable to variable in called flow E.g. in this example:: @@ -640,10 +1114,126 @@ def get_subflow_input_map(subflow: ET.Element) -> {str: str}: accum = dict() inputs = get_by_tag(subflow, "inputAssignments") for assignment in inputs: - val = get_by_tag(assignment, 'name')[0].text + val_els = get_by_tag(assignment, 'name') + if len(val_els) != 1: + continue + else: + val = val_els[0].text key_refs = assignment.findall(f'{ns}value[1]/{ns}elementReference[1]') if key_refs is None or len(key_refs) == 0: continue key = key_refs[0].text accum[key] = val return accum + +def _get_tags(root: ET.Element, tags: list[str]) -> list[str]: + accum = [] + for tag in tags: + res = root.findall(f'.//{ns}{tag}') + for res in res: + if res.text is not None and res.text.strip() != '': + accum.append(res.text.strip()) + return accum + +def get_all_flow_refs(root: ET.Element) -> list[str]: + accum = _get_tags(root, tags=DIRECT_REF_HOLDERS) + expressions = _get_tags(root, tags=EXPRESSION_REF_HOLDERS) + for expr in expressions: + accum += parse_expression(expr) + + return list(set(accum)) + + +def rid_item(msg: str) -> str: + return msg.replace('[$EachItem]', '') + +def recursive_parse(my_obj, parse_callable=parse_expression, accum=None) -> None: + """walks through json objs and applies the parse_callable to values + + Args: + my_obj (obj): JSON object + parse_callable (Callable): callable to parse strings + accum (list[str]): list of strings that values are added to + + Returns: + None (accum is changed in place) + + """ + if accum is None: + my_accum = [] + else: + my_accum = accum + + if isinstance(my_obj, dict): + for key, value in my_obj.items(): + recursive_parse(value, parse_callable=parse_callable, accum=my_accum) # Recurse into nested dictionaries + + elif isinstance(my_obj, list): + for item in my_obj: + recursive_parse(item, parse_callable=parse_callable, accum=my_accum) # Recurse into list elements + + elif isinstance(my_obj, str): + to_append = parse_callable(my_obj) + if to_append is not None and len(to_append) > 0: + [my_accum.append(x) for x in to_append] + + return None + +def quick_validate(flow_path: str) -> bool: + has_start = False + has_banned = False + try: + with open(flow_path, 'r') as fp: + flow_data = fp.read() + for start_tag in START_ELEMS_TAGGED: + if start_tag in flow_data: + has_start = True + break + + for banned_tag in BANNED_ELEMS_TAGGED: + if banned_tag in flow_data: + has_banned = True + break + + return has_start and not has_banned + + except: + logger.critical(f"exception when attempting to quick_validate flow {flow_path}" + f"{traceback.format_exc()}") + return False + +def validate_flow(flow_path: str) -> bool: + """There are many legacy versions of flows that contain grammars we cannot parse. + This tool only processes modern flows that can be built in flow builder. + + Args: + flow_path (str): path of flow + + Returns: + True if the flow is valid, False otherwise + + """ + # 1. 
Flows must be parseable + # 2. Flows must have a start element + # 3. Flows must not contain unsupported legacy tags corresponding to older grammars + try: + root = CP.get_root(flow_path) + starts = get_by_tag(root, 'start') + + if len(starts) != 1: + start_ref = get_by_tag(root, 'startElementReference') + if len(start_ref) != 1: + print(f"flow {flow_path} has no start element. Skipping..") + return False + + for x in BANNED_ELEMS: + if len(get_by_tag(root, x)) > 0: + print(f"flow {flow_path} contains the legacy {x} element which is unsupported. Skipping..") + return False + return True + + except Exception: + print(f"Could not parse flow {flow_path}. Skipping..") + return False + + diff --git a/packages/code-analyzer-flow-engine/FlowScanner/queries/debug_query.py b/packages/code-analyzer-flow-engine/FlowScanner/queries/debug_query.py new file mode 100644 index 00000000..a887e7e9 --- /dev/null +++ b/packages/code-analyzer-flow-engine/FlowScanner/queries/debug_query.py @@ -0,0 +1,69 @@ +"""Queries used for debugging only + + +""" +from __future__ import annotations + +import logging +import re +from typing import TypeAlias +import json + +import public +from flow_scanner import control_flow +from flow_scanner.control_flow import Crawler +from public import parse_utils +from public.contracts import (AbstractQuery, QueryAction, QueryDescription, + QueryResult, State, AbstractCrawler, FlowParser, LexicalQuery, Query) +from public.data_obj import CrawlStep, InfluenceStatement, InfluencePath +from public.enums import Severity, ConnType, TriggerType, FlowType + +El: TypeAlias = parse_utils.CP.ET.Element + +logger = logging.getLogger(__name__) + +DEFAULT_HELP_URL = ("https://developer.salesforce.com/docs/atlas.en-us.secure_coding_guide.meta" + "/secure_coding_guide/secure_coding_considerations_flow_design.htm") + + +ns = parse_utils.ns + +RESOURCE_TAGS = parse_utils.RESOURCE_TAGS + +# Add query id to class name map. 
Add here to register +# the query and make it available for use via CLI +# The key is the query id as used in CLI, and the value is the class name +QUERIES = { + "Detect": "Flow Detected" +} + + +class Detect(AbstractQuery): + + def __init__(self, msg: str|None = None): + try: + conf = json.loads(msg) + self.conf = msg + except: + self.conf = None + self.query_id = 'Detect' + self.query_name = QUERIES[self.query_id] + + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + query_description="Flow detected from one named element to another", + severity=Severity.Flow_Low_Severity, + is_security=False + ) + + + def when_to_run(self) -> QueryAction: + return QueryAction.process_elem + + def execute(self) -> list[QueryResult] | None: + if self.conf is None: + return None + pass diff --git a/packages/code-analyzer-flow-engine/FlowScanner/queries/default_query.py b/packages/code-analyzer-flow-engine/FlowScanner/queries/default_query.py index 6434faba..0cc638cd 100644 --- a/packages/code-analyzer-flow-engine/FlowScanner/queries/default_query.py +++ b/packages/code-analyzer-flow-engine/FlowScanner/queries/default_query.py @@ -8,17 +8,17 @@ from typing import TYPE_CHECKING import logging -from flowtest.flow_result import DEFAULT_HELP_URL +from flow_scanner.flow_result import DEFAULT_HELP_URL if TYPE_CHECKING: import xml.etree.ElementTree as ET from public import parse_utils -from public.data_obj import DataInfluenceStatement, QueryResult +from public.data_obj import InfluenceStatement, QueryResult from public.data_obj import QueryDescription, Preset -from public.enums import Severity -from public.contracts import QueryProcessor, FlowParser, State +from public.enums import Severity, FlowType +from public.contracts import QueryProcessor, FlowParser, State, AbstractCrawler logger = logging.getLogger(__name__) @@ -62,6 +62,7 @@ QUERY_IDS = [] + def build_preset(preset_name: str = DEFAULT_PRESET): if preset_name is None: preset_name = DEFAULT_PRESET @@ -90,29 +91,31 @@ class DefaultQueryProcessor(QueryProcessor): """ def __init__(self) -> None: + + #: flow (xml) root + self.root: ET.Element + #: preset selected by user self.preset: Preset | None = None #: taint sources are populated on flow enter - self.sources: {(str, str)} = set() + self.sources: set[tuple[str, str]] = set() #: flow parser self.parser: FlowParser | None = None - #: flow (xml) root - self.root: ET.Element - #: path of flow - self.flow_paths: [str] = None + self.flow_paths: list[str] | None = None + - def set_preset_name(self, preset_name: str | None) -> Preset | None: + def set_preset(self, preset_name: str | None) -> Preset | None: self.preset = build_preset(preset_name) return self.preset - def handle_crawl_element(self, state: State) -> list[QueryResult] | None: + def handle_crawl_element(self, state: State, crawler: AbstractCrawler=None) -> list[QueryResult] | None: return self.process_element(state.get_current_elem(), state) - def handle_flow_enter(self, state: State) -> list[QueryResult] | None: + def handle_flow_enter(self, state: State, crawler: AbstractCrawler=None) -> list[QueryResult] | None: # set current parser parser = state.get_parser() flow_path = parser.get_filename() @@ -131,7 +134,7 @@ def handle_flow_enter(self, state: State) -> list[QueryResult] | None: # in which case we may want to return a result return None - def handle_final(self, all_states: (State,)) -> list[QueryResult] | None: + def handle_final(self, all_states: 
tuple[State]) -> list[QueryResult] | None: """Entry point for running queries after all scans are complete Args: @@ -168,14 +171,10 @@ def process_element(self, elem: ET.Element, state: State) -> list[QueryResult] | if elem_type in ["recordUpdates", "recordLookups", "recordCreates", "recordDeletes"]: # Look for filter selection criteria (influences *which records* are returned) - filter_elems = parse_utils.get_filters(elem) - if filter_elems is not None and len(filter_elems) > 0: - filter_influencers = parse_utils.get_sinks_from_field_values(filter_elems) + filter_influencers = parse_utils.get_field_op_values_from_elem(elem, 'filters') # Look for input assignment which influences *what values* are updated or created - input_assignment_elems = parse_utils.get_input_assignments(elem) - if input_assignment_elems is not None and len(input_assignment_elems) > 0: - input_influencers = parse_utils.get_sinks_from_field_values(input_assignment_elems) + input_influencers = parse_utils.get_field_op_values_from_elem(elem, 'inputAssignments') # Look for bulk operators: bulk_ref = parse_utils.get_by_tag(elem, 'inputReference') @@ -188,9 +187,9 @@ def process_element(self, elem: ET.Element, state: State) -> list[QueryResult] | elem_name = state.get_current_elem_name() if elem_type in ['recordLookups', 'recordDeletes']: - filter_influencers.append((elem_name, bulk_var)) + filter_influencers.append((elem_name, None, bulk_var)) else: - input_influencers.append((elem_name, bulk_var)) + input_influencers.append((elem_name, None, bulk_var)) res = self.process_influencers(state, elem, filter_influencers, input_influencers, elem_type, parser) @@ -201,10 +200,14 @@ def process_element(self, elem: ET.Element, state: State) -> list[QueryResult] | assert x.paths is not None return res + # fall through + return None + def process_influencers(self, state: State, current_elem: ET.Element, - filter_influencers: [str], input_influencers: [str], + filter_influencers: list[tuple[str, str | None, str]], + input_influencers: list[tuple[str, str | None, str]], elem_type: str, - parser: FlowParser) -> [QueryResult]: + parser: FlowParser) -> list[QueryResult] | None: """Given a list of variables that flow into sinks, search if these are tainted, and if so, add the tainted flow to the result object. 
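        Each entry of filter_influencers and input_influencers is a
        (field_name, operator, influencer_name) triple as produced by
        parse_utils.get_field_op_values_from_elem, e.g. a filter on field 'Name'
        with operator 'Contains' and elementReference 'var3' arrives here as
        ('Name', 'Contains', 'var3'); the operator slot may be None (for example
        for input assignments and bulk references). (Values shown are illustrative.)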
@@ -227,6 +230,7 @@ def process_influencers(self, state: State, current_elem: ET.Element, to_return = [] flow_path = parser.get_filename() run_mode = parser.get_effective_run_mode() + flow_type = parser.get_flow_type() for x in filter_influencers + input_influencers: if x in filter_influencers: @@ -240,7 +244,7 @@ def process_influencers(self, state: State, current_elem: ET.Element, if query_id is None: continue - a_field, influencer_var = x + a_field, op, influencer_var = x # surgery that deals with string or dataInfluencePaths happens in get_tainted_flows() tainted_flows = state.get_flows_from_sources(influenced_var=influencer_var, source_vars=self.sources) @@ -255,17 +259,18 @@ def process_influencers(self, state: State, current_elem: ET.Element, curr_name = parse_utils.get_name(current_elem) # SystemModeWithoutSharing User Influenced Record Update - sink_stmt = DataInfluenceStatement(a_field, influencer_var, curr_name, - comment=f"flow into {elem_type} via influence over {a_field}" + sink_stmt = InfluenceStatement(a_field, influencer_var, curr_name, + comment=f"flow into {elem_type} via influence over {a_field}" f" in run mode {run_mode.name}", - line_no=current_elem.sourceline, - source_text=parse_utils.ET.tostring(current_elem, encoding='unicode'), - flow_path=flow_path, - source_path=flow_path - ) + line_no=current_elem.sourceline, + source_text=parse_utils.get_elem_string(current_elem), + flow_path=flow_path, + source_path=flow_path + ) to_return.append(QueryResult(query_id=query_id, + flow_type=flow_type, influence_statement=sink_stmt, - paths=tainted_flows)) + paths=frozenset(tainted_flows))) msg = ("***Security Finding**" f"in Flow Element {curr_name} of type {elem_type}" diff --git a/packages/code-analyzer-flow-engine/FlowScanner/queries/optional_query.py b/packages/code-analyzer-flow-engine/FlowScanner/queries/optional_query.py new file mode 100644 index 00000000..29e33852 --- /dev/null +++ b/packages/code-analyzer-flow-engine/FlowScanner/queries/optional_query.py @@ -0,0 +1,1055 @@ +"""Queries requested by Engineering + + BETA - Under testing + +""" +from __future__ import annotations + +import logging +import re +import traceback +from typing import TypeAlias + +import public +from flow_scanner import control_flow +from flow_scanner.control_flow import Crawler +from public import parse_utils +from public.contracts import (AbstractQuery, QueryAction, QueryDescription, + QueryResult, State, AbstractCrawler, FlowParser, LexicalQuery, Query) +from public.data_obj import CrawlStep, InfluenceStatement, InfluencePath +from public.enums import Severity, ConnType, TriggerType, FlowType + +El: TypeAlias = parse_utils.CP.ET.Element + +logger = logging.getLogger(__name__) + +DEFAULT_HELP_URL = ("https://developer.salesforce.com/docs/atlas.en-us.secure_coding_guide.meta" + "/secure_coding_guide/secure_coding_considerations_flow_design.htm") + +id_pattern = re.compile(r"^(?=.*[0-9][0-9])[a-zA-Z0-9]+[a-zA-Z0-9]{15}(?:[0-5][A-Z0-5]{3})?$") +copy_name_pattern = re.compile(r"^Copy_\d+_of_[a-zA-Z]+") +copy_label_pattern = re.compile(r"^Copy \d+ of [a-zA-Z]+") + +ns = parse_utils.ns + +RESOURCE_TAGS = parse_utils.RESOURCE_TAGS + +# Add query id to class name map. 
Add here to register +# the query and make it available for use via CLI +# The key is the query id as used in CLI, and the value is the class name +QUERIES = { + "DbInLoop": "Database Operation In Loop", + "HardcodedId": "Hardcoded Id", + "MissingFaultHandler": "Missing Fault Handler", + "SameRecordUpdate": "Same Record Update In Trigger", + "TriggerEntryCriteria": "Record Trigger With No Entry Criteria", + "DefaultCopy": "Default Copy Label", + "UnusedResource": "Unused Resource", + "UnreachableElement": "Element is Unreachable", + "MissingNextValueConnector": "Loop Element Without nextValueConnector", + "CyclicSubflow": "Chain of subflow calls forms a cycle", + "TriggerWaitEvent": "Wait Event in Trigger", + "TriggerCallout": "Trigger Flow Callout in Synchronous Path", + "MissingDescription": "Missing Description" +} + +""" + TODO: consider these as well + "DanglingConnector": "Connector does not point to a traversable flow element", + "DanglingSubflow": ("Subflow target cannot be reached in code under scan + (Applies only for targets without a namespace)."), + "MissingStart": "The flow has no start element or startElementReference", + "UninitializedVariable": "Variable Used Prior To Initialization", + "NullValueError": "Potentially Null Value Used Unsafely" + } +""" + +class DbInLoop(AbstractQuery): + + query_id = "DbInLoop" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + query_description="This rule detects when there are CRUD flow elements within a loop (RecordLookups, RecordCreates, RecordUpdates, RecordDeletes). This rule does not trigger if the CRUD element is in a fault handler. These DB operations should be bulkified by using collections and the IN condition. 
This rule does not follow subflows.", + query_version="1.0", + severity=public.enums.Severity.Flow_Moderate_Severity, + help_url=DEFAULT_HELP_URL, + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.flow_enter + + def convert_results(self, results: list[tuple[CrawlStep, str, str, int]], parser: FlowParser) -> list[QueryResult]: + q_results = [] + for step, flow_path, source_text, line_no in results: + loop_name = step.visitor.loop_context[-1][0] + loop_elem = parser.get_by_name(loop_name) + loop_code = parse_utils.get_elem_string(loop_elem) + loop_line_no = parse_utils.get_line_no(loop_elem) + + step_elem = parser.get_by_name(step.element_name) + step_line_no = parse_utils.get_line_no(step_elem) + step_code = parse_utils.get_elem_string(step_elem) + + stmt = InfluenceStatement( + influenced_var=step.element_name, + influencer_var=loop_name, + element_name=step.element_name, + comment=f"Database Operation {step.element_tag} performed within" + f" the Loop {loop_name}", + flow_path=flow_path, + line_no=step_line_no, + source_text=step_code, + source_path=flow_path, + ) + + qr = QueryResult( + query_id=self.query_id, + flow_type=parser.get_flow_type(), + influence_statement=stmt, + elem_code=loop_code, + elem_line_no=loop_line_no, + elem_name=loop_name, + filename=flow_path, + paths=None # only print from source to sink + ) + q_results.append(qr) + return q_results + + def execute(self, + state: State = None, # the state has the flow_path variable + crawler: AbstractCrawler = None, + all_states=None, + ) -> list[QueryResult] | None: + crawl_schedule = crawler.get_crawl_schedule() + parser = state.get_parser() + flow_path = parser.get_filename() + results = [] + db_tags = ["recordLookups", "recordCreates", "recordUpdates", "recordDeletes"] + for step in crawl_schedule: + if step.element_tag in db_tags and step.visitor is not None: + visitor = step.visitor + if (visitor.loop_context is not None and len(visitor.loop_context) > 0 + and visitor.loop_context[-1][1] is ConnType.Loop): + elem = parser.get_by_name(step.element_name) + line_no = parse_utils.get_line_no(elem) + source_text = parse_utils.get_elem_string(elem) + results.append((step, flow_path, source_text, line_no)) + + return self.convert_results(results, parser) + + +class HardcodedId(LexicalQuery): + + query_id = "HardcodedId" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + query_description="This rule detects hardcoded IDs within a flow. 
Hardcoded Ids are a bad practice, and such flows are not appropriate for distribution.", + query_version="1.0", + severity=public.enums.Severity.Flow_Low_Severity, + help_url=DEFAULT_HELP_URL, + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + if parser is None: + return None + else: + results = [] + root = parser.get_root() + top_level = [x for x in root if parse_utils.get_tag(x) != "processMetadataValues"] + for top in top_level: + for el in top.iter(tag=f"{ns}stringValue"): + msg = el.text + if msg is not None: + res = is_valid_salesforce_id(msg) + if res and (top, el) not in results: + results.append((top, el)) + + return self.report(results, parser) + + def report(self, results: list[tuple], parser) -> list[QueryResult] | None: + accum = [] + for top, el in results: + el_name = parse_utils.get_name(top) + if el_name is None: + el_name = parse_utils.get_tag(top) + accum.append( + QueryResult( + query_id=self.query_id, + flow_type=parser.get_flow_type(), + influence_statement=None, + paths=None, + elem_code=parse_utils.get_elem_string(el), + elem_line_no=parse_utils.get_line_no(el), + elem_name=el_name, + filename=parser.get_filename(), + field=None + ) + ) + if len(accum) == 0: + return None + else: + return accum + + +class MissingFaultHandler(LexicalQuery): + + query_id = "MissingFaultHandler" + query_name = QUERIES[query_id] + fault_tags = ['recordCreates', 'recordUpdates', 'recordDeletes', + 'actionCalls', 'subflows'] + + def __init__(self): + self.root = None + + def get_query_description(self) -> QueryDescription: + return QueryDescription(query_id=self.query_id, + query_name=self.query_name, + severity=public.enums.Severity.Flow_Low_Severity, + help_url=DEFAULT_HELP_URL, + is_security=False, + query_description=("This rule detects when elements that can fire fault events are missing fault handlers. 
The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.") + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + accum = [] + if self.root is None: + root = parser.get_root() + self.root = root + else: + root = self.root + + for tag_ in self.fault_tags: + elems = parse_utils.get_by_tag(root, tag_) + for el in elems: + fc = parse_utils.get_by_tag(el, 'faultConnector') + if len(fc) == 0: + accum.append(QueryResult( + query_id=self.query_id, + flow_type=parser.get_flow_type(), + influence_statement=None, + paths=None, + elem_code=parse_utils.get_elem_string(el), + elem_line_no=parse_utils.get_line_no(el), + elem_name=parse_utils.get_name(el), + filename=parser.get_filename(), + field=None) + ) + if len(accum) == 0: + return None + else: + return accum + +class SameRecordUpdate(Query): + + query_id = "SameRecordUpdate" + query_name = QUERIES[query_id] + + def __init__(self): + self.should_check: bool = True + self.should_scan: bool = False + self.flow_path: str | None = None + self.source_vars: set[tuple[str, str]] | None = None + + self.trig_object = None + + def populate_trig_object(self, parser: FlowParser) -> None: + self.trig_object = parser.get_trigger_object() + if self.trig_object is None: + self.should_scan = False + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=public.enums.Severity.Flow_Moderate_Severity, + help_url=DEFAULT_HELP_URL, + is_security=False, + query_description=("This rule detects when an AfterSave record trigger modifies the same record. Record modifications should be done in BeforeSave triggers, not AfterSave triggers. 
This rule follows subflows, so it will detect if the RecordId is passed to a child flow which then modifies a record with that id.") + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.process_elem + + def execute(self, + state: State = None, + crawler: AbstractCrawler = None, + all_states=None) -> list[QueryResult] | None: + + parser = state.get_parser() + + if self.should_check: + should_scan = parser.get_trigger_type() is TriggerType.RecordAfterSave + self.should_check = False + if not should_scan: + self.should_scan = False + self.should_check = False + return None + else: + self.flow_path = state.get_parser().get_filename() + self.source_vars = {(self.flow_path, "$Record")} + self.should_scan = True + self.populate_trig_object(parser) + + if not self.should_scan: + return None + + if not self.should_scan: + return None + + elem = state.get_current_elem() + if parse_utils.get_tag(elem) != 'recordUpdates': + return None + + obj_names = parse_utils.get_by_tag(elem, 'object') + if len(obj_names) > 0: + # if this is an input reference, the obj name is not provided + if obj_names[0].text != self.trig_object: + # don't bother if we are updating some other object + return None + + elem_name = parse_utils.get_name(elem) + + # Look for filter selectors (influences *which records* are returned) + filter_influencers = parse_utils.get_field_op_values_from_elem(elem=elem, tag='filters') + for (fld, op, id_value) in filter_influencers: + if fld.lower() == "id" and op.lower() == 'equalto': + tainted_flows = state.get_flows_from_sources(influenced_var=id_value, + source_vars=self.source_vars, + restrict="Id") + if tainted_flows is not None: + return self.filter_flows(tainted_flows, elem=elem, influenced="id", parser=state.get_parser()) + + # Look for object selectors + bulk_ref = parse_utils.get_by_tag(elem, 'inputReference') + if len(bulk_ref) == 1: + bulk_el = bulk_ref[0] + bulk_var = bulk_el.text + + # We only care about what determines the id, not the rest of the fields + target_var = f"{bulk_var}.Id" + tainted_flows = state.get_flows_from_sources(influenced_var=target_var, + source_vars=self.source_vars, + restrict="Id") + if tainted_flows is not None: + return self.filter_flows(tainted_flows, elem=elem, influenced=elem_name, parser=parser) + + return None + + def filter_flows(self, tainted_flows: set[InfluencePath], elem: El, + influenced: str, parser: FlowParser) -> list[QueryResult] | None: + + if len(tainted_flows) == 0: + return None + + else: + elem_name = parse_utils.get_name(elem) + elem_line_no = parse_utils.get_line_no(elem) + qr = QueryResult( + query_id=self.query_id, + flow_type=parser.get_flow_type(), + influence_statement=InfluenceStatement( + influenced_var=elem_name, + influencer_var=influenced, + element_name=elem_name, + comment="record value flows into record update", + flow_path=self.flow_path, + line_no=elem_line_no, + source_text=parse_utils.get_elem_string(elem), + source_path=self.flow_path + ), + paths=frozenset(tainted_flows) + ) + return [qr] + + +class TriggerEntryCriteria(LexicalQuery): + + query_id = "TriggerEntryCriteria" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Moderate_Severity, + query_description="This rule detects when record trigger flows are missing entry criteria. 
All record trigger flows should have entry criteria specified in the flow trigger definition rather than solely in the flow's own business logic.", + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + root = parser.get_root() + starts = parse_utils.get_by_tag(root, tagname='start') + if len(starts) != 1: + # The flow could have a startElementReference + logger.debug(f"could not find start element in flow {parser.get_filename()}") + return None + start = starts[0] + trigger_type_els = parse_utils.get_by_tag(start, tagname='recordTriggerType') + + if len(trigger_type_els) != 1: + return None + else: + filter_formula = parse_utils.get_by_tag(elem=start, tagname='filterFormula') + filters = parse_utils.get_by_tag(elem=start, tagname='filters') + + if len(filters) == 0 and len(filter_formula) == 0: + + return [QueryResult( + query_id=self.query_id, + flow_type=parser.get_flow_type(), + elem_name='start', + elem_line_no=parse_utils.get_line_no(start), + elem_code=parse_utils.get_elem_string(start), + filename=parser.get_filename() + )] + else: + return None + + +class DefaultCopy(LexicalQuery): + + query_id = "DefaultCopy" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Low_Severity, + query_description=("This rule detects default names and labels that were auto assigned to elements pasted in the flow builder UI. These labels and names should be changed to make the flow comprehensible to maintainers."), + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + els = parser.get_all_named_elems() + if len(els) == 0: + return None + else: + accum = [] + for el in els: + name = parse_utils.get_name(el) + if is_copy_name(name): + accum.append((name, name, el)) + continue + + labels = parse_utils.get_by_tag(el, tagname='label') + if len(labels) > 0: + for label in labels: + label_text = label.text + if label_text is not None and is_copy_label(label_text): + accum.append((name, label_text, el)) + break + + to_return = [] + flow_type = parser.get_flow_type() + for (res_name, res_var, res_el) in accum: + to_return.append(QueryResult( + query_id=self.query_id, + flow_type=flow_type, + elem_code=parse_utils.get_elem_string(res_el), + elem_name=res_name, + elem_line_no=parse_utils.get_line_no(res_el), + field=res_var, + filename=parser.get_filename()) + ) + + return to_return + +class UnusedResource(LexicalQuery): + + query_id = "UnusedResource" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Low_Severity, + query_description="This rule detects redundant variables that are not used in the flow. 
This can be a sign of developer error.", + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + root = parser.get_root() + filename = parser.get_filename() + flow_type = parser.get_flow_type() + + resource_t = [] # [(name, tag, el)] + for tag_ in RESOURCE_TAGS: + resource_elems = root.findall(f".//{ns}{tag_}") + for elem in resource_elems: + resource_t.append((parse_utils.get_name(elem), tag_, elem)) + + found = [] + to_match = parse_utils.get_all_flow_refs(root) + for (name, tag, elem) in resource_t: + for txt in to_match: + if name in txt.split("."): + found.append((name, tag, elem)) + break + result = [x for x in resource_t if x not in found] + if len(result) == 0: + return None + else: + accum = [(x[0], x[1], parse_utils.get_elem_string(x[2]), parse_utils.get_line_no(x[2])) for x in result] + # noinspection PyTypeChecker + return self.report(accum=accum, filename=filename, flow_type=flow_type) + + + def report(self, accum: list[tuple[str, str, str, str]], filename: str, flow_type: FlowType) -> list[QueryResult] | None: + if len(accum) == 0: + return None + else: + to_return = [] + for (name, tag, code, line) in accum: + to_return.append( + QueryResult( + query_id=self.query_id, + flow_type=flow_type, + elem_code=code, + elem_name=name, + elem_line_no=int(line), + field=name, + filename=filename + ) + ) + + return to_return + + +class MissingNextValueConnector(LexicalQuery): + + query_id = "MissingNextValueConnector" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Moderate_Severity, + query_description=("This rule detects Loops without nextValue connectors. 
Loops should always have nextValue connectors, and lack of one usually signifies developer error when connecting the loop element to other elements."), + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + accum = [] + root = parser.get_root() + flow_type = parser.get_flow_type() + loops = parse_utils.get_by_tag(root, tagname='loops') + filename = parser.get_filename() + + for loop in loops: + res = loop.findall(f".//{ns}nextValueConnector") + if len(res) == 0: + name = parse_utils.get_name(loop) + accum.append(QueryResult( + query_id=self.query_id, + flow_type=flow_type, + elem_code=parse_utils.get_elem_string(loop), + elem_name=name, + elem_line_no=parse_utils.get_line_no(loop), + field=name, + filename=filename + ) + ) + + if len(accum) == 0: + return None + else: + return accum + + +class CyclicSubflow(LexicalQuery): + + query_id = "CyclicSubflow" + query_name = QUERIES[query_id] + + @classmethod + def accept(cls, **kwargs) -> list[QueryResult] | None: + + # matching_frame = kwargs["matching_frame"] + # all_frames = kwargs["all_frames"] + # flow_path = kwargs["next_flow_path"] + current_frame = kwargs["current_frame"] + + illegal_subflow = current_frame.state.get_current_elem() + # illegal_target_path = flow_path + code = parse_utils.get_elem_string(illegal_subflow) + line_no = parse_utils.get_line_no(illegal_subflow) + elem_name = current_frame.state.get_current_elem_name() + flow_type = current_frame.state.get_parser().get_flow_type() + + return [QueryResult( + query_id=cls.query_id, + flow_type=flow_type, + elem_code=code, + elem_name=elem_name, + elem_line_no=line_no, + field=elem_name, + filename=current_frame.flow_path + ) + ] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Moderate_Severity, + query_description="This rule detects when a subflow calls a parent flow, creating a cyclic flow. The rule will detect cycles of any depth.", + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + pass + +class UnreachableElement(LexicalQuery): + + query_id = "UnreachableElement" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Moderate_Severity, + query_description=("This rule identifies elements that have not been connected to the start element of the flow. 
Unreachable elements are usually due to incomplete flows or developer error.") + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + crawler = kwargs["crawler"] + cfg = crawler.get_cfg() + # noinspection PyTypeChecker + missing = control_flow.validate_cfg(cfg=cfg, parser=parser, missing_only=True) + + if len(missing) == 0: + return None + + else: + results = [] + filename = parser.get_filename() + flow_type = parser.get_flow_type() + + for (el_name, el_tag) in missing: + elem = parser.get_by_name(el_name) + + if elem is None: + logger.error(f"Unreachable element '{el_name}' not found in flow {filename}") + continue + + code = parse_utils.get_elem_string(elem) + line_no = parse_utils.get_line_no(elem) + results.append(QueryResult( + query_id=self.query_id, + flow_type=flow_type, + elem_code=code, + elem_name=el_name, + elem_line_no=line_no, + field=el_name, + filename=filename + ) + ) + return results + +class MissingDescription(LexicalQuery): + query_id = "MissingDescription" + query_name = QUERIES[query_id] + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Low_Severity, + query_description=("This rule detects elements that contain labels but are missing descriptions. All elements with labels should have accompanying descriptions to make the flow comprehensible to future maintainers."), + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + all_named = list(parser.get_all_named_elems()) + + accum = [] + for el in all_named: + desc = parse_utils.get_by_tag(el,'description') + if len(desc) == 0: + accum.append(el) + + # the root element itself should also have a description + flow_desc_missing = len(parse_utils.get_by_tag(parser.get_root(), 'description')) == 0 + + if len(accum) == 0 and flow_desc_missing is False: + return None + else: + return self.report(parser=parser, results=accum, flow_desc_missing=flow_desc_missing) + + def report(self, parser: FlowParser, results: list[El], flow_desc_missing: bool=False) -> list[QueryResult] | None: + accum = [] + filename = parser.get_filename() + # We don't want to print out the whole flow + FLOW_CODE='' + FLOW_LINE = 2 + flow_type = parser.get_flow_type() + + if flow_desc_missing: + accum.append(QueryResult( + query_id=self.query_id, + flow_type=parser.get_flow_type(), + elem_code=FLOW_CODE, + elem_name='Flow', + elem_line_no=FLOW_LINE, + field="Flow", + filename=filename + ) + ) + + for res in results: + name = parse_utils.get_name(res) + accum.append(QueryResult( + query_id=self.query_id, + flow_type=flow_type, + elem_code=parse_utils.get_elem_string(res), + elem_line_no=parse_utils.get_line_no(res), + elem_name=name, + field = name, + filename=filename + ) + ) + return accum + +class TriggerWaitEvent(LexicalQuery): + """ + This will detect if a trigger flow calls a subflow that has a wait event, + which the current flow builder does not catch. + + It is simple here because we + follow all subflows and the life of the query instance is preserved across + following subflows. + + This is why we need the __init__ semaphores. The same + approach is used in other queries. 
+ """ + query_id = "TriggerWaitEvent" + query_name = QUERIES[query_id] + + def __init__(self): + self.should_scan: bool = False + self.should_check: bool = True + self.start_elems: list[El] | None = None + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_High_Severity, + query_description=("This rule detects when a wait event is reached during trigger execution. Triggers must be performant and cannot contain wait events. For async processing, use scheduled paths within your trigger and async callouts, not wait events. This rule follows subflows."), + is_security=False + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + if self.should_check: + self.should_check = False + + if parser.get_flow_type() is FlowType.Trigger: + self.should_scan = True + else: + self.should_scan = False + return None + + elif not self.should_scan: + return None + + # We are in a new subflow, so add this start elem + if self.start_elems is None: + self.start_elems = [] + + self.start_elems.append(parser.get_start_elem()) + waits = parse_utils.get_by_tag(parser.get_root(), 'waits') + if len(waits) == 0: + return None + else: + accum = [] + filename = parser.get_filename() + flow_type = parser.get_flow_type() + + for wait in waits: + name = parse_utils.get_name(wait) + accum.append(QueryResult( + query_id=self.query_id, + flow_type=flow_type, + elem_code=parse_utils.get_elem_string(wait), + elem_name=name, + elem_line_no=parse_utils.get_line_no(wait), + field=name, + filename=filename + ) + ) + return accum + +class TriggerCallout(LexicalQuery): + query_id = "TriggerCallout" + query_name = QUERIES[query_id] + + def __init__(self): + self.should_scan: bool = False + self.should_check: bool = True + self.top_flow_path: str | None = None + self.called_names: list[str] | None = None + + #: element name corresponding to direct path from start + self.conn_target_name: str| None = None + + + def get_query_description(self) -> QueryDescription: + return QueryDescription( + query_id=self.query_id, + query_name=self.query_name, + severity=Severity.Flow_Moderate_Severity, + query_description=("This rule detects when a trigger performs a callout on the synchronous path. Triggers must be performant and may only contain callouts on async scheduled paths. 
This rule follows subflows.") + + ) + + def when_to_run(self) -> QueryAction: + return QueryAction.lexical + + def execute(self, parser: FlowParser = None, **kwargs) -> list[QueryResult] | None: + if self.should_check: + if parser.get_flow_type() is FlowType.Trigger: + try: + start_el = parser.get_start_elem() + conn = parse_utils.get_by_tag(start_el, 'connector')[0] + conn_target = parse_utils.get_text_of_tag(conn, 'targetReference') + except: + logger.debug(f"exception thrown when searching for start connector target in {self.top_flow_path}" + f"\n {traceback.format_exc()}") + self.should_scan = False + self.should_check = False + return None + + if conn_target is None: + self.should_scan = False + self.should_check = False + return None + else: + self.should_scan = True + self.should_check = False + self.top_flow_path = parser.get_filename() + self.conn_target_name = conn_target + self.called_names = parser.get_traversable_descendents_of_elem(conn_target) + + else: + self.should_scan = False + self.should_check = False + return None + elif not self.should_scan: + return None + + # fall through + action_calls = parser.get_action_call_map() + if action_calls is None: + return None + + callouts = dict.get(action_calls, 'externalService', None) + if not callouts: + return None + callout_names = [x[0] for x in callouts] + + crawler = kwargs.get("crawler") + + results = search_for_sync_jumps( + parser=parser, + called_names=self.called_names, + current_crawler=crawler, + prev_crawlers=crawler.get_crawler_history_unsafe(), + target_el_names=callout_names, + conn_target_name=self.conn_target_name, + current_filename=parser.get_filename(), + top_filename=self.top_flow_path) + + if len(results) == 0: + return None + else: + accum = [] + for result in results: + elem = parser.get_by_name(name_to_match=result) + + accum.append(QueryResult( + query_id=self.query_id, + flow_type=FlowType.Trigger, + elem_code=parse_utils.get_elem_string(elem), + elem_name=result, + elem_line_no=parse_utils.get_line_no(elem), + field=result, + filename=parser.get_filename() + ) + ) + + return accum + + +def search_for_sync_jumps( + parser: FlowParser, + called_names: list[str], + current_crawler: Crawler, + prev_crawlers: list[Crawler] | None, + target_el_names: list[str], + conn_target_name: str, + current_filename: str, + top_filename: str) -> list[str]: + """ + target_el_names = names of http callouts that should not be called from + the conn_target_name + + Returns: + list of target_el_names that are running in the synchronous path + """ + + results = [] + + if top_filename == current_filename: + + for tgt_name in target_el_names: + if tgt_name in called_names: + results.append(tgt_name) + + else: + # we are in a subflow so we need to find the first top level crawler + # that is connected to this frame. 
+ if prev_crawlers is None: + # This means the executor didn't set the parent crawler + logger.critical(f"could not link back to {top_filename} from {current_filename}") + return results + + else: + crawler_to_check, index = next((c for c in prev_crawlers if c[0].get_flow_path() == top_filename), None) + # The crawler was set, but incorrectly + if crawler_to_check is None: + logger.critical(f"could not link back to {top_filename} from {current_filename}") + return results + + step_to_check = crawler_to_check.get_current_step_index() - 1 + + subflow_name = crawler_to_check.get_crawl_schedule()[step_to_check].element_name + + if subflow_name in called_names: + # all the actions in this subflow are running in the direct path + results = results + target_el_names + + return results + + +def check_in_templates_or_formulas(name_to_check: str, + formula_elems: list[El], + template_elems: list[El]) -> bool: + for el in formula_elems: + if check_in_expression(el, payload_tag='expression', name_to_check=name_to_check): + return True + + for el in template_elems: + if check_in_expression(el, payload_tag='text', name_to_check=name_to_check): + return True + + return False + + +def check_in_expression(expr: El, payload_tag: str, name_to_check: str) -> bool: + payload_els = expr.find(payload_tag) + if len(payload_els) != 1: + logger.critical(f"cannot find expected tag {payload_tag} in expression {parse_utils.get_name(expr)}") + # fail to passing the test + return True + + payload_el = payload_els[0] + expression = payload_el.text + + if expression is None or expression == "": + logger.critical(f"found empty expression in {payload_tag} in expression {parse_utils.get_name(expr)}") + return True + + vars_in_expression = parse_utils.parse_expression(expression) + for var in vars_in_expression: + if _is_in_splits(var=var, name_to_check=name_to_check): + return True + return False + + +def _is_in_splits(var: str, name_to_check: str) -> bool: + splits = var.split('.') + for split in splits: + if name_to_check == split: + return True + return False + + +# 001cU000009Tt2MQAS +def to18(id_str) -> str | None: + if len(id_str) == 15: + # from stackexchange :) + bitstring = 0 + for i in range(0, 15): + if 'A' <= id_str[i] <= 'Z': + bitstring |= 1 << i + + # Take three slices of the bitstring and use them as 5-bit indices into the alphanumeric sequence. + uppers = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ012345' + return id_str + uppers[bitstring & 0x1F] + uppers[bitstring >> 5 & 0x1F] + uppers[bitstring >> 10] + return None + + +def is_valid_salesforce_id(sf_id: str) -> bool: + """ + Checks if a string is a valid 15-character or 18-character Salesforce ID. 
+ """ + if bool(id_pattern.match(sf_id)): + if len(sf_id) == 18: + part = sf_id[:15] + return sf_id == to18(part) + else: + return True + + return False + + +def is_copy_label(label: str) -> bool: + return bool(copy_label_pattern.match(label)) + + +def is_copy_name(name: str) -> bool: + return bool(copy_name_pattern.match(name)) diff --git a/packages/code-analyzer-flow-engine/package.json b/packages/code-analyzer-flow-engine/package.json index 809d1ebc..698af4c0 100644 --- a/packages/code-analyzer-flow-engine/package.json +++ b/packages/code-analyzer-flow-engine/package.json @@ -1,7 +1,7 @@ { "name": "@salesforce/code-analyzer-flow-engine", "description": "Plugin package that adds 'Flow Scanner' as an engine into Salesforce Code Analyzer", - "version": "0.28.0", + "version": "0.29.0-SNAPSHOT", "author": "The Salesforce Code Analyzer Team", "license": "BSD-3-Clause", "homepage": "https://developer.salesforce.com/docs/platform/salesforce-code-analyzer/overview", diff --git a/packages/code-analyzer-flow-engine/src/engine.ts b/packages/code-analyzer-flow-engine/src/engine.ts index 0171316a..b81ae51c 100644 --- a/packages/code-analyzer-flow-engine/src/engine.ts +++ b/packages/code-analyzer-flow-engine/src/engine.ts @@ -12,8 +12,8 @@ import { } from "@salesforce/code-analyzer-engine-api"; import {Clock, RealClock} from '@salesforce/code-analyzer-engine-api/utils'; import {getMessage} from './messages'; -import {FlowNodeDescriptor, FlowScannerCommandWrapper, FlowScannerExecutionResult} from "./python/FlowScannerCommandWrapper"; -import {getConsolidatedRuleByName, getConsolidatedRuleName, getConsolidatedRuleNames} from "./hardcoded-catalog"; +import {FlowNodeDescriptor, FlowScannerCommandWrapper, FlowScannerExecutionResult, FlowScannerRuleResult} from "./python/FlowScannerCommandWrapper"; +import {getDescriptionForRule, getRuleNameFromQueryName, getAllRuleNames, getOptionalQueryIdsForRule} from "./hardcoded-catalog"; /** * An arbitrarily chosen value for how close the engine is to completion before the underlying Flow tool is invoked, @@ -58,8 +58,7 @@ export class FlowScannerEngine extends Engine { return []; } this.emitDescribeRulesProgressEvent(75); - const consolidatedNames: string[] = getConsolidatedRuleNames(); - const convertedRules: RuleDescription[] = consolidatedNames.map(getConsolidatedRuleByName); + const convertedRules: RuleDescription[] = getAllRuleNames().map(getDescriptionForRule); this.emitDescribeRulesProgressEvent(100); return convertedRules; } @@ -81,11 +80,14 @@ export class FlowScannerEngine extends Engine { this.emitRunRulesProgressEvent(normalizeRelativeCompletionPercentage(percentage)); } + const optionalQueryIds: string[] = ruleNames.flatMap(getOptionalQueryIdsForRule); + const executionResults: FlowScannerExecutionResult = await this.commandWrapper.runFlowScannerRules( runOptions.workingFolder, workspaceFlows, targetedFlows, logFile, + optionalQueryIds, percentageUpdateHandler ); const convertedResults: EngineRunResults = toEngineRunResults(executionResults, ruleNames); @@ -139,22 +141,37 @@ function toEngineRunResults(flowScannerExecutionResult: FlowScannerExecutionResu }; for (const queryName of Object.keys(flowScannerExecutionResult.results)) { - const flowScannerRuleResults = flowScannerExecutionResult.results[queryName]; + const flowScannerRuleResults: FlowScannerRuleResult[] = flowScannerExecutionResult.results[queryName]; for (const flowScannerRuleResult of flowScannerRuleResults) { - const ruleName = getConsolidatedRuleName(flowScannerRuleResult.query_name); - // 
Flow runs quickly, and its rule selection is fiddly. So it's easier to just run all the rules, - and then throw away results for rules that the user didn't request. + const ruleName = getRuleNameFromQueryName(flowScannerRuleResult.query_name); + // Since the non-optional queries (designated by the default preset) always run, we need to filter any of their + // results out if their corresponding rule was not selected. if (!requestedRulesSet.has(ruleName)) { continue; } - const flowNodes: FlowNodeDescriptor[] = flowScannerRuleResult.flow; - results.violations.push({ - ruleName, - message: flowScannerRuleResult.description, - codeLocations: toCodeLocationList(flowNodes), - primaryLocationIndex: flowScannerRuleResult.flow.length - 1, - resourceUrls: [] - }); + + const flowNodes: FlowNodeDescriptor[] | undefined = flowScannerRuleResult.flow; + if (flowNodes) { // If flow based violation + results.violations.push({ + ruleName, + message: flowScannerRuleResult.description, + codeLocations: toCodeLocationList(flowNodes), + primaryLocationIndex: flowNodes.length - 1, + resourceUrls: [] + }); + } else { // else if single element based violation + results.violations.push({ + ruleName, + message: flowScannerRuleResult.description, + codeLocations: [{ + file: flowScannerRuleResult.filename!, + startLine: flowScannerRuleResult.elem_line_no!, + startColumn: 1 + }], + primaryLocationIndex: 0, + resourceUrls: [] + }) + } } } return results; diff --git a/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts b/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts index ee3e067f..9ba41436 100644 --- a/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts +++ b/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts @@ -1,57 +1,323 @@ import {COMMON_TAGS, RuleDescription, SeverityLevel} from '@salesforce/code-analyzer-engine-api'; import {getMessage} from './messages'; -const PREVENT_PASSING_USER_DATA_WITHOUT_SHARING = 'PreventPassingUserDataIntoElementWithoutSharing'; -const PREVENT_PASSING_USER_DATA_WITH_SHARING = 'PreventPassingUserDataIntoElementWithSharing'; - -const QUERY_NAMES_TO_CONSOLIDATED_NAMES: Map<string, string> = new Map([ - ['Flow: SystemModeWithoutSharing recordCreates data', PREVENT_PASSING_USER_DATA_WITHOUT_SHARING], - ['Flow: SystemModeWithoutSharing recordDeletes selector', PREVENT_PASSING_USER_DATA_WITHOUT_SHARING], - ['Flow: SystemModeWithoutSharing recordLookups selector', PREVENT_PASSING_USER_DATA_WITHOUT_SHARING], - ['Flow: SystemModeWithoutSharing recordUpdates data', PREVENT_PASSING_USER_DATA_WITHOUT_SHARING], - ['Flow: SystemModeWithoutSharing recordUpdates selector', PREVENT_PASSING_USER_DATA_WITHOUT_SHARING], - ['Flow: SystemModeWithSharing recordCreates data', PREVENT_PASSING_USER_DATA_WITH_SHARING], - ['Flow: SystemModeWithSharing recordDeletes selector', PREVENT_PASSING_USER_DATA_WITH_SHARING], - ['Flow: SystemModeWithSharing recordLookups selector', PREVENT_PASSING_USER_DATA_WITH_SHARING], - ['Flow: SystemModeWithSharing recordUpdates data', PREVENT_PASSING_USER_DATA_WITH_SHARING], - ['Flow: SystemModeWithSharing recordUpdates selector', PREVENT_PASSING_USER_DATA_WITH_SHARING] -]); - -const CONSOLIDATED_RULE_DESCRIPTIONS_BY_NAME: Map<string, RuleDescription> = new Map([ - [PREVENT_PASSING_USER_DATA_WITHOUT_SHARING, { - name: PREVENT_PASSING_USER_DATA_WITHOUT_SHARING, - description: getMessage('ConsolidatedRuleDescription', 'Without Sharing'), +// Code Analyzer rule names +enum RuleName { + CyclicSubflow = 'CyclicSubflow', + DbInLoop = 'DbInLoop', + DefaultCopy = 'DefaultCopy', + HardcodedId = 
'HardcodedId', + MissingDescription = 'MissingDescription', + MissingFaultHandler = 'MissingFaultHandler', + MissingNextValueConnector = 'MissingNextValueConnector', + PreventPassingUserDataIntoElementWithoutSharing = 'PreventPassingUserDataIntoElementWithoutSharing', + PreventPassingUserDataIntoElementWithSharing = 'PreventPassingUserDataIntoElementWithSharing', + SameRecordUpdate = 'SameRecordUpdate', + TriggerCallout = 'TriggerCallout', + TriggerEntryCriteria = 'TriggerEntryCriteria', + TriggerWaitEvent = 'TriggerWaitEvent', + UnreachableElement = 'UnreachableElement', + UnusedResource = 'UnusedResource' +} + +const RULE_DESCRIPTIONS: RuleDescription[] = [ + { + name: RuleName.CyclicSubflow, + description: getMessage('CyclicSubflowRuleDescription'), + severityLevel: SeverityLevel.Critical, + tags: [/* NOT RECOMMENDED */ COMMON_TAGS.CATEGORIES.PERFORMANCE, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.DbInLoop, + description: getMessage('DbInLoopRuleDescription'), + severityLevel: SeverityLevel.High, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.PERFORMANCE, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.DefaultCopy, + description: getMessage('DefaultCopyRuleDescription'), + severityLevel: SeverityLevel.Moderate, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.CODE_STYLE, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.HardcodedId, + description: getMessage('HardcodedIdRuleDescription'), + severityLevel: SeverityLevel.Moderate, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.BEST_PRACTICES, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.MissingDescription, + description: getMessage('MissingDescriptionRuleDescription'), + severityLevel: SeverityLevel.Low, + tags: [/* NOT RECOMMENDED */ COMMON_TAGS.CATEGORIES.CODE_STYLE, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.MissingFaultHandler, + description: getMessage('MissingFaultHandlerRuleDescription'), + severityLevel: SeverityLevel.High, + tags: [/* NOT RECOMMENDED */ COMMON_TAGS.CATEGORIES.BEST_PRACTICES, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.MissingNextValueConnector, + description: getMessage('MissingNextValueConnectorRuleDescription'), + severityLevel: SeverityLevel.Critical, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.BEST_PRACTICES, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.PreventPassingUserDataIntoElementWithoutSharing, + description: getMessage('PreventPassingUserDataIntoElementRuleDescription', 'Without Sharing'), + severityLevel: SeverityLevel.High, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.SECURITY, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: ['https://developer.salesforce.com/docs/platform/salesforce-code-analyzer/guide/rules-flow.html#preventpassinguserdataintoelementwithoutsharing'] + }, + { + name: RuleName.PreventPassingUserDataIntoElementWithSharing, + description: getMessage('PreventPassingUserDataIntoElementRuleDescription', 'With Sharing'), + severityLevel: SeverityLevel.Low, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.SECURITY, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: ['https://developer.salesforce.com/docs/platform/salesforce-code-analyzer/guide/rules-flow.html#preventpassinguserdataintoelementwithsharing'] + }, + { + name: RuleName.SameRecordUpdate, + description: getMessage('SameRecordUpdateRuleDescription'), + severityLevel: SeverityLevel.Moderate, + 
tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.SECURITY, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.TriggerCallout, + description: getMessage('TriggerCalloutRuleDescription'), + severityLevel: SeverityLevel.High, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.PERFORMANCE, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.TriggerEntryCriteria, + description: getMessage('TriggerEntryCriteriaRuleDescription'), severityLevel: SeverityLevel.High, - tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.SECURITY, COMMON_TAGS.LANGUAGES.XML], + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.PERFORMANCE, COMMON_TAGS.LANGUAGES.XML], resourceUrls: [] - }], - [PREVENT_PASSING_USER_DATA_WITH_SHARING, { - name: PREVENT_PASSING_USER_DATA_WITH_SHARING, - description: getMessage('ConsolidatedRuleDescription', 'With Sharing'), + }, + { + name: RuleName.TriggerWaitEvent, + description: getMessage('TriggerWaitEventRuleDescription'), + severityLevel: SeverityLevel.High, + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.PERFORMANCE, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.UnreachableElement, + description: getMessage('UnreachableElementRuleDescription'), severityLevel: SeverityLevel.Low, - tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.SECURITY, COMMON_TAGS.LANGUAGES.XML], + tags: [/* NOT RECOMMENDED */ COMMON_TAGS.CATEGORIES.BEST_PRACTICES, COMMON_TAGS.LANGUAGES.XML], + resourceUrls: [] + }, + { + name: RuleName.UnusedResource, + description: getMessage('UnusedResourceRuleDescription'), + severityLevel: SeverityLevel.Moderate, + tags: [/* NOT RECOMMENDED */ COMMON_TAGS.CATEGORIES.BEST_PRACTICES, COMMON_TAGS.LANGUAGES.XML], resourceUrls: [] - }] -]); + } +]; + +const RULE_DESCRIPTIONS_BY_NAME: Map = new Map(RULE_DESCRIPTIONS.map(rd => [rd.name, rd])); -export function getConsolidatedRuleNames(): string[] { - return [...new Set(QUERY_NAMES_TO_CONSOLIDATED_NAMES.values())]; +type FlowScannerQueryAssociation = { + // The id of the flow scanner query. This is used when selecting which query to run (if it is an optional query). + queryId : string, + + // The name of the flow scanner query. Unfortunately this is what shows up in the results instead of the id. + queryName: string, + + // Should be true if the rule is queried by the --optional_queries flag and false if it is in the default preset + isOptional: boolean + + // The name of the Code Analyzer rule that the query is associated with. Note that multiple queries can be under the same rule. 
+ ruleName: RuleName } -export function getConsolidatedRuleName(unconsolidatedName: string): string { +const QUERY_ASSOCIATIONS: FlowScannerQueryAssociation[] = [ + // ==== QUERIES FROM THE DEFAULT PRESET (which we can't turn off when running flow scanner) ==== + { + queryId: "FlowSecurity.SystemModeWithSharing.recordCreates.data", + queryName: "Flow: SystemModeWithSharing recordCreates data", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithSharing + }, + { + queryId: "FlowSecurity.SystemModeWithSharing.recordDeletes.selector", + queryName: "Flow: SystemModeWithSharing recordDeletes selector", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithSharing + }, + { + queryId: "FlowSecurity.SystemModeWithSharing.recordLookups.selector", + queryName: "Flow: SystemModeWithSharing recordLookups selector", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithSharing + }, + { + queryId: "FlowSecurity.SystemModeWithSharing.recordUpdates.data", + queryName: "Flow: SystemModeWithSharing recordUpdates data", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithSharing + }, + { + queryId: "FlowSecurity.SystemModeWithSharing.recordUpdates.selector", + queryName: "Flow: SystemModeWithSharing recordUpdates selector", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithSharing + }, + { + queryId: "FlowSecurity.SystemModeWithoutSharing.recordCreates.data", + queryName: "Flow: SystemModeWithoutSharing recordCreates data", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithoutSharing + }, + { + queryId: "FlowSecurity.SystemModeWithoutSharing.recordDeletes.selector", + queryName: "Flow: SystemModeWithoutSharing recordDeletes selector", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithoutSharing + }, + { + queryId: "FlowSecurity.SystemModeWithoutSharing.recordLookups.selector", + queryName: "Flow: SystemModeWithoutSharing recordLookups selector", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithoutSharing + }, + { + queryId: "FlowSecurity.SystemModeWithoutSharing.recordUpdates.data", + queryName: "Flow: SystemModeWithoutSharing recordUpdates data", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithoutSharing + }, + { + queryId: "FlowSecurity.SystemModeWithoutSharing.recordUpdates.selector", + queryName: "Flow: SystemModeWithoutSharing recordUpdates selector", + isOptional: false, + ruleName: RuleName.PreventPassingUserDataIntoElementWithoutSharing + }, + + // ==== OPTIONAL QUERIES (which we can choose to run) ==== + { + queryId: "CyclicSubflow", + queryName: "Chain of subflow calls forms a cycle", + isOptional: true, + ruleName: RuleName.CyclicSubflow + }, + { + queryId: "DbInLoop", + queryName: "Database Operation In Loop", + isOptional: true, + ruleName: RuleName.DbInLoop + }, + { + queryId: "DefaultCopy", + queryName: "Default Copy Label", + isOptional: true, + ruleName: RuleName.DefaultCopy + }, + { + queryId: "HardcodedId", + queryName: "Hardcoded Id", + isOptional: true, + ruleName: RuleName.HardcodedId + }, + { + queryId: "MissingDescription", + queryName: "Missing Description", + isOptional: true, + ruleName: RuleName.MissingDescription + }, + { + queryId: "MissingFaultHandler", + queryName: "Missing Fault Handler", + isOptional: true, + ruleName: RuleName.MissingFaultHandler + }, + { + queryId: "MissingNextValueConnector", + queryName: "Loop Element 
Without nextValueConnector", + isOptional: true, + ruleName: RuleName.MissingNextValueConnector + }, + { + queryId: "SameRecordUpdate", + queryName: "Same Record Update In Trigger", + isOptional: true, + ruleName: RuleName.SameRecordUpdate + }, + { + queryId: "TriggerCallout", + queryName: "Trigger Flow Callout in Synchronous Path", + isOptional: true, + ruleName: RuleName.TriggerCallout + }, + { + queryId: "TriggerEntryCriteria", + queryName: "Record Trigger With No Entry Criteria", + isOptional: true, + ruleName: RuleName.TriggerEntryCriteria + }, + { + queryId: "TriggerWaitEvent", + queryName: "Wait Event in Trigger", + isOptional: true, + ruleName: RuleName.TriggerWaitEvent + }, + { + queryId: "UnreachableElement", + queryName: "Element is Unreachable", + isOptional: true, + ruleName: RuleName.UnreachableElement + }, + { + queryId: "UnusedResource", + queryName: "Unused Resource", + isOptional: true, + ruleName: RuleName.UnusedResource + } +] + +const QUERY_ASSOCIATIONS_BY_NAME : Map = new Map(QUERY_ASSOCIATIONS.map(qa => [qa.queryName, qa])); + +export function getAllRuleNames(): string[] { + return Object.values(RuleName); +} + +export function getRuleNameFromQueryName(queryName: string): string { // istanbul ignore else - if (QUERY_NAMES_TO_CONSOLIDATED_NAMES.has(unconsolidatedName)) { - return QUERY_NAMES_TO_CONSOLIDATED_NAMES.get(unconsolidatedName)!; + if (QUERY_ASSOCIATIONS_BY_NAME.has(queryName)) { + return QUERY_ASSOCIATIONS_BY_NAME.get(queryName)!.ruleName; } else { - throw new Error(`Developer error: invalid name ${unconsolidatedName}`); + throw new Error(`Developer error: invalid query name ${queryName}`); + } +} + +export function getOptionalQueryIdsForRule(ruleName: string): string[] { + const queryIds: string[] = []; + for (const queryAssociation of QUERY_ASSOCIATIONS) { + if (queryAssociation.isOptional && queryAssociation.ruleName === ruleName) { + queryIds.push(queryAssociation.queryId); + } } + return queryIds; } -export function getConsolidatedRuleByName(consolidatedName: string): RuleDescription { +export function getDescriptionForRule(ruleName: string): RuleDescription { // istanbul ignore else - if (CONSOLIDATED_RULE_DESCRIPTIONS_BY_NAME.has(consolidatedName)) { - return CONSOLIDATED_RULE_DESCRIPTIONS_BY_NAME.get(consolidatedName)!; + if (RULE_DESCRIPTIONS_BY_NAME.has(ruleName)) { + return RULE_DESCRIPTIONS_BY_NAME.get(ruleName)!; } else { - throw new Error(`Developer rule: No consolidated rule with name ${consolidatedName}`); + throw new Error(`Developer rule: No rule with name ${ruleName}`); } } \ No newline at end of file diff --git a/packages/code-analyzer-flow-engine/src/messages.ts b/packages/code-analyzer-flow-engine/src/messages.ts index b109dcfa..0dba4231 100644 --- a/packages/code-analyzer-flow-engine/src/messages.ts +++ b/packages/code-analyzer-flow-engine/src/messages.ts @@ -40,9 +40,6 @@ const MESSAGE_CATALOG: {[key: string]: string} = { FirstNodeComment: `%s: %s`, - ConsolidatedRuleDescription: - `Avoid passing user data into Flow Scanner elements in run mode: %s`, - SubsequentNodeComment: `%s influences %s: %s`, @@ -53,7 +50,50 @@ const MESSAGE_CATALOG: {[key: string]: string} = { `The following call to python exited with non-zero exit code.\n` + ` Command: %s\n` + ` Exit Code: %d\n` + - ` StdErr:\n%s` + ` StdErr:\n%s`, + + // ==== RULE DESCRIPTIONS ==== + CyclicSubflowRuleDescription: + `This rule detects when a subflow calls a parent flow, creating a cyclic flow. 
The rule will detect cycles of any depth.`, + + DbInLoopRuleDescription: + `This rule detects when there are CRUD flow elements within a loop (RecordLookups, RecordCreates, RecordUpdates, RecordDeletes). This rule does not trigger if the CRUD element is in a fault handler. These DB operations should be bulkified by using collections and the "IN" condition. This rule does not follow subflows.`, + + DefaultCopyRuleDescription: + `This rule detects default names and labels that were auto assigned to elements pasted in the flow builder UI. These labels and names should be changed to make the flow comprehensible to maintainers.`, + + HardcodedIdRuleDescription: + `This rule detects hardcoded IDs within a flow. Hardcoded Ids are a bad practice, and such flows are not appropriate for distribution.`, + + MissingDescriptionRuleDescription: + `This rule detects elements that contain labels but are missing descriptions. All elements with labels should have accompanying descriptions to make the flow comprehensible to future maintainers.`, + + MissingFaultHandlerRuleDescription: + `This rule detects when elements that can fire fault events are missing fault handlers. The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.`, + + MissingNextValueConnectorRuleDescription: + `This rule detects Loops without nextValue connectors. Loops should always have nextValue connectors, and lack of one usually signifies developer error when connecting the loop element to other elements.`, + + PreventPassingUserDataIntoElementRuleDescription: + `Avoid passing user data into Flow Scanner elements in run mode: %s`, + + SameRecordUpdateRuleDescription: + `This rule detects when an AfterSave record trigger modifies the same record. Record modifications should be done in BeforeSave triggers, not AfterSave triggers. This rule follows subflows, so it will detect if the RecordId is passed to a child flow which then modifies a record with that id.`, + + TriggerCalloutRuleDescription: + `This rule detects when a trigger performs a callout on the synchronous path. Triggers must be performant and may only contain callouts on async scheduled paths. This rule follows subflows.`, + + TriggerEntryCriteriaRuleDescription: + `This rule detects when record trigger flows are missing entry criteria. All record trigger flows should have entry criteria specified in the flow trigger definition rather than solely in the flow's own business logic.`, + + TriggerWaitEventRuleDescription: + `This rule detects when a wait event is reached during trigger execution. Triggers must be performant and cannot contain wait events. For async processing, use scheduled paths within your trigger and async callouts, not wait events. This rule follows subflows.`, + + UnreachableElementRuleDescription: + `This rule identifies elements that have not been connected to the start element of the flow. Unreachable elements are usually due to incomplete flows or developer error.`, + + UnusedResourceRuleDescription: + `This rule detects redundant variables that are not used in the flow. 
This can be a sign of developer error.` }; export function getMessage(msgId: string, ...args: (string | number)[]): string { diff --git a/packages/code-analyzer-flow-engine/src/python/FlowScannerCommandWrapper.ts b/packages/code-analyzer-flow-engine/src/python/FlowScannerCommandWrapper.ts index d9b6371f..241b6ff3 100644 --- a/packages/code-analyzer-flow-engine/src/python/FlowScannerCommandWrapper.ts +++ b/packages/code-analyzer-flow-engine/src/python/FlowScannerCommandWrapper.ts @@ -9,6 +9,7 @@ export interface FlowScannerCommandWrapper { workspaceFlowFiles: string[], targetedFlowFiles: string[], absLogFilePath: string, + optionalQueryIds: string[], completionPercentageHandler: (percentage: number) => void ): Promise; } @@ -18,13 +19,20 @@ export type FlowScannerExecutionResult = { } export type FlowScannerRuleResult = { - flow: FlowNodeDescriptor[]; query_name: string; severity: string; counter?: number; description: string; elem_name: string; + elem_code: string; field: string; + + // This is defined if the violation is a flow based violation + flow?: FlowNodeDescriptor[]; + + // These are filled in if the violation is a single element based violation + elem_line_no?: number; + filename?: string; } export type FlowNodeDescriptor = { @@ -51,6 +59,7 @@ export class RunTimeFlowScannerCommandWrapper implements FlowScannerCommandWrapp workspaceFlowFiles: string[], targetedFlowFiles: string[], absLogFilePath: string, + optionalQueryIds: string[], completionPercentageHandler: (percentage: number) => void ): Promise { const workspaceFlowsFile: string = path.join(workingFolder, 'workspaceFiles.txt'); @@ -59,7 +68,7 @@ export class RunTimeFlowScannerCommandWrapper implements FlowScannerCommandWrapp await fs.promises.writeFile(targetedFlowsFile, targetedFlowFiles.join('\n'), 'utf-8'); const flowScannerResultsFile: string = path.join(workingFolder, 'flowScannerResultsFile.json') - const commandName = 'flowtest'; //pythonModuleName set by internal team + const commandName = 'flow_scanner'; //pythonModuleName set by internal team const pythonArgs: string[] = [ '-m', @@ -71,6 +80,8 @@ export class RunTimeFlowScannerCommandWrapper implements FlowScannerCommandWrapp workspaceFlowsFile, '--target', targetedFlowsFile, + '--optional_queries', + optionalQueryIds.join(','), '--json', flowScannerResultsFile ]; @@ -130,6 +141,8 @@ export class RunTimeFlowScannerCommandWrapper implements FlowScannerCommandWrapp /* istanbul ignore next */ private ruleResultIsValid(ruleResult: object): ruleResult is FlowScannerRuleResult { + // Only require the fields that we actually use + if (!('query_name' in ruleResult) || typeof ruleResult.query_name !== 'string') { return false; } @@ -139,7 +152,7 @@ export class RunTimeFlowScannerCommandWrapper implements FlowScannerCommandWrapp if (!('description' in ruleResult) || typeof ruleResult.description !== 'string') { return false; } - if (!('elem' in ruleResult) || typeof ruleResult.elem !== 'string') { + if (!('elem_code' in ruleResult) || typeof ruleResult.elem_code !== 'string') { return false; } if (!('elem_name' in ruleResult) || typeof ruleResult.elem_name !== 'string') { @@ -149,7 +162,9 @@ export class RunTimeFlowScannerCommandWrapper implements FlowScannerCommandWrapp return false; } if (!('flow' in ruleResult) || !(Array.isArray(ruleResult.flow))) { - return false; + // If no flow array then it must be a single element violation + return ('elem_line_no' in ruleResult && typeof ruleResult['elem_line_no'] === 'number') && + ('filename' in ruleResult && typeof 
ruleResult['filename'] === 'string'); } const flowNodes: object[] = ruleResult.flow; for (const flowNode of flowNodes) { diff --git a/packages/code-analyzer-flow-engine/test/engine.test.ts b/packages/code-analyzer-flow-engine/test/engine.test.ts index feecdaae..21ae60bb 100644 --- a/packages/code-analyzer-flow-engine/test/engine.test.ts +++ b/packages/code-analyzer-flow-engine/test/engine.test.ts @@ -1,5 +1,4 @@ import * as path from "node:path"; -import * as os from "node:os"; import { DescribeRulesProgressEvent, EngineRunResults, @@ -35,10 +34,11 @@ const PATH_TO_EXAMPLE3: string = path.join(PATH_TO_MULTIPLE_FLOWS_WORKSPACE, 'ex const PATH_TO_EXAMPLE4_PARENTFLOW: string = path.join(PATH_TO_MULTIPLE_FLOWS_WORKSPACE, 'example4_parentFlow.flow-meta.xml'); const PATH_TO_EXAMPLE4_SUBFLOW: string = path.join(PATH_TO_MULTIPLE_FLOWS_WORKSPACE, 'example4_subflow.flow-meta.xml'); -const ALL_FLOW_RULES: string[] = [ - 'PreventPassingUserDataIntoElementWithSharing', - 'PreventPassingUserDataIntoElementWithoutSharing' -]; +const expectedRulesGoldfile = path.join(TEST_DATA_FOLDER, 'goldfiles', 'all_rules.goldfile.json'); +const goldFileContents: string = fs.readFileSync(expectedRulesGoldfile, 'utf-8'); +const expectedRuleDescriptions: RuleDescription[] = JSON.parse(goldFileContents); + +const PreventPassingUserDataRules = ['PreventPassingUserDataIntoElementWithoutSharing', 'PreventPassingUserDataIntoElementWithSharing']; jest.setTimeout(60_000); @@ -67,7 +67,7 @@ describe('Tests for the FlowScannerEngine', () => { const ruleDescriptors: RuleDescription[] = await engine.describeRules(createDescribeOptions(workspace)); // No need to do in-depth examination of the rules, since other tests already do that. Just make sure we got // the right number of rules. - expect(ruleDescriptors).toHaveLength(2); + expect(ruleDescriptors).toHaveLength(15); expect(describeProgressEvents.map(e => e.percentComplete)).toEqual([0, 75, 100]); // Part 2: Running production rules. @@ -79,9 +79,41 @@ describe('Tests for the FlowScannerEngine', () => { fs.rmSync(expectedFlowLogFile); } const results: EngineRunResults = await engine.runRules(ruleDescriptors.map(r => r.name), runOptions); - // No need to do in-depth examination of the results, since other tests already do that. Just make sure we - // got the right number of violations. - expect(results.violations).toHaveLength(7); + // First we make sure we got the right number of violations per rule + const countsPerRule: Record = results.violations.reduce>((acc, violation) => { + acc[violation.ruleName] = (acc[violation.ruleName] || 0) + 1; + return acc; + }, {}); + expect(countsPerRule).toEqual({ + MissingDescription: 56, + MissingFaultHandler: 15, + PreventPassingUserDataIntoElementWithoutSharing: 5, + PreventPassingUserDataIntoElementWithSharing: 2 + }); + // Next, spot check a few violations to confirm formatting: + expect(results.violations).toContainEqual({ + ruleName: 'MissingDescription', + message: 'This rule detects elements that contain labels but are missing descriptions. All elements with labels should have accompanying descriptions to make the flow comprehensible to future maintainers.', + codeLocations: [{ + file: PATH_TO_EXAMPLE2, + startLine: 225, + startColumn: 1 + }], + primaryLocationIndex: 0, + resourceUrls: [] + }); + expect(results.violations).toContainEqual({ + ruleName: 'MissingFaultHandler', + message: 'This rule detects when elements that can fire fault events are missing fault handlers. 
The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.', + codeLocations: [{ + file: PATH_TO_EXAMPLE4_SUBFLOW, + startLine: 69, + startColumn: 1 + }], + primaryLocationIndex: 0, + resourceUrls: [] + }); + expect(runProgressEvents.map(e => e.percentComplete)).toEqual([0, 10, 10, 26, 42, 58, 74, 100]); // Confirm separate flow log file exists and the main log points to this file @@ -99,23 +131,9 @@ describe('Tests for the FlowScannerEngine', () => { it('Consolidates well-formed Flow Scanner rule descriptors into Code Analyzer rule descriptors', async () => { const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const ruleDescriptors: RuleDescription[] = await engine.describeRules(createDescribeOptions()); - - expect(ruleDescriptors).toHaveLength(2); - expect(ruleDescriptors[0]).toEqual({ - name: 'PreventPassingUserDataIntoElementWithoutSharing', - severityLevel: SeverityLevel.High, - tags: ['Recommended', 'Security', 'XML'], - description: 'Avoid passing user data into Flow Scanner elements in run mode: Without Sharing', - resourceUrls: [] - }); - expect(ruleDescriptors[1]).toEqual({ - name: 'PreventPassingUserDataIntoElementWithSharing', - severityLevel: SeverityLevel.Low, - tags: ['Recommended', 'Security', 'XML'], - description: 'Avoid passing user data into Flow Scanner elements in run mode: With Sharing', - resourceUrls: [] - }); + const ruleDescriptions: RuleDescription[] = await engine.describeRules(createDescribeOptions()); + + expect(ruleDescriptions).toEqual(expectedRuleDescriptions); }); }); @@ -139,7 +157,7 @@ describe('Tests for the FlowScannerEngine', () => { const ruleDescriptors: RuleDescription[] = await engine.describeRules(createDescribeOptions(workspace)); - expect(ruleDescriptors).toHaveLength(2); + expect(ruleDescriptors).toHaveLength(expectedRuleDescriptions.length); }); it.each([ @@ -358,8 +376,8 @@ describe('Tests for the FlowScannerEngine', () => { jest.restoreAllMocks(); }); - it('When running both rules on workspace that contains violations for SystemModeWithoutSharing and SystemModeWithSharing, then results are as expected', async () => { - const engineResults: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions( + it('When running the PreventPassingUserData* rules on workspace that contains violations for SystemModeWithoutSharing and SystemModeWithSharing, then results are as expected', async () => { + const engineResults: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions( new Workspace('id', [PATH_TO_MULTIPLE_FLOWS_WORKSPACE]))); expect(engineResults.violations).toHaveLength(7); @@ -388,7 +406,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When workspace includes only some files from within a folder, then filters out results for files outside of workspace', async () => { const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const engineResults: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions( + const engineResults: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions( new Workspace('id', [PATH_TO_EXAMPLE2]))); expect(engineResults.violations).toHaveLength(2); @@ -397,7 +415,7 @@ describe('Tests for the FlowScannerEngine', () => { }); it('When workspace does not contain flow files, then return zero violations', async () => { - const engineResults: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions( 
+ const engineResults: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions( new Workspace('id', [PATH_TO_NO_FLOWS_WORKSPACE]))); expect(engineResults.violations).toHaveLength(0); @@ -406,7 +424,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When workspace contains flow files that have no violations but is in folder with other flow files that do have violations, then return valid results with zero violations', async () => { const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const engineResults: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions( + const engineResults: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions( new Workspace('id', [PATH_TO_EXAMPLE3]))); expect(engineResults.violations).toHaveLength(0); @@ -415,7 +433,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When workspace contains flow files with violations but the targeted files have no violations, then return valid results with zero violations', async () => { const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const engineResults: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions( + const engineResults: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions( new Workspace('id', [PATH_TO_MULTIPLE_FLOWS_WORKSPACE], [PATH_TO_EXAMPLE3]))); expect(engineResults.violations).toHaveLength(0); @@ -424,7 +442,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When workspace contains flow files with many violations and the targeted files have violations, then return valid results with violations only of targeted files', async () => { const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const engineResults: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions( + const engineResults: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions( new Workspace('id', [PATH_TO_MULTIPLE_FLOWS_WORKSPACE], [PATH_TO_EXAMPLE1]))); expect(engineResults.violations).toHaveLength(2); @@ -438,7 +456,7 @@ describe('Tests for the FlowScannerEngine', () => { // we might miss the flow utility returning results: null. 
const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const engineResults: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions( + const engineResults: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions( new Workspace('id', [PATH_TO_ONE_FLOW_NO_VIOLATIONS_WORKSPACE]))); expect(engineResults.violations).toHaveLength(0); @@ -450,7 +468,7 @@ describe('Tests for the FlowScannerEngine', () => { ])('When workspace contains a parent flow but not its child subflow, then return valid results with zero violations', async (workspace) => { const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const engineResults1: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const engineResults1: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(engineResults1.violations).toHaveLength(0); }); @@ -460,7 +478,7 @@ describe('Tests for the FlowScannerEngine', () => { ])('When workspace contains a child subflow but not its parent flow, then return valid results with zero violations', async (workspace) => { const engine: FlowScannerEngine = new FlowScannerEngine(flowScannerCommandWrapper); - const engineResults1: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const engineResults1: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(engineResults1.violations).toHaveLength(0); }); @@ -509,7 +527,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When both parent and child are in workspace and targeted, then expect violation', async () => { const workspace: Workspace = new Workspace("someId", [PARENT_WITH_SOURCE_CALLS_SUB_WITH_SINK_WORKSPACE]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(1); expect(results.violations[0]).toEqual(expectedViolation); @@ -524,7 +542,7 @@ describe('Tests for the FlowScannerEngine', () => { PATH_TO_ONE_FLOW_NO_VIOLATIONS_WORKSPACE ]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); @@ -532,7 +550,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When both parent and child are in workspace but only child is targeted, then expect no violation', async () => { const workspace: Workspace = new Workspace("someId", [PARENT_WITH_SOURCE_CALLS_SUB_WITH_SINK_WORKSPACE], [childFlowFile]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); @@ -540,7 +558,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When both parent and child are in workspace but only parent is targeted, then expect violation since sink is found in workspace', async () => { const workspace: Workspace = new Workspace("someId", [PARENT_WITH_SOURCE_CALLS_SUB_WITH_SINK_WORKSPACE], [parentFlowFile]); - const results: EngineRunResults = await 
engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(1); expect(results.violations[0]).toEqual(expectedViolation); @@ -549,7 +567,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When only parent is in workspace and only parent is targeted, then expect no violation since sink is not in workspace', async () => { const workspace: Workspace = new Workspace("someId", [parentFlowFile], [parentFlowFile]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); @@ -557,7 +575,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When only child is in workspace and only child is targeted, then expect no violation since source is not in workspace', async () => { const workspace: Workspace = new Workspace("someId", [childFlowFile], [childFlowFile]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); @@ -608,7 +626,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When both parent and child are in workspace and targeted, then expect violation', async () => { const workspace: Workspace = new Workspace("someId", [PARENT_WITH_SINK_CALLS_SUB_WITH_SOURCE_WORKSPACE]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(1); expect(results.violations[0]).toEqual(expectedViolation); @@ -617,7 +635,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When both parent and child are in workspace but neither are targeted, then expect no violation', async () => { const workspace: Workspace = new Workspace("someId", [parentFlowFile, childFlowFile, PATH_TO_EXAMPLE3], [PATH_TO_EXAMPLE3]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); @@ -625,7 +643,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When both parent and child are in workspace but only child is targeted, then expect no violation since child would not be called', async () => { const workspace: Workspace = new Workspace("someId", [PARENT_WITH_SINK_CALLS_SUB_WITH_SOURCE_WORKSPACE], [childFlowFile]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); @@ -633,7 +651,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When both parent and child are in workspace but only parent is targeted, then expect a violation since the parent is targeted', async () => { const workspace: Workspace = new Workspace("someId", [PARENT_WITH_SINK_CALLS_SUB_WITH_SOURCE_WORKSPACE], [parentFlowFile]); 
- const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(1); expect(results.violations[0]).toEqual(expectedViolation); @@ -642,7 +660,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When only parent is in workspace and only parent is targeted, then expect no violation since sink is not in workspace', async () => { const workspace: Workspace = new Workspace("someId", [parentFlowFile], [parentFlowFile]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); @@ -650,7 +668,7 @@ describe('Tests for the FlowScannerEngine', () => { it('When only child is in workspace and only child is targeted, then expect no violation since source is not in workspace', async () => { const workspace: Workspace = new Workspace("someId", [childFlowFile], [childFlowFile]); - const results: EngineRunResults = await engine.runRules(ALL_FLOW_RULES, createRunOptions(workspace)); + const results: EngineRunResults = await engine.runRules(PreventPassingUserDataRules, createRunOptions(workspace)); expect(results.violations).toHaveLength(0); }); diff --git a/packages/code-analyzer-flow-engine/test/python/FlowScannerCommandWrapper.test.ts b/packages/code-analyzer-flow-engine/test/python/FlowScannerCommandWrapper.test.ts index 15d0af51..4a07fa75 100644 --- a/packages/code-analyzer-flow-engine/test/python/FlowScannerCommandWrapper.test.ts +++ b/packages/code-analyzer-flow-engine/test/python/FlowScannerCommandWrapper.test.ts @@ -33,7 +33,13 @@ describe('FlowScannerCommandWrapper implementations', () => { }; beforeAll(async () => { - results = await wrapper.runFlowScannerRules(workingFolder, [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], tempLogFile, statusProcessorFunction); + results = await wrapper.runFlowScannerRules( + workingFolder, + [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], + [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], + tempLogFile, + ['MissingFaultHandler'], // adding in one optional query as well + statusProcessorFunction); // The `counter` property is irrelevant to us, and causes problems across platforms. So delete it. 
for (const queryName of Object.keys(results.results)) { for (const queryResults of results.results[queryName]) { @@ -60,10 +66,10 @@ describe('FlowScannerCommandWrapper implementations', () => { const expectedValue = expectedResults.results[key]; expect(key in results.results).toEqual(true); expect(results.results[key]).toHaveLength(expectedValue.length); - expect(results.results[key]).toEqual(expectedValue); + for(const expectedElement of expectedValue) { // Need to do this because it seems that flow_scanner does not give results in a sorted or deterministic fashion + expect(results.results[key]).toContainEqual(expectedElement); + } } - - }); it('Correctly parses status updates from stdout', () => { @@ -103,7 +109,7 @@ describe('FlowScannerCommandWrapper implementations', () => { }); const wrapper: RunTimeFlowScannerCommandWrapper = new RunTimeFlowScannerCommandWrapper(PYTHON_COMMAND); - await expect(wrapper.runFlowScannerRules(workingFolder, [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], tempLogFile, (_num: number) => {})) + await expect(wrapper.runFlowScannerRules(workingFolder, [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], [PATH_TO_EXAMPLE1, PATH_TO_EXAMPLE2], tempLogFile, [], (_num: number) => {})) .rejects .toThrow(expectedMessage); }); diff --git a/packages/code-analyzer-flow-engine/test/test-data/goldfiles/FlowScannerCommandWrapper.test.ts/results.goldfile.json b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/FlowScannerCommandWrapper.test.ts/results.goldfile.json index 6b451955..78c8a76b 100644 --- a/packages/code-analyzer-flow-engine/test/test-data/goldfiles/FlowScannerCommandWrapper.test.ts/results.goldfile.json +++ b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/FlowScannerCommandWrapper.test.ts/results.goldfile.json @@ -1,145 +1,265 @@ { - "preset" : "Penetration Testing", - "help_url" : null, - "result_id" : "1fe99d91", - "service_version" : "0.7.1", - "flowtest_version" : "0.7.1", - "report_label" : "flowscan of contains-multiple-flows", - "email" : null, - "scan_start" : "2025-01-15 16:26:13", - "scan_end" : "2025-01-15 16:26:13", - "results" : { - "FlowSecurity.SystemModeWithoutSharing.recordUpdates.data" : [ { - "flow" : [ { - "influenced_var" : "change_subject_of_case", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subject_of_case", - "comment" : "Initialization", - "flow_path" : "__PATH_TO_EXAMPLE1__", - "line_no" : 124, - "source_text" : "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n " - }, { - "influenced_var" : "another_case_holder.Subject", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subj_assignment", - "comment" : "Variable Assignment", - "flow_path" : "__PATH_TO_EXAMPLE1__", - "line_no" : 26, - "source_text" : "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n " - }, { - "influenced_var" : "update_to_new_subject", - "influencer_var" : "another_case_holder", - "element_name" : "update_to_new_subject", - "comment" : "flow into recordUpdates via influence over update_to_new_subject in run mode SystemModeWithoutSharing", - "flow_path" : "__PATH_TO_EXAMPLE1__", - "line_no" : 102, - "source_text" : "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n \n " - } ], - "query_name" : "Flow: SystemModeWithoutSharing recordUpdates data", - "severity" : "Flow_High_Severity", - "description" : "User controlled data flows into recordUpdates element 
data in run mode: SystemModeWithoutSharing", - "elem" : "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n \n ", - "elem_name" : "update_to_new_subject", - "field" : "update_to_new_subject" - } ], - "FlowSecurity.SystemModeWithoutSharing.recordDeletes.selector" : [ { - "flow" : [ { - "influenced_var" : "change_subject_of_case", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subject_of_case", - "comment" : "Initialization", - "flow_path" : "__PATH_TO_EXAMPLE1__", - "line_no" : 124, - "source_text" : "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n " - }, { - "influenced_var" : "another_case_holder.Subject", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subj_assignment", - "comment" : "Variable Assignment", - "flow_path" : "__PATH_TO_EXAMPLE1__", - "line_no" : 26, - "source_text" : "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n " - }, { - "influenced_var" : "delete_created_case", - "influencer_var" : "another_case_holder", - "element_name" : "delete_created_case", - "comment" : "flow into recordDeletes via influence over delete_created_case in run mode SystemModeWithoutSharing", - "flow_path" : "__PATH_TO_EXAMPLE1__", - "line_no" : 69, - "source_text" : "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n \n " - } ], - "query_name" : "Flow: SystemModeWithoutSharing recordDeletes selector", - "severity" : "Flow_High_Severity", - "description" : "User controlled data flows into recordDeletes element selector in run mode: SystemModeWithoutSharing", - "elem" : "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n \n ", - "elem_name" : "delete_created_case", - "field" : "delete_created_case" - } ], - "FlowSecurity.SystemModeWithSharing.recordUpdates.data" : [ { - "flow" : [ { - "influenced_var" : "change_subject_of_case", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subject_of_case", - "comment" : "Initialization", - "flow_path" : "__PATH_TO_EXAMPLE2__", - "line_no" : 124, - "source_text" : "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n " - }, { - "influenced_var" : "another_case_holder.Subject", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subj_assignment", - "comment" : "Variable Assignment", - "flow_path" : "__PATH_TO_EXAMPLE2__", - "line_no" : 26, - "source_text" : "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n " - }, { - "influenced_var" : "update_to_new_subject", - "influencer_var" : "another_case_holder", - "element_name" : "update_to_new_subject", - "comment" : "flow into recordUpdates via influence over update_to_new_subject in run mode SystemModeWithSharing", - "flow_path" : "__PATH_TO_EXAMPLE2__", - "line_no" : 102, - "source_text" : "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n \n " - } ], - "query_name" : "Flow: SystemModeWithSharing recordUpdates data", - "severity" : "Flow_Low_Severity", - "description" : "User controlled data flows into recordUpdates element data in run mode: SystemModeWithSharing", - "elem" : "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n \n ", - "elem_name" : "update_to_new_subject", - "field" : "update_to_new_subject" - } ], - "FlowSecurity.SystemModeWithSharing.recordDeletes.selector" 
: [ { - "flow" : [ { - "influenced_var" : "change_subject_of_case", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subject_of_case", - "comment" : "Initialization", - "flow_path" : "__PATH_TO_EXAMPLE2__", - "line_no" : 124, - "source_text" : "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n " - }, { - "influenced_var" : "another_case_holder.Subject", - "influencer_var" : "change_subject_of_case", - "element_name" : "change_subj_assignment", - "comment" : "Variable Assignment", - "flow_path" : "__PATH_TO_EXAMPLE2__", - "line_no" : 26, - "source_text" : "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n " - }, { - "influenced_var" : "delete_created_case", - "influencer_var" : "another_case_holder", - "element_name" : "delete_created_case", - "comment" : "flow into recordDeletes via influence over delete_created_case in run mode SystemModeWithSharing", - "flow_path" : "__PATH_TO_EXAMPLE2__", - "line_no" : 69, - "source_text" : "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n \n " - } ], - "query_name" : "Flow: SystemModeWithSharing recordDeletes selector", - "severity" : "Flow_Low_Severity", - "description" : "User controlled data flows into recordDeletes element selector in run mode: SystemModeWithSharing", - "elem" : "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n \n ", - "elem_name" : "delete_created_case", - "field" : "delete_created_case" - } ] + "preset": "Penetration Testing", + "help_url": null, + "result_id": "6b042660", + "service_version": "0.9.9", + "flow_scanner_version": "0.9.9", + "report_label": "scan of code-analyzer-core", + "email": null, + "scan_start": "2025-10-24 10:55:33", + "scan_end": "2025-10-24 10:55:33", + "results": { + "FlowSecurity.SystemModeWithSharing.recordDeletes.selector": [ + { + "query_name": "Flow: SystemModeWithSharing recordDeletes selector", + "severity": "Flow_Low_Severity", + "description": "User controlled data flows into recordDeletes element selector in run mode: SystemModeWithSharing", + "elem": null, + "elem_name": "delete_created_case", + "field": "delete_created_case", + "elem_code": "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n ", + "elem_line_no": null, + "filename": null, + "flow_type": "Screen", + "flow": [ + { + "influenced_var": "change_subject_of_case", + "influencer_var": "change_subject_of_case", + "element_name": "change_subject_of_case", + "comment": "Initialization", + "line_no": 124, + "source_text": "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n ", + "flow_path": "__PATH_TO_EXAMPLE2__" + }, + { + "influenced_var": "another_case_holder.Subject", + "influencer_var": "change_subject_of_case", + "element_name": "change_subj_assignment", + "comment": "Variable Assignment", + "line_no": 26, + "source_text": "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n ", + "flow_path": "__PATH_TO_EXAMPLE2__" + }, + { + "influenced_var": "delete_created_case", + "influencer_var": "another_case_holder", + "element_name": "delete_created_case", + "comment": "flow into recordDeletes via influence over delete_created_case in run mode SystemModeWithSharing", + "line_no": 69, + "source_text": "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n ", + "flow_path": "__PATH_TO_EXAMPLE2__" + } + ] + } + ], + 
"MissingFaultHandler": [ + { + "query_name": "Missing Fault Handler", + "severity": "Flow_Low_Severity", + "description": "This rule detects when elements that can fire fault events are missing fault handlers. The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.", + "elem": "create_case", + "elem_name": "create_case", + "field": "create_case", + "elem_code": "\n create_from_record\n create_case\n \n 1045\n 224\n \n press_next\n \n case_holder\n ", + "elem_line_no": 58, + "filename": "__PATH_TO_EXAMPLE1__", + "flow_type": "Screen", + "flow": null + }, + { + "query_name": "Missing Fault Handler", + "severity": "Flow_Low_Severity", + "description": "This rule detects when elements that can fire fault events are missing fault handlers. The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.", + "elem": "update_to_new_subject", + "elem_name": "update_to_new_subject", + "field": "update_to_new_subject", + "elem_code": "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n ", + "elem_line_no": 102, + "filename": "__PATH_TO_EXAMPLE1__", + "flow_type": "Screen", + "flow": null + }, + { + "query_name": "Missing Fault Handler", + "severity": "Flow_Low_Severity", + "description": "This rule detects when elements that can fire fault events are missing fault handlers. The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.", + "elem": "create_case", + "elem_name": "create_case", + "field": "create_case", + "elem_code": "\n create_from_record\n create_case\n \n 1045\n 224\n \n press_next\n \n case_holder\n ", + "elem_line_no": 58, + "filename": "__PATH_TO_EXAMPLE2__", + "flow_type": "Screen", + "flow": null + }, + { + "query_name": "Missing Fault Handler", + "severity": "Flow_Low_Severity", + "description": "This rule detects when elements that can fire fault events are missing fault handlers. The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.", + "elem": "delete_created_case", + "elem_name": "delete_created_case", + "field": "delete_created_case", + "elem_code": "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n ", + "elem_line_no": 69, + "filename": "__PATH_TO_EXAMPLE1__", + "flow_type": "Screen", + "flow": null + }, + { + "query_name": "Missing Fault Handler", + "severity": "Flow_Low_Severity", + "description": "This rule detects when elements that can fire fault events are missing fault handlers. The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.", + "elem": "update_to_new_subject", + "elem_name": "update_to_new_subject", + "field": "update_to_new_subject", + "elem_code": "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n ", + "elem_line_no": 102, + "filename": "__PATH_TO_EXAMPLE2__", + "flow_type": "Screen", + "flow": null + }, + { + "query_name": "Missing Fault Handler", + "severity": "Flow_Low_Severity", + "description": "This rule detects when elements that can fire fault events are missing fault handlers. 
The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.", + "elem": "delete_created_case", + "elem_name": "delete_created_case", + "field": "delete_created_case", + "elem_code": "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n ", + "elem_line_no": 69, + "filename": "__PATH_TO_EXAMPLE2__", + "flow_type": "Screen", + "flow": null + } + ], + "FlowSecurity.SystemModeWithoutSharing.recordUpdates.data": [ + { + "query_name": "Flow: SystemModeWithoutSharing recordUpdates data", + "severity": "Flow_High_Severity", + "description": "User controlled data flows into recordUpdates element data in run mode: SystemModeWithoutSharing", + "elem": null, + "elem_name": "update_to_new_subject", + "field": "update_to_new_subject", + "elem_code": "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n ", + "elem_line_no": null, + "filename": null, + "flow_type": "Screen", + "flow": [ + { + "influenced_var": "change_subject_of_case", + "influencer_var": "change_subject_of_case", + "element_name": "change_subject_of_case", + "comment": "Initialization", + "line_no": 124, + "source_text": "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n ", + "flow_path": "__PATH_TO_EXAMPLE1__" + }, + { + "influenced_var": "another_case_holder.Subject", + "influencer_var": "change_subject_of_case", + "element_name": "change_subj_assignment", + "comment": "Variable Assignment", + "line_no": 26, + "source_text": "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n ", + "flow_path": "__PATH_TO_EXAMPLE1__" + }, + { + "influenced_var": "update_to_new_subject", + "influencer_var": "another_case_holder", + "element_name": "update_to_new_subject", + "comment": "flow into recordUpdates via influence over update_to_new_subject in run mode SystemModeWithoutSharing", + "line_no": 102, + "source_text": "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n ", + "flow_path": "__PATH_TO_EXAMPLE1__" + } + ] + } + ], + "FlowSecurity.SystemModeWithoutSharing.recordDeletes.selector": [ + { + "query_name": "Flow: SystemModeWithoutSharing recordDeletes selector", + "severity": "Flow_High_Severity", + "description": "User controlled data flows into recordDeletes element selector in run mode: SystemModeWithoutSharing", + "elem": null, + "elem_name": "delete_created_case", + "field": "delete_created_case", + "elem_code": "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n ", + "elem_line_no": null, + "filename": null, + "flow_type": "Screen", + "flow": [ + { + "influenced_var": "change_subject_of_case", + "influencer_var": "change_subject_of_case", + "element_name": "change_subject_of_case", + "comment": "Initialization", + "line_no": 124, + "source_text": "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n ", + "flow_path": "__PATH_TO_EXAMPLE1__" + }, + { + "influenced_var": "another_case_holder.Subject", + "influencer_var": "change_subject_of_case", + "element_name": "change_subj_assignment", + "comment": "Variable Assignment", + "line_no": 26, + "source_text": "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n ", + "flow_path": "__PATH_TO_EXAMPLE1__" + }, + { + "influenced_var": "delete_created_case", + "influencer_var": "another_case_holder", + "element_name": "delete_created_case", + "comment": "flow 
into recordDeletes via influence over delete_created_case in run mode SystemModeWithoutSharing", + "line_no": 69, + "source_text": "\n delete_created_case\n \n 247\n 201\n \n exit_screen\n \n another_case_holder\n ", + "flow_path": "__PATH_TO_EXAMPLE1__" + } + ] + } + ], + "FlowSecurity.SystemModeWithSharing.recordUpdates.data": [ + { + "query_name": "Flow: SystemModeWithSharing recordUpdates data", + "severity": "Flow_Low_Severity", + "description": "User controlled data flows into recordUpdates element data in run mode: SystemModeWithSharing", + "elem": null, + "elem_name": "update_to_new_subject", + "field": "update_to_new_subject", + "elem_code": "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n ", + "elem_line_no": null, + "filename": null, + "flow_type": "Screen", + "flow": [ + { + "influenced_var": "change_subject_of_case", + "influencer_var": "change_subject_of_case", + "element_name": "change_subject_of_case", + "comment": "Initialization", + "line_no": 124, + "source_text": "\n change_subject_of_case\n String\n \n another_case_holder.Subject\n \n change subject of case\n InputField\n true\n ", + "flow_path": "__PATH_TO_EXAMPLE2__" + }, + { + "influenced_var": "another_case_holder.Subject", + "influencer_var": "change_subject_of_case", + "element_name": "change_subj_assignment", + "comment": "Variable Assignment", + "line_no": 26, + "source_text": "\n another_case_holder.Subject\n Assign\n \n change_subject_of_case\n \n ", + "flow_path": "__PATH_TO_EXAMPLE2__" + }, + { + "influenced_var": "update_to_new_subject", + "influencer_var": "another_case_holder", + "element_name": "update_to_new_subject", + "comment": "flow into recordUpdates via influence over update_to_new_subject in run mode SystemModeWithSharing", + "line_no": 102, + "source_text": "\n update_to_new_subject\n \n 50\n 355\n \n confirm_delete\n \n another_case_holder\n ", + "flow_path": "__PATH_TO_EXAMPLE2__" + } + ] + } + ] } } \ No newline at end of file diff --git a/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json new file mode 100644 index 00000000..5559e282 --- /dev/null +++ b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json @@ -0,0 +1,166 @@ +[ + { + "name": "CyclicSubflow", + "description": "This rule detects when a subflow calls a parent flow, creating a cyclic flow. The rule will detect cycles of any depth.", + "severityLevel": 1, + "tags": [ + "Performance", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "DbInLoop", + "description": "This rule detects when there are CRUD flow elements within a loop (RecordLookups, RecordCreates, RecordUpdates, RecordDeletes). This rule does not trigger if the CRUD element is in a fault handler. These DB operations should be bulkified by using collections and the \"IN\" condition. This rule does not follow subflows.", + "severityLevel": 2, + "tags": [ + "Recommended", + "Performance", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "DefaultCopy", + "description": "This rule detects default names and labels that were auto assigned to elements pasted elements in the flow builder UI. These labels and names should be changed to make the flow comprehensible to maintainers.", + "severityLevel": 3, + "tags": [ + "Recommended", + "CodeStyle", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "HardCodedId", + "description": "This rule detects hardcoded IDs within a flow. 
Hardcoded IDs are a bad practice, and such flows are not appropriate for distribution.", + "severityLevel": 3, + "tags": [ + "Recommended", + "BestPractices", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "MissingDescription", + "description": "This rule detects elements that contain labels but are missing descriptions. All elements with labels should have accompanying descriptions to make the flow comprehensible to future maintainers.", + "severityLevel": 4, + "tags": [ + "CodeStyle", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "MissingFaultHandler", + "description": "This rule detects when elements that can fire fault events are missing fault handlers. The rule currently detects Create Records, Update Records, Delete Records, Action Calls, and Subflows.", + "severityLevel": 2, + "tags": [ + "BestPractices", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "MissingNextValueConnector", + "description": "This rule detects Loops without nextValue connectors. Loops should always have nextValue connectors, and lack of one usually signifies developer error when connecting the loop element to other elements.", + "severityLevel": 1, + "tags": [ + "Recommended", + "BestPractices", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "PreventPassingUserDataIntoElementWithoutSharing", + "description": "Avoid passing user data into Flow Scanner elements in run mode: Without Sharing", + "severityLevel": 2, + "tags": [ + "Recommended", + "Security", + "XML" + ], + "resourceUrls": [ + "https://developer.salesforce.com/docs/platform/salesforce-code-analyzer/guide/rules-flow.html#preventpassinguserdataintoelementwithoutsharing" + ] + }, + { + "name": "PreventPassingUserDataIntoElementWithSharing", + "description": "Avoid passing user data into Flow Scanner elements in run mode: With Sharing", + "severityLevel": 4, + "tags": [ + "Recommended", + "Security", + "XML" + ], + "resourceUrls": [ + "https://developer.salesforce.com/docs/platform/salesforce-code-analyzer/guide/rules-flow.html#preventpassinguserdataintoelementwithsharing" + ] + }, + { + "name": "SameRecorUpdate", + "description": "This rule detects when an AfterSave record trigger modifies the same record. Record modifications should be done in BeforeSave triggers, not AfterSave triggers. This rule follows subflows, so it will detect if the RecordId is passed to a child flow which then modifies a record with that id.", + "severityLevel": 3, + "tags": [ + "Recommended", + "Security", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "TriggerCallout", + "description": "This rule detects when a trigger performs a callout on the synchronous path. Triggers must be performant and may only contain callouts on async scheduled paths. This rule follows subflows.", + "severityLevel": 2, + "tags": [ + "Recommended", + "Performance", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "TriggerEntryCriteria", + "description": "This rule detects when record trigger flows are missing entry criteria. All record trigger flows should have entry criteria specified in the flow trigger definition rather than solely in the flow's own business logic.", + "severityLevel": 2, + "tags": [ + "Recommended", + "Performance", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "TriggerWaitEvent", + "description": "This rule detects when a wait event is reached during trigger execution. Triggers must be performant and cannot contain wait events. For async processing, use scheduled paths within your trigger and async callouts, not wait events. 
This rule follows subflows.", + "severityLevel": 2, + "tags": [ + "Recommended", + "Performance", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "UnreachableElement", + "description": "This rule identifies elements that have not been connected to the start element of the flow. Unreachable elements are usually due to incomplete flows or developer error.", + "severityLevel": 4, + "tags": [ + "BestPractices", + "XML" + ], + "resourceUrls": [] + }, + { + "name": "UnusedResource", + "description": "This rule detects redundant variables that are not used in the flow. This can be a sign of developer error.", + "severityLevel": 3, + "tags": [ + "BestPractices", + "XML" + ], + "resourceUrls": [] + } +] \ No newline at end of file From 61ba8ca797dead2bdd2bc4357327dce5d5bd0114 Mon Sep 17 00:00:00 2001 From: Stephen Carter Date: Fri, 24 Oct 2025 13:50:40 -0400 Subject: [PATCH 2/3] Review feedback --- packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts | 2 +- .../test/test-data/goldfiles/all_rules.goldfile.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts b/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts index 9ba41436..124e5168 100644 --- a/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts +++ b/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts @@ -12,7 +12,7 @@ enum RuleName { MissingNextValueConnector = 'MissingNextValueConnector', PreventPassingUserDataIntoElementWithoutSharing = 'PreventPassingUserDataIntoElementWithoutSharing', PreventPassingUserDataIntoElementWithSharing = 'PreventPassingUserDataIntoElementWithSharing', - SameRecordUpdate = 'SameRecorUpdate', + SameRecordUpdate = 'SameRecordUpdate', TriggerCallout = 'TriggerCallout', TriggerEntryCriteria = 'TriggerEntryCriteria', TriggerWaitEvent = 'TriggerWaitEvent', diff --git a/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json index 5559e282..89aa1027 100644 --- a/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json +++ b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json @@ -100,7 +100,7 @@ ] }, { - "name": "SameRecorUpdate", + "name": "SameRecordUpdate", "description": "This rule detects when an AfterSave record trigger modifies the same record. Record modifications should be done in BeforeSave triggers, not AfterSave triggers. 
This rule follows subflows, so it will detect if the RecordId is passed to a child flow which then modifies a record with that id.", "severityLevel": 3, "tags": [ From a11864f85b8f962f8afeff9e2f2734318e87d2b2 Mon Sep 17 00:00:00 2001 From: Stephen Carter Date: Fri, 24 Oct 2025 14:29:18 -0400 Subject: [PATCH 3/3] Review feedback --- packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts | 2 +- .../test/test-data/goldfiles/all_rules.goldfile.json | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts b/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts index 124e5168..ecda68f1 100644 --- a/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts +++ b/packages/code-analyzer-flow-engine/src/hardcoded-catalog.ts @@ -25,7 +25,7 @@ const RULE_DESCRIPTIONS: RuleDescription[] = [ name: RuleName.CyclicSubflow, description: getMessage('CyclicSubflowRuleDescription'), severityLevel: SeverityLevel.Critical, - tags: [/* NOT RECOMMENDED */ COMMON_TAGS.CATEGORIES.PERFORMANCE, COMMON_TAGS.LANGUAGES.XML], + tags: [COMMON_TAGS.RECOMMENDED, COMMON_TAGS.CATEGORIES.PERFORMANCE, COMMON_TAGS.LANGUAGES.XML], resourceUrls: [] }, { diff --git a/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json index 89aa1027..5d4961f7 100644 --- a/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json +++ b/packages/code-analyzer-flow-engine/test/test-data/goldfiles/all_rules.goldfile.json @@ -4,6 +4,7 @@ "description": "This rule detects when a subflow calls a parent flow, creating a cyclic flow. The rule will detect cycles of any depth.", "severityLevel": 1, "tags": [ + "Recommended", "Performance", "XML" ],