diff --git a/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.dataflow_analysis.rst b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.dataflow_analysis.rst new file mode 100644 index 000000000..343287f28 --- /dev/null +++ b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.dataflow_analysis.rst @@ -0,0 +1,98 @@ +macaron.code\_analyzer.dataflow\_analysis package +================================================= + +.. automodule:: macaron.code_analyzer.dataflow_analysis + :members: + :show-inheritance: + :undoc-members: + +Submodules +---------- + +macaron.code\_analyzer.dataflow\_analysis.analysis module +--------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.analysis + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.bash module +----------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.bash + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.cmd\_parser module +------------------------------------------------------------ + +.. automodule:: macaron.code_analyzer.dataflow_analysis.cmd_parser + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.core module +----------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.core + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.evaluation module +----------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.evaluation + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.facts module +------------------------------------------------------ + +.. automodule:: macaron.code_analyzer.dataflow_analysis.facts + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.github module +------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.github + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.github\_expr module +------------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.github_expr + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.models module +------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.models + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.printing module +--------------------------------------------------------- + +.. automodule:: macaron.code_analyzer.dataflow_analysis.printing + :members: + :show-inheritance: + :undoc-members: + +macaron.code\_analyzer.dataflow\_analysis.run\_analysis\_standalone module +-------------------------------------------------------------------------- + +.. 
automodule:: macaron.code_analyzer.dataflow_analysis.run_analysis_standalone + :members: + :show-inheritance: + :undoc-members: diff --git a/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst index 6216f77e6..b46c0eac7 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.code_analyzer.rst @@ -6,13 +6,10 @@ macaron.code\_analyzer package :show-inheritance: :undoc-members: -Submodules ----------- +Subpackages +----------- -macaron.code\_analyzer.call\_graph module ------------------------------------------ +.. toctree:: + :maxdepth: 1 -.. automodule:: macaron.code_analyzer.call_graph - :members: - :show-inheritance: - :undoc-members: + macaron.code_analyzer.dataflow_analysis diff --git a/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst b/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst index 63ad1a5e9..3dad1ee97 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.parsers.rst @@ -33,6 +33,14 @@ macaron.parsers.bashparser module :show-inheritance: :undoc-members: +macaron.parsers.bashparser\_model module +---------------------------------------- + +.. automodule:: macaron.parsers.bashparser_model + :members: + :show-inheritance: + :undoc-members: + macaron.parsers.github\_workflow\_model module ---------------------------------------------- diff --git a/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst b/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst index d745c347f..67b6da97f 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.slsa_analyzer.ci_service.github_actions.rst @@ -9,14 +9,6 @@ macaron.slsa\_analyzer.ci\_service.github\_actions package Submodules ---------- -macaron.slsa\_analyzer.ci\_service.github\_actions.analyzer module ------------------------------------------------------------------- - -.. automodule:: macaron.slsa_analyzer.ci_service.github_actions.analyzer - :members: - :show-inheritance: - :undoc-members: - macaron.slsa\_analyzer.ci\_service.github\_actions.github\_actions\_ci module ----------------------------------------------------------------------------- diff --git a/golang/cmd/bashexprparser/bashexprparser.go b/golang/cmd/bashexprparser/bashexprparser.go new file mode 100644 index 000000000..3a55db7d2 --- /dev/null +++ b/golang/cmd/bashexprparser/bashexprparser.go @@ -0,0 +1,59 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/oracle/macaron/golang/internal/bashparser" + "github.com/oracle/macaron/golang/internal/filewriter" +) + +// Parse the bash expression and provide parsed objects in JSON format to stdout or a file. +// Params: +// +// -input : the bash expr content in string +// -output : the output file path to store the JSON content +// +// Return code: +// +// 0 - Parse successfully, return the JSON as string to stdout. If -output is set, store the json content to the file. 
+//	    If there are any errors storing the result to the file, it is still printed
+//	    to stdout, but the errors are written to stderr and the program exits with code 1.
+//	1 - Error: Missing bash expression input, or the JSON content could not be stored in the output file.
+//	2 - Error: Could not parse the bash expression. Parse errors will be printed to stderr.
+func main() {
+	input := flag.String("input", "", "The bash expr content to be parsed.")
+	out_path := flag.String("output", "", "The output file path to store the JSON content.")
+	flag.Parse()
+
+	var json_content string
+	var parse_err error
+	if len(*input) <= 0 {
+		fmt.Fprintln(os.Stderr, "Missing bash expr input.")
+		flag.PrintDefaults()
+		os.Exit(1)
+	} else {
+		// Read the bash expression from the command line argument.
+		json_content, parse_err = bashparser.ParseExpr(*input)
+	}
+
+	if parse_err != nil {
+		fmt.Fprintln(os.Stderr, parse_err.Error())
+		os.Exit(2)
+	}
+
+	fmt.Println(json_content)
+
+	if len(*out_path) > 0 {
+		err := filewriter.StoreBytesToFile([]byte(json_content), *out_path)
+		if err != nil {
+			fmt.Fprintln(os.Stderr, err.Error())
+			os.Exit(1)
+		}
+	}
+
+	os.Exit(0)
+}
diff --git a/golang/cmd/bashparser/bashparser.go b/golang/cmd/bashparser/bashparser.go
index ed598ea28..50cc6fec2 100644
--- a/golang/cmd/bashparser/bashparser.go
+++ b/golang/cmd/bashparser/bashparser.go
@@ -1,4 +1,4 @@
-/* Copyright (c) 2022 - 2023, Oracle and/or its affiliates. All rights reserved. */
+/* Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. */
 /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
 
 package main
@@ -29,13 +29,14 @@ func main() {
 	file_path := flag.String("file", "", "The path of the bash script file.")
 	input := flag.String("input", "", "The bash script content to be parsed. Input is prioritized over file option.")
 	out_path := flag.String("output", "", "The output file path to store the JSON content.")
+	raw := flag.Bool("raw", false, "Return the raw parse tree.")
 	flag.Parse()
 
 	var json_content string
 	var parse_err error
 	if len(*input) > 0 {
 		// Read the bash script from command line argument.
-		json_content, parse_err = bashparser.ParseCommands(*input)
+		json_content, parse_err = bashparser.Parse(*input, *raw)
 	} else if len(*file_path) <= 0 {
 		fmt.Fprintln(os.Stderr, "Missing bash script input or file path.")
 		flag.PrintDefaults()
@@ -47,7 +48,7 @@
 			fmt.Fprintln(os.Stderr, read_err.Error())
 			os.Exit(1)
 		}
-		json_content, parse_err = bashparser.ParseCommands(string(data))
+		json_content, parse_err = bashparser.Parse(string(data), *raw)
 	}
 
 	if parse_err != nil {
diff --git a/golang/internal/bashparser/bashparser.go b/golang/internal/bashparser/bashparser.go
index a033e6f73..b88e43a6e 100644
--- a/golang/internal/bashparser/bashparser.go
+++ b/golang/internal/bashparser/bashparser.go
@@ -1,4 +1,4 @@
-/* Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. */
+/* Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. */
 /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
 
 // Package bashparser parses the bash scripts and provides parsed objects in JSON.
@@ -11,6 +11,7 @@ import (
 	"strings"
 
 	"mvdan.cc/sh/v3/syntax"
+	"mvdan.cc/sh/v3/syntax/typedjson"
 )
 
 // CMDResult is used to export the bash command results in JSON.
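The `ParseRaw` function added in the next hunk masks GitHub Actions `${{ ... }}` expressions before handing the script to the bash parser. A minimal Python sketch of that rewrite (the Go code compiles the same non-greedy pattern; the function name here is illustrative):

```python
import re

# Non-greedy: each match runs from a "${{" to the nearest following "}}".
GH_EXPR = re.compile(r"\$\{\{.*?\}\}")

def mask_github_expressions(script: str) -> str:
    """Replace GitHub Actions expressions so a bash parser can tokenize the text."""
    return GH_EXPR.sub("$MACARON_UNKNOWN", script)

assert mask_github_expressions("echo ${{ foo }}") == "echo $MACARON_UNKNOWN"
# Nested expressions are only partially masked because matching is non-greedy:
assert mask_github_expressions("${{ ${{ foo }} }}") == "$MACARON_UNKNOWN }}"
```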
@@ -68,3 +69,67 @@ func ParseCommands(data string) (string, error) {
 
 	return string(result_bytes), nil
 }
+
+// ParseRaw parses the bash script content and returns the full parse tree encoded as typed JSON.
+func ParseRaw(data string) (string, error) {
+	// Replace GitHub Actions expressions with the ``$MACARON_UNKNOWN`` variable because the bash parser
+	// doesn't recognize such expressions. For example: ``${{ foo }}`` will be replaced by ``$MACARON_UNKNOWN``.
+	// Note that we don't use greedy matching, so if we have ``${{ ${{ foo }} }}``, it will not be replaced by
+	// a single ``$MACARON_UNKNOWN``.
+	// See: https://docs.github.com/en/actions/learn-github-actions/expressions.
+	var re, reg_error = regexp.Compile(`\$\{\{.*?\}\}`)
+	if reg_error != nil {
+		return "", reg_error
+	}
+
+	// We replace the GH Actions variables with "$MACARON_UNKNOWN".
+	data = string(re.ReplaceAll([]byte(data), []byte("$$MACARON_UNKNOWN")))
+	data_str := strings.NewReader(data)
+	data_parsed, parse_err := syntax.NewParser().Parse(data_str, "")
+	if parse_err != nil {
+		return "", parse_err
+	}
+
+	b := new(strings.Builder)
+	encode_err := typedjson.Encode(b, data_parsed)
+	if encode_err != nil {
+		return "", encode_err
+	}
+
+	return b.String(), nil
+}
+
+// Parse parses the bash script content, returning the raw parse tree if raw is true
+// and the extracted commands otherwise.
+func Parse(data string, raw bool) (string, error) {
+	if raw {
+		return ParseRaw(data)
+	}
+	return ParseCommands(data)
+}
+
+// ParseExpr parses a bash expression (a sequence of shell words) and returns the
+// parsed words encoded as a typed JSON array.
+func ParseExpr(data string) (string, error) {
+	data_str := strings.NewReader(data)
+	result_str := "["
+	first := true
+	for word_parsed, parse_err := range syntax.NewParser().WordsSeq(data_str) {
+		if parse_err != nil {
+			return "", parse_err
+		}
+		b := new(strings.Builder)
+		encode_err := typedjson.Encode(b, word_parsed)
+		if encode_err != nil {
+			return "", encode_err
+		}
+		if first {
+			result_str = result_str + b.String()
+			first = false
+		} else {
+			result_str = result_str + ", " + b.String()
+		}
+	}
+	result_str = result_str + "]"
+	return result_str, nil
+}
diff --git a/pyproject.toml b/pyproject.toml
index c10eafcdd..ad9671f32 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,8 @@ dependencies = [
     "semgrep == 1.113.0",
     "email-validator >=2.2.0,<3.0.0",
     "rich >=13.5.3,<15.0.0",
+    "lark >=1.3.0,<2.0.0",
+    "frozendict >=2.4.6,<3.0.0",
 ]
 keywords = []
 # https://pypi.org/classifiers/
diff --git a/src/macaron/code_analyzer/call_graph.py b/src/macaron/code_analyzer/call_graph.py
deleted file mode 100644
index 1f3be3fac..000000000
--- a/src/macaron/code_analyzer/call_graph.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
-# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
-
-"""This module contains classes to generate build call graphs for the target repository."""
-
-from collections import deque
-from collections.abc import Iterable
-from typing import Any, Generic, TypeVar
-
-Node = TypeVar("Node", bound="BaseNode")
-# The documentation below for `TypeVar` is commented out due to a breaking
-# change in Sphinx version (^=6.1.0).
-# Reported at: https://github.com/oracle/macaron/issues/58.
-# """This binds type ``Node`` to ``BaseNode`` and any of its subclasses.
-
-# Therefore, any node of type ``Node`` that is stored in the call graph
-# container will be a subtype of ``BaseNode``.
-# """
-
-
-class BaseNode(Generic[Node]):
-    """This is the generic class for call graph nodes."""
-
-    def __init__(self, caller: Node | None = None, node_id: str | None = None) -> None:
-        """Initialize instance.
-
-        Parameters
-        ----------
-        caller: Node | None
-            The caller node.
- node_id: str | None - The unique identifier of a node in the callgraph. - """ - self.callee: list[Node] = [] - self.caller: Node | None = caller - # Each node can have a model that summarizes certain properties for static analysis. - # By default this model is set to None. - self.model: Any = None - self.node_id = node_id - - def add_callee(self, node: Node) -> None: - """Add a callee to the current node. - - Parameters - ---------- - node : Node - The callee node. - """ - self.callee.append(node) - - def has_callee(self) -> bool: - """Check if the current node has callees. - - Returns - ------- - bool - Return False if there are no callees, otherwise True. - """ - return bool(self.callee) - - -class CallGraph(Generic[Node]): - """This is the generic class for creating a call graph.""" - - def __init__(self, root: Node, repo_path: str) -> None: - """Initialize instance. - - Parameters - ---------- - root : Node - The root call graph node. - repo_path : str - The path to the repo. - """ - self.root = root - self.repo_path = repo_path - - def get_root(self) -> Node: - """Get the root node in the call graph. - - Returns - ------- - Node - The root node. - """ - return self.root - - def bfs(self) -> Iterable[Node]: - """Traverse the call graph in breadth first search order. - - Yields - ------ - Node - The traversed nodes. - """ - queue: deque[Node] = deque() - queue.extend(self.root.callee) - visited = [] - while queue: - node = queue.popleft() - if node not in visited: - queue.extend(node.callee) - visited.append(node) - yield node diff --git a/src/macaron/code_analyzer/dataflow_analysis/__init__.py b/src/macaron/code_analyzer/dataflow_analysis/__init__.py new file mode 100644 index 000000000..8e17a3508 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. diff --git a/src/macaron/code_analyzer/dataflow_analysis/analysis.py b/src/macaron/code_analyzer/dataflow_analysis/analysis.py new file mode 100644 index 000000000..6f7c3f35f --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/analysis.py @@ -0,0 +1,469 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Entry points to perform and use the dataflow analysis.""" + +from __future__ import annotations + +from collections.abc import Iterable + +from macaron.code_analyzer.dataflow_analysis import bash, core, evaluation, facts, github, printing +from macaron.errors import CallGraphError +from macaron.parsers import actionparser, github_workflow_model +from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool, BuildToolCommand + + +def analyse_github_workflow_file(workflow_path: str, repo_path: str | None, dump_debug: bool = False) -> core.Node: + """Perform dataflow analysis for GitHub Actions Workflow file. + + Parameters + ---------- + workflow_path: str + The path to workflow file. + repo_path: str | None + The path to the repo. + dump_debug: bool + Whether to output debug dot file (in the current working directory). + + Returns + ------- + core.Node + Graph representation of workflow and analysis results. 
+ """ + workflow = actionparser.parse(workflow_path) + + analysis_context = core.OwningContextRef(core.AnalysisContext(repo_path)) + + core.reset_debug_sequence_number() + raw_workflow_node = github.RawGitHubActionsWorkflowNode.create(workflow, analysis_context, workflow_path) + core.increment_debug_sequence_number() + + raw_workflow_node.analyse() + + if dump_debug: + with open("analysis." + workflow_path.replace("/", "_") + ".dot", "w", encoding="utf-8") as f: + printing.print_as_dot_graph(raw_workflow_node, f, include_properties=True, include_states=True) + + return raw_workflow_node + + +def analyse_github_workflow( + workflow: github_workflow_model.Workflow, workflow_source_path: str, repo_path: str | None, dump_debug: bool = False +) -> core.Node: + """Perform dataflow analysis for GitHub Actions Workflow. + + Parameters + ---------- + workflow: github_workflow_model.Workflow + The workflow. + workflow_path: str + The source path for the workflow. + repo_path: str | None + The path to the repo. + dump_debug: bool + Whether to output debug dot file (in the current working directory). + + Returns + ------- + core.Node + Graph representation of workflow and analysis results. + """ + analysis_context = core.OwningContextRef(core.AnalysisContext(repo_path)) + + core.reset_debug_sequence_number() + raw_workflow_node = github.RawGitHubActionsWorkflowNode.create(workflow, analysis_context, workflow_source_path) + core.increment_debug_sequence_number() + + raw_workflow_node.analyse() + + if dump_debug: + with open("analysis." + workflow_source_path.replace("/", "_") + ".dot", "w", encoding="utf-8") as f: + printing.print_as_dot_graph(raw_workflow_node, f, include_properties=True, include_states=True) + + return raw_workflow_node + + +def analyse_bash_script( + bash_content: str, source_path: str, repo_path: str | None, dump_debug: bool = False +) -> core.Node: + """Perform dataflow analysis for Bash script. + + Parameters + ---------- + bash_content: str + The Bash script content. + source_path: str + The source path for the Bash script. + repo_path: str | None + The path to the repo. + dump_debug: bool + Whether to output debug dot file (in the current working directory). + + Returns + ------- + core.Node + Graph representation of Bash script and analysis results. + """ + analysis_context = core.OwningContextRef(core.AnalysisContext(repo_path)) + bash_context = core.OwningContextRef(bash.BashScriptContext.create_in_isolation(analysis_context, source_path)) + core.reset_debug_sequence_number() + bash_node = bash.RawBashScriptNode(facts.StringLiteral(bash_content), bash_context) + core.increment_debug_sequence_number() + + bash_node.analyse() + + if dump_debug: + with open( + "analysis." + source_path.replace("/", "_") + "." + str(hash(bash_content)) + ".dot", "w", encoding="utf-8" + ) as f: + printing.print_as_dot_graph(bash_node, f, include_properties=True, include_states=True) + + return bash_node + + +# TODO generalise visitors +class FindSecretsVisitor: + """Visitor to find references to GitHub secrets in analysis expressions.""" + + #: Scope in which secrets may be found + workflow_var_scope: facts.Scope + #: Found secret variable names, populated by running the visitor + secrets: set[str] + + def __init__(self, workflow_var_scope: facts.Scope) -> None: + """Construct a visitor to find secrets. 
+
+        Parameters
+        ----------
+        workflow_var_scope: facts.Scope
+            Scope in which secrets may be found.
+        """
+        self.workflow_var_scope = workflow_var_scope
+        self.secrets = set()
+
+    def visit_value(self, value: facts.Value) -> None:
+        """Search value expression for secrets."""
+        match value:
+            case facts.StringLiteral(_):
+                return
+            case facts.Read(loc):
+                self.visit_location(loc)
+                if evaluation.scope_matches(loc.scope, self.workflow_var_scope):
+                    match loc.loc:
+                        case facts.Variable(facts.StringLiteral(name)):
+                            if name.startswith("secrets."):
+                                self.secrets.add(name[len("secrets.") :])
+                return
+            case facts.ArbitraryNewData(_):
+                return
+            case facts.UnaryStringOp(_, operand):
+                self.visit_value(operand)
+                return
+            case facts.BinaryStringOp(_, operand1, operand2):
+                self.visit_value(operand1)
+                self.visit_value(operand2)
+                return
+            case facts.ParameterPlaceholderValue(name):
+                return
+            case facts.InstalledPackage(name, version, distribution, url):
+                self.visit_value(name)
+                self.visit_value(version)
+                self.visit_value(distribution)
+                self.visit_value(url)
+                return
+            case facts.Symbolic(sym_val):
+                self.visit_value(sym_val)
+                return
+        raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__)
+
+    def visit_location(self, location: facts.Location) -> None:
+        """Search location expression for secrets."""
+        self.visit_location_specifier(location.loc)
+
+    def visit_location_specifier(self, location: facts.LocationSpecifier) -> None:
+        """Search location expression for secrets."""
+        match location:
+            case facts.Filesystem(path):
+                self.visit_value(path)
+                return
+            case facts.Variable(name):
+                self.visit_value(name)
+                return
+            case facts.Artifact(name, file):
+                self.visit_value(name)
+                self.visit_value(file)
+                return
+            case facts.FilesystemAnyUnderDir(path):
+                self.visit_value(path)
+                return
+            case facts.ArtifactAnyFilename(name):
+                self.visit_value(name)
+                return
+            case facts.ParameterPlaceholderLocation(name):
+                return
+            case facts.Console():
+                return
+            case facts.Installed(name):
+                self.visit_value(name)
+                return
+        raise CallGraphError("unknown location type: " + location.__class__.__name__)
+
+
+def get_reachable_secrets(bash_cmd_node: bash.BashSingleCommandNode) -> set[str]:
+    """Get GitHub secrets that are reachable at a bash command.
+
+    Parameters
+    ----------
+    bash_cmd_node: bash.BashSingleCommandNode
+        The target Bash command node.
+
+    Returns
+    -------
+    set[str]
+        The set of reachable secret variable names.
+    """
+    result: set[str] = set()
+    github_context = bash_cmd_node.context.ref.get_containing_github_context()
+    if github_context is None:
+        return result
+    env_scope = bash_cmd_node.context.ref.env.ref
+    workflow_var_scope = github_context.job_context.ref.workflow_context.ref.workflow_variables.ref
+
+    for loc, vals in bash_cmd_node.before_state.state.items():
+        if evaluation.scope_matches(env_scope, loc.scope):
+            for val in vals:
+                visitor = FindSecretsVisitor(workflow_var_scope)
+                visitor.visit_value(val)
+                result.update(visitor.secrets)
+
+    return result
+
+
+def get_containing_github_job(
+    node: core.Node, parents: dict[core.Node, core.Node]
+) -> github.GitHubActionsNormalJobNode | None:
+    """Return the GitHub job node containing the given node, if any.
+
+    Parameters
+    ----------
+    node: core.Node
+        The target node.
+    parents: dict[core.Node, core.Node]
+        The mapping of nodes to their parent nodes.
+
+    Returns
+    -------
+    github.GitHubActionsNormalJobNode | None
+        The containing job node, or None if there is no containing job.
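+
+    Notes
+    -----
+    The walk stops at the first enclosing workflow node, so a node that is not
+    nested inside a job returns None.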
+ """ + caller_node: core.Node | None = parents.get(node) + while caller_node is not None: + match caller_node: + case github.GitHubActionsWorkflowNode(): + break + case github.GitHubActionsNormalJobNode(): + return caller_node + + caller_node = parents.get(caller_node) + + return None + + +def get_containing_github_step( + node: core.Node, parents: dict[core.Node, core.Node] +) -> github.GitHubActionsRunStepNode | None: + """Return the GitHub step node containing the given node, if any. + + Parameters + ---------- + node: core.Node + The target node. + parents: dict[core.Node, code.Node] + The mapping of nodes to their parent nodes. + + Returns + ------- + github.GitHubActionsRunStepNode | None + The containing step node, or None if there is no containing step. + """ + caller_node: core.Node | None = parents.get(node) + while caller_node is not None: + match caller_node: + case github.GitHubActionsWorkflowNode(): + break + case github.GitHubActionsNormalJobNode(): + break + case github.GitHubActionsRunStepNode(): + return caller_node + + caller_node = parents.get(caller_node) + + return None + + +def get_containing_github_workflow( + node: core.Node, parents: dict[core.Node, core.Node] +) -> github.GitHubActionsWorkflowNode | None: + """Return the GitHub workflow node containing the given node, if any. + + Parameters + ---------- + node: core.Node + The target node. + parents: dict[core.Node, code.Node] + The mapping of nodes to their parent nodes. + + Returns + ------- + github.GitHubActionsWorkflowNode | None + The containing workflow node, or None if there is no containing workflow. + """ + caller_node: core.Node | None = parents.get(node) + while caller_node is not None: + match caller_node: + case github.GitHubActionsWorkflowNode(): + return caller_node + + caller_node = parents.get(caller_node) + + return None + + +def _get_build_tool_commands(nodes: core.NodeForest, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: + """Traverse the callgraph and find all the reachable build tool commands.""" + for root in nodes.root_nodes: + for node in core.traverse_bfs(root): + # We are just interested in nodes that have bash commands. + if isinstance(node, bash.BashSingleCommandNode): + # We collect useful contextual information for the called BashNode. + # The GitHub Actions workflow that triggers the path in the callgraph. + workflow_node = None + # The step in GitHub Actions job that triggers the path in the callgraph. + step_node = None + + # Walk up the callgraph to find the relevant caller nodes. + # In GitHub Actions a `GitHubWorkflowNode` may call several `GitHubJobNode`s + # and a `GitHubJobNode` may call several steps, which can be external `GitHubWorkflowNode` + # or inlined run nodes. + # TODO: revisit this implementation if analysis of external workflows is supported in + # the future, and decide if setting the caller workflow and job nodes to the nodes in the + # main triggering workflow is still expected. + workflow_node = get_containing_github_workflow(node, nodes.parents) + step_node = get_containing_github_step(node, nodes.parents) + + # Find the bash commands that call the build tool. 
+                resolved_cmds = evaluation.evaluate(node, node.cmd)
+                resolved_args = [evaluation.evaluate(node, arg) if arg is not None else None for arg in node.args]
+
+                # TODO combinations
+
+                cmd = [evaluation.get_single_resolved_str_with_default(resolved_cmds, "$MACARON_UNKNOWN")] + [
+                    (
+                        evaluation.get_single_resolved_str_with_default(resolved_arg, "$MACARON_UNKNOWN")
+                        if resolved_arg is not None
+                        else "$MACARON_UNKNOWN"
+                    )
+                    for resolved_arg in resolved_args
+                ]
+
+                if build_tool.is_build_command(cmd):
+                    evaluated_installed_languages = evaluation.evaluate(
+                        node,
+                        facts.Read(
+                            facts.Location(
+                                node.context.ref.filesystem.ref,
+                                facts.Installed(facts.StringLiteral(build_tool.language)),
+                            )
+                        ),
+                    )
+                    evaluated_installed_languages = evaluation.filter_symbolic_values(evaluated_installed_languages)
+
+                    lang_versions = []
+                    lang_distributions = []
+                    lang_urls = []
+
+                    for evaluated_installed_language in evaluated_installed_languages:
+                        if isinstance(evaluated_installed_language[0], facts.InstalledPackage):
+                            if isinstance(evaluated_installed_language[0].version, facts.StringLiteral):
+                                lang_version_str = evaluated_installed_language[0].version.literal
+                                if lang_version_str not in lang_versions:
+                                    lang_versions.append(lang_version_str)
+                            if isinstance(evaluated_installed_language[0].distribution, facts.StringLiteral):
+                                lang_distribution_str = evaluated_installed_language[0].distribution.literal
+                                if lang_distribution_str not in lang_distributions:
+                                    lang_distributions.append(lang_distribution_str)
+                            if isinstance(evaluated_installed_language[0].url, facts.StringLiteral):
+                                lang_url_str = evaluated_installed_language[0].url.literal
+                                if lang_url_str not in lang_urls:
+                                    lang_urls.append(lang_url_str)
+
+                    lang_url = lang_urls[0] if len(lang_urls) > 0 else ""
+
+                    lang_versions = sorted(lang_versions)
+                    lang_distributions = sorted(lang_distributions)
+                    lang_urls = sorted(lang_urls)
+
+                    yield BuildToolCommand(
+                        ci_path=(
+                            workflow_node.context.ref.source_filepath
+                            if workflow_node is not None
+                            else node.context.ref.source_filepath
+                        ),
+                        command=cmd,
+                        step_node=step_node,
+                        language=build_tool.language,
+                        language_versions=lang_versions,
+                        language_distributions=lang_distributions,
+                        language_url=lang_url,
+                        reachable_secrets=list(get_reachable_secrets(node)),
+                        events=get_ci_events_from_workflow(workflow_node.definition) if workflow_node else [],
+                    )
+
+
+def get_build_tool_commands(nodes: core.NodeForest, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
+    """Traverse the callgraph and find all the reachable build tool commands.
+
+    The build tool command objects are returned in sorted order to allow deterministic
+    behavior; they are sorted based on the string representation of each command object.
+
+    Parameters
+    ----------
+    nodes: core.NodeForest
+        The callgraph reachable from the CI workflows.
+    build_tool: BaseBuildTool
+        The corresponding build tool for which shell commands need to be detected.
+
+    Returns
+    -------
+    Iterable[BuildToolCommand]
+        The objects that contain the build commands as well as useful contextual information.
+    """
+    return sorted(_get_build_tool_commands(nodes, build_tool), key=str)
+
+
+def get_ci_events_from_workflow(workflow: github_workflow_model.Workflow) -> list[str]:
+    """Get the CI events that trigger the GitHub Actions workflow.
+
+    Parameters
+    ----------
+    workflow: github_workflow_model.Workflow
+        The target GitHub Actions workflow.
+
+    Returns
+    -------
+    list[str]
+        The list of event names.
+ """ + result: list[str] = [] + on = workflow["on"] + if isinstance(on, str): + result.append(on) + elif isinstance(on, list): + for hook in on: + result.append(hook) + else: + for key in on: + result.append(key) + + return result diff --git a/src/macaron/code_analyzer/dataflow_analysis/bash.py b/src/macaron/code_analyzer/dataflow_analysis/bash.py new file mode 100644 index 000000000..f350448a5 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/bash.py @@ -0,0 +1,1891 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Dataflow analysis implementation for analysing Bash shell scripts.""" + +from __future__ import annotations + +import json +import os.path +from collections import defaultdict +from collections.abc import Callable, Iterator +from dataclasses import dataclass +from itertools import product +from typing import cast + +from macaron import MACARON_PATH +from macaron.code_analyzer.dataflow_analysis import core, evaluation, facts, github, models, printing +from macaron.errors import CallGraphError, ParseError +from macaron.parsers import bashparser, bashparser_model + + +class BashExit(core.ExitType): + """Exit type for Bash exit statement.""" + + def __hash__(self) -> int: + return 37199 + + def __eq__(self, other: object) -> bool: + return isinstance(other, BashExit) + + +# Convenience instance of BashExit. +BASH_EXIT = BashExit() + + +class BashReturn(core.ExitType): + """Exit type for returning from a Bash function.""" + + def __hash__(self) -> int: + return 91193 + + def __eq__(self, other: object) -> bool: + return isinstance(other, BashReturn) + + +# Convenience instance of BashReturn. +BASH_RETURN = BashReturn() + + +@dataclass(frozen=True) +class BashScriptContext(core.Context): + """Context for a Bash script.""" + + #: Outer context, which may be a GitHub run step, another Bash script + #: that ran this script, or just the outermost analysis context if analysing + #: the script in isolation. + outer_context: ( + core.ContextRef[github.GitHubActionsStepContext] + | core.ContextRef[BashScriptContext] + | core.ContextRef[core.AnalysisContext] + ) + #: Scope for filesystem used by the script. + filesystem: core.ContextRef[facts.Scope] + #: Scope for env variables within the script. + env: core.ContextRef[facts.Scope] + #: Scope for defined functions within the script. + func_decls: core.ContextRef[facts.Scope] + #: Scope for the stdin attached to the Bash process. + stdin_scope: core.ContextRef[facts.Scope] + #: Location for the stdin attached to the Bash process. + stdin_loc: facts.LocationSpecifier + #: Scope for the stdout attached to the Bash process. + stdout_scope: core.ContextRef[facts.Scope] + #: Location for the stdout attached to the Bash process. + stdout_loc: facts.LocationSpecifier + #: Filepath for Bash script file. + source_filepath: str + + @staticmethod + def create_from_run_step( + context: core.ContextRef[github.GitHubActionsStepContext], source_filepath: str + ) -> BashScriptContext: + """Create a new Bash script context (for being called from a GitHub step) and its associated scopes. + + Reuses the filesystem and stdout scopes from the outer context, env scope inherits from the outer scope. + + Parameters + ---------- + context: core.ContextRef[github.GitHubActionsStepContext] + Outer step context. + source_filepath: str + Filepath of Bash script file. 
+ + Returns + ------- + BashScriptContext + The new Bash script context. + """ + return BashScriptContext( + context.get_non_owned(), + context.ref.job_context.ref.filesystem.get_non_owned(), + core.OwningContextRef(facts.Scope("env", context.ref.env.ref)), + core.OwningContextRef(facts.Scope("func_decls")), + stdin_scope=core.OwningContextRef(facts.Scope("stdin")), + stdin_loc=facts.Console(), + stdout_scope=context.ref.job_context.ref.workflow_context.ref.console.get_non_owned(), + stdout_loc=facts.Console(), + source_filepath=source_filepath, + ) + + @staticmethod + def create_from_bash_script(context: core.ContextRef[BashScriptContext], source_filepath: str) -> BashScriptContext: + """Create a new Bash script context (for being called from another Bash script) and its associated scopes. + + Reuses the filesystem, stdin, and stdout scopes from the outer context, env scope inherits from the outer context. + + Parameters + ---------- + context: core.ContextRef[BashScriptContext] + Outer Bash script context. + source_filepath: str + Filepath of Bash script file. + + Returns + ------- + BashScriptContext + The new Bash script context. + """ + return BashScriptContext( + context.get_non_owned(), + context.ref.filesystem.get_non_owned(), + core.OwningContextRef(facts.Scope("env", context.ref.env.ref)), + core.OwningContextRef(facts.Scope("func_decls")), + stdin_scope=context.ref.stdin_scope.get_non_owned(), + stdin_loc=facts.Console(), + stdout_scope=context.ref.stdout_scope.get_non_owned(), + stdout_loc=facts.Console(), + source_filepath=source_filepath, + ) + + @staticmethod + def create_in_isolation(context: core.ContextRef[core.AnalysisContext], source_filepath: str) -> BashScriptContext: + """Create a new Bash script context (for being analysed in isolation) and its associated scopes. + + Parameters + ---------- + context: core.ContextRef[core.AnalysisContext] + Outer analysis context. + source_filepath: str + Filepath of Bash script file. + + Returns + ------- + BashScriptContext + The new Bash script context. 
+ """ + return BashScriptContext( + context.get_non_owned(), + core.OwningContextRef(facts.Scope("filesystem")), + core.OwningContextRef(facts.Scope("env")), + core.OwningContextRef(facts.Scope("func_decls")), + stdin_scope=core.OwningContextRef(facts.Scope("stdin")), + stdin_loc=facts.Console(), + stdout_scope=core.OwningContextRef(facts.Scope("stdout")), + stdout_loc=facts.Console(), + source_filepath=source_filepath, + ) + + def with_stdin( + self, stdin_scope: core.ContextRef[facts.Scope], stdin_loc: facts.LocationSpecifier + ) -> BashScriptContext: + """Return a modified bash script context with the given stdin.""" + return BashScriptContext( + self.outer_context, + self.filesystem, + self.env, + self.func_decls, + stdin_scope, + stdin_loc, + self.stdout_scope, + self.stdout_loc, + self.source_filepath, + ) + + def with_stdout( + self, stdout_scope: core.ContextRef[facts.Scope], stdout_loc: facts.LocationSpecifier + ) -> BashScriptContext: + """Return a modified bash script context with the given stdout.""" + return BashScriptContext( + self.outer_context, + self.filesystem, + self.env, + self.func_decls, + self.stdin_scope, + self.stdin_loc, + stdout_scope, + stdout_loc, + self.source_filepath, + ) + + def get_containing_github_context(self) -> github.GitHubActionsStepContext | None: + """Return the (possibly transitive) containing GitHub step context, if there is one.""" + outer_context = self.outer_context.ref + while isinstance(outer_context, BashScriptContext): + outer_context = outer_context.outer_context.ref + + if isinstance(outer_context, github.GitHubActionsStepContext): + return outer_context + return None + + def get_containing_analysis_context(self) -> core.AnalysisContext: + """Return the (possibly transitive) containing analysis context.""" + outer_context = self.outer_context.ref + while isinstance(outer_context, BashScriptContext): + outer_context = outer_context.outer_context.ref + + if isinstance(outer_context, github.GitHubActionsStepContext): + return outer_context.job_context.ref.workflow_context.ref.analysis_context.ref + + return outer_context + + def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]: + """Yield the direct references of the context, either to scopes or to other contexts.""" + yield self.outer_context + yield self.filesystem + yield self.env + yield self.func_decls + yield self.stdin_scope + yield self.stdout_scope + + +class RawBashScriptNode(core.InterpretationNode): + """Interpretation node representing a Bash script (with the script as an unparsed string value). + + Defines how to resolve and parse the Bash script content and generate the analysis representation. + """ + + #: Value for Bash script content (as a string). + script: facts.Value + #: Bash script context. + context: core.ContextRef[BashScriptContext] + + def __init__(self, script: facts.Value, context: core.ContextRef[BashScriptContext]) -> None: + """Initialize Bash script node. + + Parameters + ---------- + script: facts.Value + Value for Bash script content (as a string). + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.script = script + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the Bash script to resolve and parse the Bash script content and generate the analysis representation.""" + if isinstance(self.script, facts.StringLiteral): + script_str = self.script.literal + + def build_bash_script() -> core.Node: + try: + parsed_bash = bashparser.parse_raw(script_str, MACARON_PATH) + return BashScriptNode.create(parsed_bash, self.context.get_non_owned()) + except ParseError: + return core.NoOpStatementNode() + + return {"default": build_bash_script} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + +class BashScriptNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a Bash script. + + Control flow structure consists of a sequence of Bash statements. + Note that this can model complex control flow with branching, loops, etc. + because those control flow constructs will be statement nodes with their + own control flow nested within. + + Control flow that the cuts across multiple levels, such as an exit statement + within a if statement branch that would cause the entire script to exit + early, are modelled using the alternate exits mechanism (i.e. exit statement + creates a BashExit exit state, in the enclosing control-flow constructs the + successor of the BashExit exit of a child node will be an early BashExit exit + of that construct, and so on up until this node, where there will be a early + normal exit, and so the caller of this script would then proceed as normal after + the script exits). + """ + + #: Parsed Bash script AST. + definition: bashparser_model.File + #: Statement nodes in execution order. + stmts: list[BashStatementNode] + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.File, + stmts: list[BashStatementNode], + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash script node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.File + Parsed Bash script AST. + stmts: list[BashStatementNode] + Statement nodes in execution order. + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.definition = definition + self.stmts = stmts + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence(self.stmts) + + def children(self) -> Iterator[core.Node]: + """Yield the nodes in the sequence.""" + yield from self.stmts + + def get_entry(self) -> core.Node: + """Return the entry node, the first statement in the sequence.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns the next in the sequence or the exit in the case of the last node, or an + early exit in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {core.DEFAULT_EXIT} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create(script: bashparser_model.File, context: core.NonOwningContextRef[BashScriptContext]) -> BashScriptNode: + """Create Bash script node from Bash script AST. + + Parameters + ---------- + script: bashparser_model.File + Parsed Bash script AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + stmts = [BashStatementNode(stmt, context) for stmt in script["Stmts"]] + return BashScriptNode(script, stmts, context) + + +class BashBlockNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a Bash block. + + Control flow structure consists of a sequence of Bash statements. + """ + + #: Parsed block AST or list of statement ASTs. + definition: bashparser_model.Block | list[bashparser_model.Stmt] + #: Statement nodes in execution order. + stmts: list[BashStatementNode] + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.Block | list[bashparser_model.Stmt], + stmts: list[BashStatementNode], + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash block node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.Block | list[bashparser_model.Stmt] + Parsed block AST or list of statement ASTs. + stmts: list[BashStatementNode] + Statement nodes in execution order. + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.definition = definition + self.stmts = stmts + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence(self.stmts) + + def children(self) -> Iterator[core.Node]: + """Yield the nodes in the sequence.""" + yield from self.stmts + + def get_entry(self) -> core.Node: + """Return the entry node, the first statement in the sequence.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns the next in the sequence or the exit in the case of the last node, or a + propagated early exit of the same type in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if isinstance(self.definition, list): + if len(self.definition) > 0: + result["line num (in script)"] = {(None, str(self.definition[0]["Pos"]["Line"]))} + else: + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + script: bashparser_model.Block | list[bashparser_model.Stmt], + context: core.NonOwningContextRef[BashScriptContext], + ) -> BashBlockNode: + """Create Bash block node from block AST or list of statement ASTs. + + Parameters + ---------- + script: bashparser_model.Block | list[bashparser_model.Stmt] + Parsed block AST or list of statement ASTs. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + if isinstance(script, list): + stmts = [BashStatementNode(stmt, context) for stmt in script] + else: + stmts = [BashStatementNode(stmt, context) for stmt in script["Stmts"]] + return BashBlockNode(script, stmts, context) + + +class BashFuncCallNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a call to a Bash function. + + Control flow structure consists of a single block containing the function body. + """ + + #: The parsed AST of the callsite statement. + call_definition: bashparser_model.Stmt + #: The parsed AST of the function declaration. + func_definition: bashparser_model.FuncDecl + #: Node representing the function body. + block: BashBlockNode + #: Bash script context. + context: core.ContextRef[BashScriptContext] + + def __init__( + self, + call_definition: bashparser_model.Stmt, + func_definition: bashparser_model.FuncDecl, + block: BashBlockNode, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash function call node. + + Parameters + ---------- + call_definition: bashparser_model.Stmt + The parsed AST of the callsite statement. + func_definition: bashparser_model.FuncDecl + The parsed AST of the function declaration. + block: BashBlockNode + Node representing the function body. + context: core.ContextRef[BashScriptContext] + Bash script context. 
+ """ + super().__init__() + self.call_definition = call_definition + self.func_definition = func_definition + self.block = block + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence([self.block]) + + def children(self) -> Iterator[core.Node]: + """Yield the function body block node.""" + yield self.block + + def get_entry(self) -> core.Node: + """Return the function body block node.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns the next node in the sequence or the exit in the case of the last node, or an + early exit in the case of a BashReturn exit type, or a propagated early BashExit exit + in the case of a BashExit exit type. + """ + if isinstance(exit_type, BashReturn): + return {core.DEFAULT_EXIT} + if isinstance(exit_type, BashExit): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table. + + Contains the line number of the callsite, the line number of the function declaration, and the scopes. + """ + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.call_definition["Pos"]["Line"]))} + result["callee decl line num (in script)"] = {(None, str(self.func_definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + +def get_stdout_redirects(stmt: bashparser_model.Stmt, context: BashScriptContext) -> set[facts.Location]: + """Extract the stdout redirects specified on the statement as a set of location expressions.""" + redirs: set[facts.Location] = set() + for redir in stmt.get("Redirs", []): + if redir["Op"] in { + bashparser_model.RedirOperators.RdrOut.value, + bashparser_model.RedirOperators.RdrAll.value, + bashparser_model.RedirOperators.AppAll.value, + bashparser_model.RedirOperators.AppOut.value, + }: + if "Word" in redir: + redir_word = redir["Word"] + redir_val = convert_shell_word_to_value(redir_word, context) + if redir_val is not None: + redirs.add(facts.Location(context.filesystem.ref, facts.Filesystem(redir_val[0]))) + return redirs + + +class BashStatementNode(core.InterpretationNode): + """Interpretation node representing any kind of Bash statement. + + Defines how to interpret the different kinds of statements and generate the appropriate + analysis representation. + """ + + #: The parsed statement AST. + definition: bashparser_model.Stmt + #: Bash script context. 
context: core.ContextRef[BashScriptContext]
+
+    def __init__(self, definition: bashparser_model.Stmt, context: core.ContextRef[BashScriptContext]) -> None:
+        """Initialize statement node."""
+        super().__init__()
+        self.definition = definition
+        self.context = context
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret the different kinds of statements and generate the appropriate analysis representation."""
+        cmd = self.definition["Cmd"]
+        if (
+            bashparser_model.is_call_expr(cmd)
+            and len(cmd.get("Args", [])) == 0
+            and "Assigns" in cmd
+            and len(cmd["Assigns"]) == 1
+        ):
+            # Single variable assignment statement.
+            assign = cmd["Assigns"][0]
+
+            def build_assign() -> core.Node:
+                rhs_content = (
+                    parse_content(assign["Value"]["Parts"], True)
+                    if "Value" in assign
+                    else [LiteralOrEnvVar(is_env_var=False, literal="")]
+                )
+                if rhs_content is not None:
+                    rhs_val = convert_shell_value_sequence_to_fact_value(rhs_content, self.context.ref)
+                    return models.VarAssignNode(
+                        kind=models.VarAssignKind.BASH_ENV_VAR,
+                        var_scope=self.context.ref.env.ref,
+                        var_name=facts.StringLiteral(assign["Name"]["Value"]),
+                        value=rhs_val,
+                    )
+                return core.NoOpStatementNode()
+
+            return {"default": build_assign}
+        if bashparser_model.is_call_expr(cmd) and "Args" in cmd and len(cmd["Args"]) > 0:
+            # Statement executing a command: generate a node with an expression for the
+            # command name and one for each argument value.
+            # A word may tokenize into multiple words depending on its value. We attempt
+            # to resolve such words; where one resolves to something that tokenizes as
+            # multiple arguments, we generate alternative interpretations with the
+            # expanded argument list, alongside interpretations where the word is a
+            # dynamic expression constrained to be a single word.
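+            # For example, if FLAGS resolves to "-a -b", the single word $FLAGS
+            # expands to two arguments; both the expanded reading and the
+            # single-token reading are kept as alternative interpretations.
+            # (FLAGS is an illustrative name; any dynamic word is treated this way.)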
+ arg_vals = [convert_shell_word_to_value(arg, self.context.ref) for arg in cmd["Args"]] + multitoken_resolved_arg_vals: dict[ + int, list[tuple[list[bashparser_model.Word], evaluation.ReadBindings]] + ] = defaultdict(list) + + for index, arg_val_elem in enumerate(arg_vals): + if arg_val_elem is None: + continue + arg_val_elem_val, arg_quoted = arg_val_elem + if not arg_quoted: + resolved_arg_vals = evaluation.evaluate(self, arg_val_elem_val) + for resolved_arg_val, resolved_arg_val_bindings in resolved_arg_vals: + match resolved_arg_val: + case facts.StringLiteral(literal): + parsed_bash_expr = parse_bash_expr(literal) + if parsed_bash_expr is not None and len(parsed_bash_expr) > 1: + multitoken_resolved_arg_vals[index].append( + (parsed_bash_expr, resolved_arg_val_bindings) + ) + arg_indices_in_order: list[int] = [] + values_indices_in_order: list[list[int]] = [] + for index, vals in multitoken_resolved_arg_vals.items(): + arg_indices_in_order.append(index) + values_indices_in_order.append([index for index, _ in enumerate(vals)] + [-1]) + + # Cross product could become very expensive + values_product = list(product(*values_indices_in_order)) + + if len(values_product) == 0: + values_product = [()] + + result: dict[core.InterpretationKey, Callable[[], core.Node]] = {} + + for values_product_elem in values_product: + new_arg_vals: dict[int, list[facts.Value | None]] = {} + read_bindings_list: list[evaluation.ReadBindings] = [] + for arg_index, value_index in zip(arg_indices_in_order, values_product_elem): + if value_index != -1: + expanded_vals, bindings = multitoken_resolved_arg_vals[arg_index][value_index] + read_bindings_list.append(bindings) + converted = [ + convert_shell_word_to_value(expanded_val, self.context.ref) + for expanded_val in expanded_vals + ] + new_arg_vals[arg_index] = [x[0] if x is not None else None for x in converted] + else: + old_arg_val = arg_vals[arg_index] + new_arg_vals[arg_index] = [ + facts.SingleBashTokenConstraint(old_arg_val[0]) if old_arg_val is not None else None + ] + + combined_bindings = evaluation.ReadBindings.combine_bindings(read_bindings_list) + if combined_bindings is None: + continue + full_arg_list: list[facts.Value | None] = [] + + for index, arg_val in enumerate(arg_vals): + if index in new_arg_vals: + full_arg_list.extend(new_arg_vals[index]) + else: + full_arg_list.append(arg_val[0] if arg_val is not None else None) + + cmd_arg = full_arg_list[0] + # TODO subshells + if cmd_arg is not None: + cmd_arg_val = cmd_arg + + def build_single_cmd( # pylint: disable=dangerous-default-value + cmd_arg: facts.Value = cmd_arg_val, cmd_arg_list: list[facts.Value | None] = full_arg_list[1:] + ) -> core.Node: + stdout_redirs = get_stdout_redirects(self.definition, self.context.ref) + return BashSingleCommandNode( + self.definition, self.context.get_non_owned(), cmd_arg, cmd_arg_list, stdout_redirs + ) + + result[("cmd", values_product_elem, combined_bindings)] = build_single_cmd + return result + if bashparser_model.is_if_clause(cmd): + # If statement. + + def build_if() -> core.Node: + return BashIfClauseNode.create(cmd, self.context.get_non_owned()) + + return {"default": build_if} + + if bashparser_model.is_for_clause(cmd): + # For statement. 
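+            # Note: the loop body is analysed as either skipped or executed once; the
+            # loop back edge is not modelled (see the TODO on BashForClauseNode below).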
+
+            def build_for() -> core.Node:
+                return BashForClauseNode.create(cmd, self.context.get_non_owned())
+
+            return {"default": build_for}
+        if bashparser_model.is_binary_cmd(cmd):
+            match cmd["Op"]:
+                case bashparser_model.BinCmdOperators.Pipe.value:
+
+                    def build_pipe() -> core.Node:
+                        return BashPipeNode.create(cmd, self.context.get_non_owned())
+
+                    return {"default": build_pipe}
+                case bashparser_model.BinCmdOperators.PipeAll.value:
+                    pass
+                case bashparser_model.BinCmdOperators.AndStmt.value:
+
+                    def build_and() -> core.Node:
+                        return BashAndNode.create(cmd, self.context.get_non_owned())
+
+                    return {"default": build_and}
+                case bashparser_model.BinCmdOperators.OrStmt.value:
+
+                    def build_or() -> core.Node:
+                        return BashOrNode.create(cmd, self.context.get_non_owned())
+
+                    return {"default": build_or}
+            raise CallGraphError("unknown binary operator: " + str(cmd["Op"]))
+        if bashparser_model.is_func_decl(cmd):
+            # Represent a Bash function declaration as a store of the serialized function
+            # definition into a variable in the function decl scope.
+            func_decl_str = json.dumps(cmd)
+
+            def build_func_decl() -> core.Node:
+                return models.VarAssignNode(
+                    kind=models.VarAssignKind.BASH_FUNC_DECL,
+                    var_scope=self.context.ref.func_decls.ref,
+                    var_name=facts.StringLiteral(cmd["Name"]["Value"]),
+                    value=facts.StringLiteral(func_decl_str),
+                )
+
+            return {"default": build_func_decl}
+        if bashparser_model.is_block(cmd):
+
+            def build_block() -> core.Node:
+                return BashBlockNode.create(cmd, self.context.get_non_owned())
+
+            return {"default": build_block}
+
+        def build_noop() -> core.Node:
+            return core.NoOpStatementNode()
+
+        return {"default": build_noop}
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the line number and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))}
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+        return result
+
+
+class BashIfClauseNode(core.ControlFlowGraphNode):
+    """Control-flow-graph node representing a Bash if statement.
+
+    Control flow structure consists of executing the statements of the condition,
+    followed by a branch to execute either the then node or the else node (or, if
+    there is no else node, to exit immediately). The analysis is not path sensitive,
+    so both branches are always considered possible regardless of the condition.
+    """
+
+    #: Parsed if statement AST.
+    definition: bashparser_model.IfClause
+    #: Block node to execute the condition.
+    cond_stmts: BashBlockNode
+    #: Block node for the case where the condition is true.
+    then_stmts: BashBlockNode
+    #: Node for the case where the condition is false, if any
+    #: (will be another if node in the case of an elif).
+    else_stmts: BashBlockNode | BashIfClauseNode | None
+    #: Bash script context.
+    context: core.ContextRef[BashScriptContext]
+    #: Control flow graph.
+ _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.IfClause, + cond_stmts: BashBlockNode, + then_stmts: BashBlockNode, + else_stmts: BashBlockNode | BashIfClauseNode | None, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash if statement node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.IfClause + Parsed if statement AST. + cond_stmts: BashBlockNode + Block node to execute the condition. + then_stmts: BashBlockNode + Block node for the case where the condition is true. + else_stmts: BashBlockNode | BashIfClauseNode | None + Node for the case where the condition is false, if any + (will be another if node in the case of an elif). + context: core.ContextRef[BashScriptContext] + Bash script context. + """ + super().__init__() + self.definition = definition + self.cond_stmts = cond_stmts + self.then_stmts = then_stmts + self.else_stmts = else_stmts + self.context = context + + self._cfg = core.ControlFlowGraph(self.cond_stmts) + self._cfg.add_successor(self.cond_stmts, core.DEFAULT_EXIT, self.then_stmts) + self._cfg.add_successor(self.then_stmts, core.DEFAULT_EXIT, core.DEFAULT_EXIT) + if else_stmts is not None: + self._cfg.add_successor(self.cond_stmts, core.DEFAULT_EXIT, else_stmts) + self._cfg.add_successor(else_stmts, core.DEFAULT_EXIT, core.DEFAULT_EXIT) + else: + self._cfg.add_successor(self.cond_stmts, core.DEFAULT_EXIT, core.DEFAULT_EXIT) + + def children(self) -> Iterator[core.Node]: + """Yield the condition node, then node and (if present) else node.""" + yield self.cond_stmts + yield self.then_stmts + if self.else_stmts is not None: + yield self.else_stmts + + def get_entry(self) -> core.Node: + """Return the entry node (the condition node).""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + if_stmt: bashparser_model.IfClause, context: core.NonOwningContextRef[BashScriptContext] + ) -> BashIfClauseNode: + """Create a Bash if statement node from if statement AST. + + Parameters + ---------- + if_stmt: bashparser_model.IfClause + Parsed if statement AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. 
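+
+        Returns
+        -------
+        BashIfClauseNode
+            The constructed if statement node; an ``elif`` chain is represented as a
+            nested ``BashIfClauseNode`` in ``else_stmts`` (see the recursion below).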
+        """
+        cond_stmts = BashBlockNode.create(if_stmt["Cond"], context)
+        then_stmts = BashBlockNode.create(if_stmt["Then"], context)
+        else_clause = if_stmt.get("Else")
+        else_part: BashBlockNode | BashIfClauseNode | None = None
+        if else_clause is None:
+            else_part = None
+        elif bashparser_model.is_else_clause(else_clause):
+            else_part = BashBlockNode.create(else_clause["Then"], context)
+        else:
+            else_part = BashIfClauseNode.create(cast(bashparser_model.IfClause, else_clause), context)
+        return BashIfClauseNode(
+            definition=if_stmt, cond_stmts=cond_stmts, then_stmts=then_stmts, else_stmts=else_part, context=context
+        )
+
+
+class BashForClauseNode(core.ControlFlowGraphNode):
+    """Control-flow-graph node representing a Bash for statement.
+
+    Control flow structure consists of executing the statements of the condition,
+    followed by a branch to execute or skip the loop body node. The analysis is
+    not path sensitive, so both branches are always considered possible regardless
+    of the condition.
+
+    TODO: Currently doesn't actually model the loop back edge (need more testing to
+    be confident of analysis termination in the presence of loops).
+    """
+
+    #: Parsed for statement AST.
+    definition: bashparser_model.ForClause
+    #: Block node to execute the initializer.
+    init_stmts: BashBlockNode | None
+    #: Block node to execute the condition.
+    cond_stmts: BashBlockNode | None
+    #: Block node for the loop body.
+    body_stmts: BashBlockNode
+    #: Block node to execute the post.
+    post_stmts: BashBlockNode | None
+    #: Bash script context.
+    context: core.ContextRef[BashScriptContext]
+    #: Control flow graph.
+    _cfg: core.ControlFlowGraph
+
+    def __init__(
+        self,
+        definition: bashparser_model.ForClause,
+        init_stmts: BashBlockNode | None,
+        cond_stmts: BashBlockNode | None,
+        body_stmts: BashBlockNode,
+        post_stmts: BashBlockNode | None,
+        context: core.ContextRef[BashScriptContext],
+    ) -> None:
+        """Initialize Bash for statement node.
+
+        Typically, construction should be done via the create function rather than using this constructor directly.
+
+        Parameters
+        ----------
+        definition: bashparser_model.ForClause
+            Parsed for statement AST.
+        init_stmts: BashBlockNode | None
+            Block node to execute the initializer.
+        cond_stmts: BashBlockNode | None
+            Block node to execute the condition.
+        body_stmts: BashBlockNode
+            Block node for the body.
+        post_stmts: BashBlockNode | None
+            Block node to execute the post.
+        context: core.ContextRef[BashScriptContext]
+            Bash script context.
+        """
+        super().__init__()
+        self.definition = definition
+        self.init_stmts = init_stmts
+        self.cond_stmts = cond_stmts
+        self.body_stmts = body_stmts
+        self.post_stmts = post_stmts
+        self.context = context
+
+        self._cfg = core.ControlFlowGraph.create_from_sequence(
+            list(filter(core.node_is_not_none, [self.init_stmts, self.cond_stmts, self.body_stmts, self.post_stmts]))
+        )
+
+    def children(self) -> Iterator[core.Node]:
+        """Yield the initializer, condition, body and post nodes."""
+        if self.init_stmts is not None:
+            yield self.init_stmts
+        if self.cond_stmts is not None:
+            yield self.cond_stmts
+        yield self.body_stmts
+        if self.post_stmts is not None:
+            yield self.post_stmts
+
+    def get_entry(self) -> core.Node:
+        """Return the entry node."""
+        return self._cfg.get_entry()
+
+    def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]:
+        """Return the successor for a given node.
+
+        Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type.
+        """
+        if isinstance(exit_type, (BashExit, BashReturn)):
+            return {exit_type}
+        return self._cfg.get_successors(node, core.DEFAULT_EXIT)
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the line number and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))}
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+        return result
+
+    @staticmethod
+    def create(
+        for_stmt: bashparser_model.ForClause, context: core.NonOwningContextRef[BashScriptContext]
+    ) -> BashForClauseNode:
+        """Create a Bash for statement node from for statement AST.
+
+        Parameters
+        ----------
+        for_stmt: bashparser_model.ForClause
+            Parsed for statement AST.
+        context: core.NonOwningContextRef[BashScriptContext]
+            Bash script context.
+        """
+        body_stmts = BashBlockNode.create(for_stmt["Do"], context)
+
+        loop = for_stmt["Loop"]
+        if not bashparser_model.is_cstyle_loop(loop):
+            return BashForClauseNode(
+                definition=for_stmt,
+                init_stmts=None,
+                cond_stmts=None,
+                body_stmts=body_stmts,
+                post_stmts=None,
+                context=context,
+            )
+
+        init_stmts: BashBlockNode | None = None
+        if "Init" in loop:
+            init_arithm_cmd = bashparser_model.ArithmCmd(
+                Type="ArithmCmd",
+                Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                End=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Left=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Right=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                X=loop["Init"],
+            )
+            init_stmt = bashparser_model.Stmt(
+                Cmd=init_arithm_cmd,
+                Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                End=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Position=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+            )
+            init_stmts = BashBlockNode.create([init_stmt], context)
+
+        cond_stmts: BashBlockNode | None = None
+        if "Cond" in loop:
+            cond_arithm_cmd = bashparser_model.ArithmCmd(
+                Type="ArithmCmd",
+                Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                End=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Left=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Right=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                X=loop["Cond"],
+            )
+            cond_stmt = bashparser_model.Stmt(
+                Cmd=cond_arithm_cmd,
+                Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                End=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Position=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+            )
+            cond_stmts = BashBlockNode.create([cond_stmt], context)
+
+        post_stmts: BashBlockNode | None = None
+        if "Post" in loop:
+            post_arithm_cmd = bashparser_model.ArithmCmd(
+                Type="ArithmCmd",
+                Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                End=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Left=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Right=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                X=loop["Post"],
+            )
+            post_stmt = bashparser_model.Stmt(
+                Cmd=post_arithm_cmd,
+                Pos=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                End=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+                Position=bashparser_model.Pos(Offset=0, Line=0, Col=0),
+            )
+            post_stmts = BashBlockNode.create([post_stmt], context)
+
+        return BashForClauseNode(
+            definition=for_stmt,
+            init_stmts=init_stmts,
+            cond_stmts=cond_stmts,
+            body_stmts=body_stmts,
+            post_stmts=post_stmts,
+            context=context,
+        )
+
+
+@dataclass(frozen=True)
+class BashPipeContext(core.Context):
+    """Context for a Bash pipe operation.
+
+    Introduces a scope and location to represent the pipe itself connecting the piped commands,
+    where output from the piped-from command is written prior to being read as input by the piped-to
+    command.
+    """
+
+    #: Outer Bash script context.
+    bash_script_context: core.ContextRef[BashScriptContext]
+    #: Scope for pipe.
+    pipe_scope: core.ContextRef[facts.Scope]
+    #: Location for pipe.
+    pipe_loc: facts.LocationSpecifier
+
+    @staticmethod
+    def create(context: core.ContextRef[BashScriptContext]) -> BashPipeContext:
+        """Create a new pipe context and its associated scope."""
+        return BashPipeContext(context.get_non_owned(), core.OwningContextRef(facts.Scope("pipe")), facts.Console())
+
+    def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]:
+        """Yield the direct references of the context, either to scopes or to other contexts."""
+        yield self.bash_script_context
+        yield self.pipe_scope
+
+
+class BashPipeNode(core.ControlFlowGraphNode):
+    """Control flow node representing a Bash pipe ("|") binary command.
+
+    Control flow structure consists of executing the left-hand side,
+    followed by the right-hand side.
+    A pipe scope and location are introduced to model the piping of the
+    output from the first command to the input of the second command.
+    """
+
+    #: Parsed pipe binary command AST.
+    definition: bashparser_model.BinaryCmd
+    #: Left-hand side (first) command.
+    lhs: BashStatementNode
+    #: Right-hand side (second) command.
+    rhs: BashStatementNode
+    #: Pipe context.
+    context: core.ContextRef[BashPipeContext]
+    #: Control flow graph.
+    _cfg: core.ControlFlowGraph
+
+    def __init__(
+        self,
+        definition: bashparser_model.BinaryCmd,
+        lhs: BashStatementNode,
+        rhs: BashStatementNode,
+        context: core.ContextRef[BashPipeContext],
+    ) -> None:
+        """Initialize Bash pipe node.
+
+        Typically, construction should be done via the create function rather than using this constructor directly.
+
+        Parameters
+        ----------
+        definition: bashparser_model.BinaryCmd
+            Parsed pipe binary command AST.
+        lhs: BashStatementNode
+            Left-hand side (first) command.
+        rhs: BashStatementNode
+            Right-hand side (second) command.
+        context: core.ContextRef[BashPipeContext]
+            Pipe context.
+        """
+        super().__init__()
+        self.definition = definition
+        self.lhs = lhs
+        self.rhs = rhs
+        self.context = context
+
+        self._cfg = core.ControlFlowGraph(self.lhs)
+        self._cfg.add_successor(self.lhs, core.DEFAULT_EXIT, self.rhs)
+        self._cfg.add_successor(self.rhs, core.DEFAULT_EXIT, core.DEFAULT_EXIT)
+
+    def children(self) -> Iterator[core.Node]:
+        """Yield the subcommands."""
+        yield self.lhs
+        yield self.rhs
+
+    def get_entry(self) -> core.Node:
+        """Return the entry node (the lhs node)."""
+        return self._cfg.get_entry()
+
+    def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]:
+        """Return the successor for a given node.
+
+        Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type.
+ """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + pipe_cmd: bashparser_model.BinaryCmd, context: core.NonOwningContextRef[BashScriptContext] + ) -> BashPipeNode: + """Create Bash pipe node from pipe binary command AST. + + Parameters + ---------- + pipe_cmd: bashparser_model.BinaryCmd + Parsed pipe binary command AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + pipe_context = core.OwningContextRef(BashPipeContext.create(context)) + piped_from_context = core.NonOwningContextRef( + context.ref.with_stdout(pipe_context.ref.pipe_scope.get_non_owned(), pipe_context.ref.pipe_loc) + ) + piped_to_context = core.NonOwningContextRef( + context.ref.with_stdin(pipe_context.ref.pipe_scope.get_non_owned(), pipe_context.ref.pipe_loc) + ) + lhs = BashStatementNode(pipe_cmd["X"], piped_from_context) + rhs = BashStatementNode(pipe_cmd["Y"], piped_to_context) + return BashPipeNode(definition=pipe_cmd, lhs=lhs, rhs=rhs, context=pipe_context) + + +class BashAndNode(core.ControlFlowGraphNode): + """Control flow node representing a Bash AND ("&&") binary command. + + Control flow structure consists of executing the left-hand side, + followed by the right-hand side. + + (TODO model short circuit?) + """ + + #: Parsed AND binary command AST. + definition: bashparser_model.BinaryCmd + #: Left-hand side (first) command. + lhs: BashStatementNode + #: Right-hand side (second) command. + rhs: BashStatementNode + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.BinaryCmd, + lhs: BashStatementNode, + rhs: BashStatementNode, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash and node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.BinaryCmd + Parsed AND binary command AST. + lhs: BashStatementNode + Left-hand side (first) command. + rhs: BashStatementNode + Right-hand side (second) command. + context: core.ContextRef[BashScriptContext] + Bash script context. + """ + super().__init__() + self.definition = definition + self.lhs = lhs + self.rhs = rhs + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence([lhs, rhs]) + + def children(self) -> Iterator[core.Node]: + """Yield the subcommands.""" + yield self.lhs + yield self.rhs + + def get_entry(self) -> core.Node: + """Return the entry node (the lhs node).""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. 
+ + Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type. + """ + if isinstance(exit_type, (BashExit, BashReturn)): + return {exit_type} + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the line number and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))} + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + and_cmd: bashparser_model.BinaryCmd, context: core.NonOwningContextRef[BashScriptContext] + ) -> BashAndNode: + """Create Bash and node from AND binary command AST. + + Parameters + ---------- + and_cmd: bashparser_model.BinaryCmd + Parsed AND binary command AST. + context: core.NonOwningContextRef[BashScriptContext] + Bash script context. + """ + lhs = BashStatementNode(and_cmd["X"], context) + rhs = BashStatementNode(and_cmd["Y"], context) + return BashAndNode(definition=and_cmd, lhs=lhs, rhs=rhs, context=context) + + +class BashOrNode(core.ControlFlowGraphNode): + """Control flow node representing a Bash OR ("||") binary command. + + Control flow structure consists of executing the left-hand side, + followed by the right-hand side. + + (TODO model short circuit?) + """ + + #: Parsed OR binary command AST. + definition: bashparser_model.BinaryCmd + #: Left-hand side (first) command. + lhs: BashStatementNode + #: Right-hand side (second) command. + rhs: BashStatementNode + #: Bash script context. + context: core.ContextRef[BashScriptContext] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: bashparser_model.BinaryCmd, + lhs: BashStatementNode, + rhs: BashStatementNode, + context: core.ContextRef[BashScriptContext], + ) -> None: + """Initialize Bash OR node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: bashparser_model.BinaryCmd + Parsed OR binary command AST. + lhs: BashStatementNode + Left-hand side (first) command. + rhs: BashStatementNode + Right-hand side (second) command. + context: core.ContextRef[BashScriptContext] + Bash script context. + """ + super().__init__() + self.definition = definition + self.lhs = lhs + self.rhs = rhs + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence([lhs, rhs]) + + def children(self) -> Iterator[core.Node]: + """Yield the subcommands.""" + yield self.lhs + yield self.rhs + + def get_entry(self) -> core.Node: + """Return the entry node (the lhs node).""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successor for a given node. + + Returns a propagated early exit of the same type in the case of a BashExit or BashReturn exit type. 
+        """
+        if isinstance(exit_type, (BashExit, BashReturn)):
+            return {exit_type}
+        return self._cfg.get_successors(node, core.DEFAULT_EXIT)
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the line number and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        result["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))}
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+        return result
+
+    @staticmethod
+    def create(or_cmd: bashparser_model.BinaryCmd, context: core.NonOwningContextRef[BashScriptContext]) -> BashOrNode:
+        """Create Bash OR node from OR binary command AST.
+
+        Parameters
+        ----------
+        or_cmd: bashparser_model.BinaryCmd
+            Parsed OR binary command AST.
+        context: core.NonOwningContextRef[BashScriptContext]
+            Bash script context.
+        """
+        lhs = BashStatementNode(or_cmd["X"], context)
+        rhs = BashStatementNode(or_cmd["Y"], context)
+        return BashOrNode(definition=or_cmd, lhs=lhs, rhs=rhs, context=context)
+
+
+class BashSingleCommandNode(core.InterpretationNode):
+    """Interpretation node representing a single Bash command.
+
+    Defines how to interpret the semantics of the different supported commands that
+    may be invoked.
+    """
+
+    #: Parsed statement AST.
+    definition: bashparser_model.Stmt
+    #: Bash script context.
+    context: core.ContextRef[BashScriptContext]
+    #: Expression for command name.
+    cmd: facts.Value
+    #: Expressions for argument values (None if unrepresentable).
+    args: list[facts.Value | None]
+    #: Location expressions for where stdout is redirected to.
+    stdout_redirects: set[facts.Location]
+
+    def __init__(
+        self,
+        definition: bashparser_model.Stmt,
+        context: core.ContextRef[BashScriptContext],
+        cmd: facts.Value,
+        args: list[facts.Value | None],
+        stdout_redirects: set[facts.Location],
+    ) -> None:
+        """Initialize Bash single command node.
+
+        Parameters
+        ----------
+        definition: bashparser_model.Stmt
+            Parsed statement AST.
+        context: core.ContextRef[BashScriptContext]
+            Bash script context.
+        cmd: facts.Value
+            Expression for command name.
+        args: list[facts.Value | None]
+            Expressions for argument values (None if unrepresentable).
+        stdout_redirects: set[facts.Location]
+            Location expressions for where stdout is redirected to.
+ """ + super().__init__() + self.definition = definition + self.context = context + self.cmd = cmd + self.args = args + self.stdout_redirects = stdout_redirects + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the semantics of the different supported commands that may be invoked.""" + eval_transformer = evaluation.EvaluationTransformer(state) + evaluated_writes = eval_transformer.transform_value(self.cmd) + result: dict[core.InterpretationKey, Callable[[], core.Node]] = {} + + for resolved_cmd, bindings in evaluated_writes: + match resolved_cmd: + case facts.StringLiteral("echo"): + # Echo command, may have two different interpretations: + # - The concrete semantics of writing to the location its stdout is directed to + # - If writing to the special GitHub output var file, the higher-level semantics + # of writing to the variable as specified in the echoed value. + if len(self.stdout_redirects) in {0, 1} and len(self.args) == 1: + first_arg = self.args[0] + stdout_redir = ( + next(iter(self.stdout_redirects)) + if len(self.stdout_redirects) == 1 + else facts.Location(self.context.ref.stdout_scope.ref, self.context.ref.stdout_loc) + ) + if first_arg is not None: + first_arg_val = first_arg + + def build_echo( + stdout_redir: facts.Location = stdout_redir, first_arg_val: facts.Value = first_arg_val + ) -> core.Node: + return models.BashEchoNode(stdout_redir, first_arg_val) + + github_context = self.context.ref.get_containing_github_context() + + if ( + self._is_github_output_loc(stdout_redir) + and github_context is not None + and github_context.output_var_prefix is not None + ): + output_var_prefix = github_context.output_var_prefix + job_variables_scope = github_context.job_context.ref.job_variables.ref + split = evaluation.parse_str_expr_split(first_arg, "=", maxsplit=1) + if len(split) == 2: + + def build_github_var_write( + job_variables_scope: facts.Scope = job_variables_scope, + output_var_prefix: str = output_var_prefix, + split: list[facts.Value] = split, + ) -> core.Node: + return models.VarAssignNode( + kind=models.VarAssignKind.GITHUB_JOB_VAR, + var_scope=job_variables_scope, + var_name=facts.BinaryStringOp.get_string_concat( + facts.StringLiteral(output_var_prefix), split[0] + ), + value=split[1], + ) + + result[("echo_github_var", bindings)] = build_github_var_write + + result[("echo", bindings)] = build_echo + case facts.StringLiteral("mvn"): + # Maven build command. + for arg in self.args: + match arg: + case facts.StringLiteral(arg_lit): + if arg_lit in {"package", "install", "deploy", "verify"}: + + def build_mvn_build() -> core.Node: + return models.MavenBuildModelNode( + filesystem_scope=self.context.ref.filesystem.ref + ) + + result[("mvn", bindings)] = build_mvn_build + case facts.StringLiteral("exit"): + # Exit command exits the script. + def build_exit_stmt() -> core.Node: + return BashExitNode() + + result[("exit", bindings)] = build_exit_stmt + case facts.StringLiteral("base64"): + # base64 command may encode or decode Base64 strings. 
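+                    # Modelled cases (per the branches below): bare `base64` encodes stdin to its
+                    # stdout location; `base64 -d`/`base64 --decode` decodes stdin to stdout.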
+ + # TODO model other possibilities + if len(self.stdout_redirects) in {0, 1}: + stdout_redir = ( + next(iter(self.stdout_redirects)) + if len(self.stdout_redirects) == 1 + else facts.Location(self.context.ref.stdout_scope.ref, self.context.ref.stdout_loc) + ) + if len(self.args) == 0: + + def build_base64_encode(stdout_redir: facts.Location = stdout_redir) -> core.Node: + return models.Base64EncodeNode( + facts.Location(self.context.ref.stdin_scope.ref, self.context.ref.stdin_loc), + stdout_redir, + ) + + result[("base64_encode", bindings)] = build_base64_encode + elif len(self.args) == 1 and ( + self.args[0] == facts.StringLiteral("-d") or self.args[0] == facts.StringLiteral("--decode") + ): + + def build_base64_decode(stdout_redir: facts.Location = stdout_redir) -> core.Node: + return models.Base64DecodeNode( + facts.Location(self.context.ref.stdin_scope.ref, self.context.ref.stdin_loc), + stdout_redir, + ) + + result[("base64_decode", bindings)] = build_base64_decode + case facts.StringLiteral(cmd_name) if cmd_name.endswith(".sh"): + # Invoking another shell script. + + # TODO pass arguments + + repo_path = self.context.ref.get_containing_analysis_context().repo_path + if repo_path is not None: + # Check for path traversal patterns before analyzing a bash file. + # TODO working dir + bash_file_path = os.path.realpath(os.path.join(repo_path, "", cmd_name)) + if os.path.exists(bash_file_path) and bash_file_path.startswith(repo_path): + + def build_run_bash_script_file(bash_file_path: str = bash_file_path) -> core.Node: + bash_text = "" + with open(bash_file_path, encoding="utf-8") as bash_file: + bash_text = bash_file.read() + return RawBashScriptNode( + facts.StringLiteral(bash_text), + core.OwningContextRef( + BashScriptContext.create_from_bash_script(self.context, bash_file_path) + ), + ) + + result[("run_file_bash_script", bindings)] = build_run_bash_script_file + case facts.StringLiteral(cmd_name): + # If the command name is a defined shell function (as resolved from a read of the variable of that + # name in the function decl scope), then create a function call to the function definition stored + # in that variable. 
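+                    # (The serialized definition is the JSON stored when the enclosing statement
+                    # node analysed the FuncDecl; see the function decl handling earlier in this module.)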
+
+                        evaluated_func_decls = evaluation.evaluate(
+                            self,
+                            facts.Read(
+                                facts.Location(
+                                    scope=self.context.ref.func_decls.ref, loc=facts.Variable(facts.StringLiteral(cmd_name))
+                                )
+                            ),
+                        )
+                        for resolved_func, resolved_func_bindings in evaluated_func_decls:
+                            if isinstance(resolved_func, facts.StringLiteral):
+                                combined_func_bindings = evaluation.ReadBindings.combine_bindings(
+                                    [bindings, resolved_func_bindings]
+                                )
+                                if combined_func_bindings is not None:
+                                    resolved_func_json = resolved_func.literal
+
+                                    def build_func_call(func_json: str = resolved_func_json) -> core.Node:
+                                        func_decl = cast(bashparser_model.FuncDecl, json.loads(func_json))
+                                        return BashFuncCallNode(
+                                            self.definition,
+                                            func_decl,
+                                            BashBlockNode.create([func_decl["Body"]], self.context.get_non_owned()),
+                                            self.context,
+                                        )
+
+                                    result[("function_call", combined_func_bindings)] = build_func_call
+
+        def build_noop() -> core.Node:
+            return core.NoOpStatementNode()
+
+        if not isinstance(self.cmd, facts.StringLiteral) or len(result) == 0:
+            result["default"] = build_noop
+
+        return result
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table.
+
+        Contains the line number, command expression, argument expressions, stdout redirect location expressions, and scopes.
+        """
+        properties: dict[str, set[tuple[str | None, str]]] = {}
+        properties["line num (in script)"] = {(None, str(self.definition["Pos"]["Line"]))}
+        properties["cmd"] = {(None, self.cmd.to_datalog_fact_string())}
+        for index, arg in enumerate(self.args):
+            properties["arg" + str(index)] = {
+                (None, arg.to_datalog_fact_string()) if arg is not None else (None, "UNKNOWN")
+            }
+        properties["stdout_redirects"] = {(None, x.to_datalog_fact_string()) for x in self.stdout_redirects}
+        printing.add_context_owned_scopes_to_properties_table(properties, self.context)
+        return properties
+
+    @staticmethod
+    def _is_github_output_loc(loc: facts.Location) -> bool:
+        """Return whether the location is the special GitHub output variable file."""
+        match loc:
+            case facts.Location(
+                _, facts.Filesystem(facts.Read(facts.Location(_, facts.Variable(facts.StringLiteral("GITHUB_OUTPUT")))))
+            ):
+                return True
+        return False
+
+
+class BashExitNode(core.StatementNode):
+    """Statement node representing a Bash exit command.
+
+    Always exits with the BashExit exit type (which causes the whole script to exit).
+    """
+
+    def apply_effects(self, before_state: core.State) -> dict[core.ExitType, core.State]:
+        """Apply the effects of the Bash exit.
+
+        Returns a BashExit exit state that is otherwise the same as the before state.
+        """
+        state = core.State()
+        core.transfer_state(before_state, state)
+        return {BASH_EXIT: state}
+
+
+@dataclass(frozen=True)
+class LiteralOrEnvVar:
+    """Represents either a literal or a read of an environment variable."""
+
+    #: Whether this represents an environment variable (or else a string literal).
+    is_env_var: bool
+    #: The environment variable name or string literal value.
+    literal: str
+
+
+def is_simple_var_read(param_exp: bashparser_model.ParamExp) -> bool:
+    """Return whether expression is a simple env var read e.g. $ENV_VAR."""
+    if param_exp.get("Excl", False) or param_exp.get("Length", False) or param_exp.get("Width", False):
+        return False
+    if (
+        "Index" in param_exp
+        or "Slice" in param_exp
+        or "Repl" in param_exp
+        or "Names" in param_exp
+        or "Exp" in param_exp
+    ):
+        return False
+    return True
+
+
+def parse_env_var_read_word_part(part: bashparser_model.WordPart, allow_dbl_quoted: bool) -> str | None:
+    """Parse word part as a read of an environment variable.
+
+    If the given word part is a read of an env var (possibly enclosed in double quotes, if allowed),
+    return the name of the variable, otherwise None.
+    """
+    if bashparser_model.is_dbl_quoted(part):
+        if not allow_dbl_quoted:
+            return None
+        if "Parts" not in part or len(part["Parts"]) == 0:
+            return ""
+        if len(part["Parts"]) == 1:
+            part = part["Parts"][0]
+        else:
+            return None
+
+    if bashparser_model.is_param_exp(part):
+        if not is_simple_var_read(part):
+            return None
+        return part["Param"]["Value"]
+
+    return None
+
+
+def parse_env_var_read_word(word: bashparser_model.Word, allow_dbl_quoted: bool) -> str | None:
+    """Parse word as a read of an environment variable.
+
+    If the given word is a read of an env var (possibly enclosed in double quotes, if allowed),
+    return the name of the variable, otherwise None.
+    """
+    if len(word["Parts"]) == 1:
+        part = word["Parts"][0]
+        return parse_env_var_read_word_part(part, allow_dbl_quoted)
+    return None
+
+
+def parse_content(parts: list[bashparser_model.WordPart], allow_dbl_quoted: bool) -> list[LiteralOrEnvVar] | None:
+    """Parse the given sequence of word parts.
+
+    Return a representation as a sequence of string literal and env var reads, or else return None if not representable in this way.
+
+    If allow_dbl_quoted is True, permit word parts to be double quoted expressions, the content of which will
+    be included in the sequence (if False, return None if the sequence contains double quoted expressions).
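+
+    For example (illustrative), the parts of the word ``abc${FOO}`` parse to
+    ``[LiteralOrEnvVar(is_env_var=False, literal="abc"), LiteralOrEnvVar(is_env_var=True, literal="FOO")]``.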
+ """ + content: list[LiteralOrEnvVar] = [] + for part in parts: + env_var = parse_env_var_read_word_part(part, allow_dbl_quoted) + if env_var is not None: + content.append(LiteralOrEnvVar(is_env_var=True, literal=env_var)) + elif bashparser_model.is_lit(part): + content.append(LiteralOrEnvVar(is_env_var=False, literal=part["Value"])) + elif bashparser_model.is_dbl_quoted(part) and "Parts" in part: + subcontent = parse_content(part["Parts"], False) + if subcontent is None: + return None + content.extend(subcontent) + else: + return None + return content + + +def convert_shell_value_sequence_to_fact_value( + content: list[LiteralOrEnvVar], context: BashScriptContext +) -> facts.Value: + """Convert sequence of Bash values into a single concatenated expression.""" + if len(content) == 0: + raise CallGraphError("sequence cannot be empty") + + first_val = convert_shell_value_to_fact_value(content[0], context) + if len(content) == 1: + return first_val + + rest_val = convert_shell_value_sequence_to_fact_value(content[1:], context) + + return facts.BinaryStringOp(op=facts.BinaryStringOperator.STRING_CONCAT, operand1=first_val, operand2=rest_val) + + +def convert_shell_value_to_fact_value(val: LiteralOrEnvVar, context: BashScriptContext) -> facts.Value: + """Convert a Bash literal or env var read into a value expression.""" + if val.is_env_var: + return facts.Read( + loc=facts.Location(scope=context.env.ref, loc=facts.Variable(name=facts.StringLiteral(literal=val.literal))) + ) + return facts.StringLiteral(literal=val.literal) + + +def convert_shell_word_to_value( + word: bashparser_model.Word, context: BashScriptContext +) -> tuple[facts.Value, bool] | None: + """Convert a Bash word into a value expression. + + Return value expression alongside a bool indicating whether the value is + "quoted" (or else may require further expansion post-resolution if "unquoted"). + """ + dbl_quoted_parts = parse_dbl_quoted_string(word) + if dbl_quoted_parts is not None: + return convert_shell_value_sequence_to_fact_value(dbl_quoted_parts, context), True + + sgl_quoted_str = parse_sql_quoted_string(word) + if sgl_quoted_str is not None: + return facts.StringLiteral(sgl_quoted_str), True + + singular_literal = parse_singular_literal(word) + if singular_literal is not None: + return facts.StringLiteral(literal=singular_literal), True + + single_var = parse_env_var_read_word(word, False) + if single_var is not None: + return convert_shell_value_to_fact_value(LiteralOrEnvVar(True, single_var), context), False + + return None + + +def parse_dbl_quoted_string(word: bashparser_model.Word) -> list[LiteralOrEnvVar] | None: + """Parse double quoted string. + + If the given word is a double quoted expression, return + a representation as a sequence of string literal and env var reads, or + else return None if it is not a double quoted expression or if it is + not representable in this way. + """ + if len(word["Parts"]) == 1: + part = word["Parts"][0] + if bashparser_model.is_dbl_quoted(part) and "Parts" in part: + return parse_content(part["Parts"], False) + + return None + + +def parse_sql_quoted_string(word: bashparser_model.Word) -> str | None: + """Parse single quoted string. + + If the given word is a single quoted string, return the string + literal content, otherwise return None. 
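+
+    For example, the word ``'hello world'`` yields the string ``hello world``.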
+    """
+    if len(word["Parts"]) == 1:
+        part = word["Parts"][0]
+        if bashparser_model.is_sgl_quoted(part):
+            return part["Value"]
+
+    return None
+
+
+def parse_singular_literal(word: bashparser_model.Word) -> str | None:
+    """Parse singular literal word.
+
+    If the given word is a single literal, return the string
+    literal content, otherwise return None.
+    """
+    if len(word["Parts"]) == 1:
+        part = word["Parts"][0]
+        if bashparser_model.is_lit(part):
+            return part["Value"]
+
+    return None
+
+
+# Cache for Bash expression parsing.
+# note: not thread safe
+_bashparser_cache: dict[str, list[bashparser_model.Word] | None] = {}
+
+
+def parse_bash_expr(expr: str) -> list[bashparser_model.Word] | None:
+    """Parse bash expression.
+
+    Results are cached to avoid unnecessary invocations of the Bash parser
+    (since it requires spawning a separate process).
+    """
+    if expr in _bashparser_cache:
+        return _bashparser_cache[expr]
+    try:
+        parse_result = bashparser.parse_expr(expr, MACARON_PATH)
+        _bashparser_cache[expr] = parse_result
+        return parse_result
+    except ParseError:
+        # Cache parse failures as None too, so repeated bad expressions
+        # do not respawn the parser process.
+        _bashparser_cache[expr] = None
+        return None
diff --git a/src/macaron/code_analyzer/dataflow_analysis/cmd_parser.py b/src/macaron/code_analyzer/dataflow_analysis/cmd_parser.py
new file mode 100644
index 000000000..f6a074a90
--- /dev/null
+++ b/src/macaron/code_analyzer/dataflow_analysis/cmd_parser.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module contains parsers for command line interfaces for commands relevant to analysis."""
+
+from __future__ import annotations
+
+import argparse
+
+
+def parse_python_command_line(args: list[str]) -> argparse.Namespace:
+    """Parse python command line.
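+
+    For example (mirroring the test cases in ``main`` below), parsing
+    ``["-B", "-m", "pip", "install", "-U", "cibuildwheel"]`` yields a namespace with
+    ``m == "pip"``, ``subprocess_args == ["install", "-U", "cibuildwheel"]`` and ``file is None``.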
+ + Parameters + ---------- + args: list[str] + Argument list to python command + + Returns + ------- + argparse.Namespace + Parsed python command args + """ + parser = argparse.ArgumentParser(add_help=False) + parser.add_argument("-B", action="store_true") + parser.add_argument("-b", action="count") + parser.add_argument("--check-hash-based-pycs") + parser.add_argument("-d", action="store_true") + parser.add_argument("-E", action="store_true") + parser.add_argument("-h", action="store_true") + parser.add_argument("-?", action="store_true", dest="h") + parser.add_argument("--help", action="store_true", dest="h") + parser.add_argument("--help-env", action="store_true") + parser.add_argument("--help-xoptions", action="store_true") + parser.add_argument("--help-all", action="store_true") + parser.add_argument("-i", action="store_true") + parser.add_argument("-I", action="store_true") + parser.add_argument("-o", action="count") + parser.add_argument("-P", action="store_true") + parser.add_argument("-q", action="store_true") + parser.add_argument("-s", action="store_true") + parser.add_argument("-S", action="store_true") + parser.add_argument("-u", action="store_true") + parser.add_argument("-v", action="count") + parser.add_argument("-V", action="count") + parser.add_argument("--version", action="count", dest="V") + parser.add_argument("-w", action="store") + parser.add_argument("-x", action="store") + parser.add_argument("-m", nargs=argparse.REMAINDER) + parser.add_argument("-c", nargs=argparse.REMAINDER) + parser.add_argument("file", nargs=argparse.REMAINDER) + + parsed_args = parser.parse_args(args) + + if parsed_args.m is not None: + parsed_args.subprocess_args = parsed_args.m[1:] + parsed_args.m = parsed_args.m[0] + parsed_args.file = None + elif parsed_args.c is not None: + parsed_args.subprocess_args = parsed_args.c[1:] + parsed_args.c = parsed_args.c[0] + parsed_args.file = None + else: + if len(parsed_args.file) > 0 and parsed_args.file[0] == "--": + parsed_args.file = parsed_args.file[1:] + if len(parsed_args.file) == 0: + parsed_args.subprocess_args = [] + parsed_args.file = None + else: + parsed_args.subprocess_args = parsed_args.file[1:] + parsed_args.file = parsed_args.file[0] + + return parsed_args + + +def main() -> None: + """Test python command line parser.""" + print(str(parse_python_command_line(["-B", "-m", "pip", "install", "-U", "cibuildwheel"]))) # noqa: T201 + print(str(parse_python_command_line(["-B", "pip.py", "install", "-U", "cibuildwheel"]))) # noqa: T201 + print(str(parse_python_command_line(["-B", "--", "--pip.py", "install", "-U", "cibuildwheel"]))) # noqa: T201 + print( # noqa: T201 + str(parse_python_command_line(["-B", "-c", "import sys; print(sys.argv[1:])", "install", "-U", "cibuildwheel"])) + ) + print(str(parse_python_command_line(["-B"]))) # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/src/macaron/code_analyzer/dataflow_analysis/core.py b/src/macaron/code_analyzer/dataflow_analysis/core.py new file mode 100644 index 000000000..5a33ef56a --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/core.py @@ -0,0 +1,695 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+
+"""Core dataflow analysis framework definitions and algorithm."""
+
+from __future__ import annotations
+
+import functools
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from collections.abc import Callable, Iterator, Sequence
+from dataclasses import dataclass
+from queue import Queue
+from typing import Generic, Protocol, TypeGuard, TypeVar
+
+from macaron.code_analyzer.dataflow_analysis import facts
+from macaron.errors import CallGraphError
+
+# Debug sequence number used to provide ordering information in debug graph.
+# note: not thread safe
+DEBUG_SEQUENCE_NUMBER = 0
+
+
+def reset_debug_sequence_number() -> None:
+    """Reset debug sequence number."""
+    global DEBUG_SEQUENCE_NUMBER  # pylint: disable=global-statement
+    DEBUG_SEQUENCE_NUMBER = 0
+
+
+def get_debug_sequence_number() -> int:
+    """Get current debug sequence number value."""
+    return DEBUG_SEQUENCE_NUMBER
+
+
+def increment_debug_sequence_number() -> None:
+    """Increment debug sequence number."""
+    global DEBUG_SEQUENCE_NUMBER  # pylint: disable=global-statement
+    DEBUG_SEQUENCE_NUMBER = DEBUG_SEQUENCE_NUMBER + 1
+
+
+@dataclass(frozen=True)
+class StateDebugLabel:
+    """Label for state fact providing information useful for debugging.
+
+    Provides a record of analysis ordering and whether the fact was just copied
+    from another state rather than newly produced.
+    """
+
+    #: Sequence number at time when state fact was created.
+    sequence_number: int
+    #: Whether the state fact is just copied from another state rather than newly produced.
+    copied: bool
+
+
+class StateTransferFilter(ABC):
+    """Interface for state transfer filters, which filter out state facts by location."""
+
+    @abstractmethod
+    def should_transfer(self, loc: facts.Location) -> bool:
+        """Return whether facts with the given locations should be transferred or else filtered out."""
+
+
+class State:
+    """Representation of the abstract storage state at some program point.
+
+    Consists of a set of abstract locations, each associated with a set of possible values.
+    """
+
+    #: Mapping of locations to a set of possible values.
+    #: Values are annotated with a label containing info relevant for debugging.
+    state: dict[facts.Location, dict[facts.Value, StateDebugLabel]]
+
+    def __init__(self) -> None:
+        """Construct an empty state."""
+        self.state = defaultdict(dict)
+
+
+class DefaultStateTransferFilter(StateTransferFilter):
+    """Default state transfer filter that includes all locations."""
+
+    def should_transfer(self, loc: facts.Location) -> bool:
+        """Transfer all locations."""
+        return True
+
+
+# Convenience instance of DefaultStateTransferFilter.
+DEFAULT_STATE_TRANSFER_FILTER = DefaultStateTransferFilter()
+
+
+class ExcludedLocsStateTransferFilter(StateTransferFilter):
+    """State transfer filter that excludes any locations in the given set."""
+
+    #: Locations to exclude.
+    excluded_locs: set[facts.Location]
+
+    def __init__(self, excluded_locs: set[facts.Location]) -> None:
+        """Construct filter that excludes the given locations."""
+        self.excluded_locs = excluded_locs
+
+    def should_transfer(self, loc: facts.Location) -> bool:
+        """Return whether facts with the given locations should be transferred or else filtered out."""
+        return loc not in self.excluded_locs
+
+
+class ExcludedScopesStateTransferFilter(StateTransferFilter):
+    """State transfer filter that excludes any locations that are within the scopes in the given set."""
+
+    #: Scopes to exclude.
+ excluded_scopes: set[facts.Scope] + + def __init__(self, excluded_scopes: set[facts.Scope]) -> None: + """Construct filter that excludes the given scopes.""" + self.excluded_scopes = excluded_scopes + + def should_transfer(self, loc: facts.Location) -> bool: + """Return whether facts with the given locations should be transferred or else filtered out.""" + return loc.scope not in self.excluded_scopes + + +def transfer_state( + src_state: State, + dest_state: State, + transfer_filter: StateTransferFilter = DEFAULT_STATE_TRANSFER_FILTER, + debug_is_copy: bool = True, +) -> bool: + """Transfer/copy all facts in the src state to the dest state, except those excluded by the given filter. + + Parameters + ---------- + src_state: State + The state to transfer facts from. + dest_state: State + The state to modify by transferring facts to. + transfer_filter: StateTransferFilter + The filter to apply to the transferred facts (by default, transfer all). + debug_is_copy: bool + Whether the facts newly added to the dest state should be recorded as being copied or not (for debugging purposes). + + Returns + ------- + bool + Whether the dest state was modified. + """ + changed = False + for loc, vals in src_state.state.items(): + if not transfer_filter.should_transfer(loc): + continue + exit_vals = dest_state.state[loc] + for val, label in vals.items(): + if val not in exit_vals: + exit_vals[val] = StateDebugLabel(get_debug_sequence_number(), True if debug_is_copy else label.copied) + changed = True + return changed + + +class ExitType(ABC): + """Representation of an exit type, describing the manner in which the execution of a node may terminate.""" + + @abstractmethod + def __hash__(self) -> int: + pass + + @abstractmethod + def __eq__(self, other: object) -> bool: + pass + + +class DefaultExit(ExitType): + """Default, normal exit.""" + + def __hash__(self) -> int: + return 19391 + + def __eq__(self, other: object) -> bool: + return isinstance(other, DefaultExit) + + +# Convenience instance of DefaultExit. +DEFAULT_EXIT = DefaultExit() + + +class Node(ABC): + """Base class of all node types in dataflow analysis. + + Subclasses will represent the various program/semantic constructs, + and define how to analyse them. + """ + + #: Abstract state at the point before the execution of this node. + before_state: State + + #: Abstract state at the point after the execution of this node, for each possible distinct exit type. + exit_states: dict[ExitType, State] + + #: Sequence number at the point the node was created, recorded for debugging purposes. + created_debug_sequence_num: int + #: Log of begin/end sequence numbers each time this node was processed, recorded for debugging purposes. + processed_log: list[tuple[int, int]] + + def __init__(self) -> None: + """Initialize with empty states.""" + self.before_state = State() + self.exit_states = defaultdict(State) + self.created_debug_sequence_num = get_debug_sequence_number() + self.processed_log = [] + + @abstractmethod + def children(self) -> Iterator[Node]: + """Yield the child nodes of this node.""" + + @abstractmethod + def analyse(self) -> bool: + """Perform analysis of this node (and potentially any child nodes). + + Update the exit states with the analysis result. + Returns whether anything was modified. 
+        """
+        raise NotImplementedError
+
+    def is_processed(self) -> bool:
+        """Return whether this node has been processed."""
+        return len(self.processed_log) > 0
+
+    def notify_processed(self, begin_seq_num: int, end_seq_num: int) -> None:
+        """Record that this node has been processed."""
+        self.processed_log.append((begin_seq_num, end_seq_num))
+
+    def get_exit_state_transfer_filter(self) -> StateTransferFilter:
+        """Return the state transfer filter applicable to the exit state of this node.
+
+        By default, nothing is excluded. Subclasses should override to provide appropriate filters
+        to avoid transferring state that will be irrelevant after the node exits.
+        """
+        return DEFAULT_STATE_TRANSFER_FILTER
+
+    def __hash__(self) -> int:
+        return id(self)
+
+    def __eq__(self, other: object) -> bool:
+        return self is other
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a table of stringified properties, describing the details of this node, for debugging purposes.
+
+        The returned properties table is a mapping of name to value-set, which can be rendered via the functions
+        in the printing module.
+        """
+        return {}
+
+
+def node_is_not_none(node: Node | None) -> TypeGuard[Node]:
+    """Return whether the given node is not None."""
+    return node is not None
+
+
+def traverse_bfs(node: Node) -> Iterator[Node]:
+    """Traverse the node tree in a breadth-first manner, yielding the nodes (including the given node) in traversal order."""
+    queue: Queue[Node] = Queue()
+    queue.put(node)
+    while not queue.empty():
+        next_node = queue.get()
+        yield next_node
+        for child in next_node.children():
+            queue.put(child)
+
+
+def build_parent_mapping(node: Node) -> dict[Node, Node]:
+    """Construct a mapping of nodes to their parent nodes."""
+    parents: dict[Node, Node] = {}
+
+    queue: Queue[Node] = Queue()
+    queue.put(node)
+    while not queue.empty():
+        next_node = queue.get()
+        for child in next_node.children():
+            parents[child] = next_node
+            queue.put(child)
+
+    return parents
+
+
+class NodeForest:
+    """A collection of independent root nodes (with no control-flow or relation between them)."""
+
+    #: Collection of root nodes.
+    root_nodes: list[Node]
+    #: Mapping of nodes to their parent nodes.
+    parents: dict[Node, Node]
+
+    def __init__(self, root_nodes: list[Node]) -> None:
+        """Construct a NodeForest for the given nodes, and build the parent mapping."""
+        self.root_nodes = root_nodes
+        self.parents = {}
+        for root_node in root_nodes:
+            root_node_parents = build_parent_mapping(root_node)
+            self.parents.update(root_node_parents)
+
+
+class ControlFlowGraph:
+    """Graph structure to represent control flow graphs."""
+
+    #: Entry node.
+    entry: Node
+    #: Graph of successor edges.
+    #: Each edge is from a particular exit of a particular node, either to a node or to an exit of the control flow itself.
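+    #: For example, an if construct's graph maps its condition node's default exit to both the
+    #: then node and the else node, since the analysis is not path sensitive.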
+ successors: dict[Node, dict[ExitType, set[Node | ExitType]]] + + def __init__(self, entry: Node) -> None: + """Construct an initially-empty control flow graph.""" + self.entry = entry + self.successors = defaultdict(lambda: defaultdict(set)) + + def get_entry(self) -> Node: + """Return the entry node.""" + return self.entry + + def add_successor(self, src: Node, exit_type: ExitType, dest: Node | ExitType) -> None: + """Add a successor edge to the control flow graph.""" + self.successors[src][exit_type].add(dest) + + def get_successors(self, node: Node, exit_type: ExitType) -> set[Node | ExitType]: + """Return the successors for a particular exit of a particular node.""" + return self.successors[node][exit_type] + + @staticmethod + def create_from_sequence(seq: Sequence[Node]) -> ControlFlowGraph: + """Construct a linear sequence of nodes.""" + if len(seq) == 0: + raise CallGraphError("cannot create control flow graph from empty sequence") + cfg = ControlFlowGraph(seq[0]) + prev_node = seq[0] + for node in seq[1:]: + cfg.add_successor(prev_node, DEFAULT_EXIT, node) + prev_node = node + + cfg.add_successor(prev_node, DEFAULT_EXIT, DEFAULT_EXIT) + + return cfg + + +class ControlFlowGraphNode(Node): + """Base class for nodes representing control-flow constructs. + + Defines the generic algorithm for analysing control flow graphs. + Subclasses will define the child nodes and concrete graph structure. + """ + + def _propagate_edges( + self, + worklist: set[Node], + src_state: State, + state_transfer_filter: StateTransferFilter, + successors: set[Node | ExitType], + ) -> bool: + changed = False + for successor in successors: + if isinstance(successor, Node): + transfer_changed = transfer_state(src_state, successor.before_state, state_transfer_filter) + changed = changed or transfer_changed + if transfer_changed or not successor.is_processed(): + worklist.add(successor) + elif isinstance(successor, ExitType): + changed = transfer_state(src_state, self.exit_states[successor], state_transfer_filter) or changed + return changed + + def analyse(self) -> bool: + """Perform analysis of this node. + + Performs analysis of the child nodes and propagates state from the exit state of an updated node to the before + state of its successor nodes, according to the control-flow-graph structure, then analyses the successor nodes, + and so on until a fixpoint is reached and no further updates may be made to any node states. + + Returns whether anything was modified. 
+ """ + begin_seq_num = get_debug_sequence_number() + entry_node = self.get_entry() + if entry_node is None: + changed = transfer_state(self.before_state, self.exit_states[DEFAULT_EXIT]) + increment_debug_sequence_number() + return changed + + changed = transfer_state(self.before_state, entry_node.before_state) + increment_debug_sequence_number() + + worklist = {entry_node} + + while len(worklist) > 0: + next_node = worklist.pop() + next_changed = next_node.analyse() + changed = changed or next_changed + + next_state_transfer_filter = next_node.get_exit_state_transfer_filter() + + for exit_type, exit_state in next_node.exit_states.items(): + successors = self.get_successors(next_node, exit_type) + changed = self._propagate_edges(worklist, exit_state, next_state_transfer_filter, successors) or changed + + increment_debug_sequence_number() + + self.notify_processed(begin_seq_num, get_debug_sequence_number() - 1) + return changed + + @abstractmethod + def get_entry(self) -> Node | None: + """Return the entry node.""" + + @abstractmethod + def get_successors(self, node: Node, exit_type: ExitType) -> set[Node | ExitType]: + """Return the successors for a particular exit of a particular node.""" + + +class StatementNode(Node): + """Base class for nodes representing constructs with direct effects (and no child nodes). + + Subclasses will define the effects that apply when the node is executed. + """ + + def analyse(self) -> bool: + """Perform analysis of this node, by applying the effects to update the after state. + + Returns whether anything was modified. + """ + begin_seq_num = get_debug_sequence_number() + new_exit_states = self.apply_effects(self.before_state) + changed = False + for new_exit_type, new_exit_state in new_exit_states.items(): + changed = transfer_state(new_exit_state, self.exit_states[new_exit_type], debug_is_copy=False) or changed + + self.notify_processed(begin_seq_num, get_debug_sequence_number()) + increment_debug_sequence_number() + return changed + + def children(self) -> Iterator[Node]: + """Yield nothing, as statements have no child nodes.""" + yield from () + + @abstractmethod + def apply_effects(self, before_state: State) -> dict[ExitType, State]: + """Apply the effects of the statement, given the before state, returning the resulting exit state.""" + + +class NoOpStatementNode(StatementNode): + """Statement that has no effect.""" + + def apply_effects(self, before_state: State) -> dict[ExitType, State]: + """Apply the effects of the no-op, returning an exit state that is the same as the before state.""" + state = State() + transfer_state(before_state, state) + return {DEFAULT_EXIT: state} + + +class InterpretationKey(Protocol): + """Interpretation key used to identify interpretations that have been produced before. + + Must support hashing and equality comparison to allow use as a dict key. + """ + + @abstractmethod + def __hash__(self) -> int: + pass + + @abstractmethod + def __eq__(self, other: object, /) -> bool: + pass + + +class InterpretationNode(Node): + """Base class for nodes representing constructs requiring interpretation. + + Such constructs must be interpreted to produce possibly-multiple child nodes representing possible + interpretations of the semantics of the node. + + Analysing the interpretation node will apply the combined effects of all of the possible interpretations. + Subclasses will define how to identify the possible interpretations and generate the corresponding nodes. 
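+
+    For example, BashSingleCommandNode in the bash module is an interpretation node: each command
+    name resolved from the current state (``echo``, ``mvn``, a shell function, ...) yields a child
+    node modelling that command's semantics.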
+ """ + + #: The generated interpretations of this node, identified/deduplicated by some interpretation key. + interpretations: dict[InterpretationKey, Node] + + def __init__(self) -> None: + """Initialize node with no interpretations.""" + super().__init__() + self.interpretations = {} + + def children(self) -> Iterator[Node]: + """Yield each of the possible interpretations.""" + yield from self.interpretations.values() + + def update_interpretations(self) -> bool: + """Analyse the node to identify interpretations. + + Analysis is done in the context of the current before state, adding any + new interpretations generated to the interpretations dict. + """ + latest_interpretations = self.identify_interpretations(self.before_state) + new_interpretations = {x: y for (x, y) in latest_interpretations.items() if x not in self.interpretations} + for new_interpretation, build_node in new_interpretations.items(): + self.interpretations[new_interpretation] = build_node() + + return len(new_interpretations) != 0 + + @abstractmethod + def identify_interpretations(self, state: State) -> dict[InterpretationKey, Callable[[], Node]]: + """Analyse the node, in the context of the given before state, to identify interpretations. + + Returns, for each discovered interpretation, an identifying interpretation key that can be used + to determine if the interpretation has been produced previously, and a callable that generates + the node representing that interpretation (used to generate the node if the interpretation is new, + otherwise the previously-generated node will be reused). + """ + + def analyse(self) -> bool: + """Perform analysis of this node, by analysing each possible interpretation. + + Merges the exit states of each analysed interpretation to update the exit state of this node. + + Returns whether anything was modified. + """ + begin_seq_num = get_debug_sequence_number() + + interpretations_changed = self.update_interpretations() + + increment_debug_sequence_number() + + sub_nodes_changed = False + exit_changed = False + + key_transfer_changed: dict[InterpretationKey, bool] = {} + + for key, node in self.interpretations.items(): + transfer_changed = transfer_state(self.before_state, node.before_state) + key_transfer_changed[key] = transfer_changed + sub_nodes_changed = sub_nodes_changed or transfer_changed + + increment_debug_sequence_number() + + for key, node in self.interpretations.items(): + if key_transfer_changed[key] or not node.is_processed(): + analyse_changed = node.analyse() + sub_nodes_changed = sub_nodes_changed or analyse_changed + + for node in self.interpretations.values(): + for exit_type, exit_state in node.exit_states.items(): + if exit_type not in self.exit_states: + exit_changed = True + exit_changed = ( + transfer_state(exit_state, self.exit_states[exit_type], node.get_exit_state_transfer_filter()) + or exit_changed + ) + + self.notify_processed(begin_seq_num, get_debug_sequence_number()) + increment_debug_sequence_number() + + return interpretations_changed or sub_nodes_changed or exit_changed + + +R_co = TypeVar("R_co", covariant=True) + + +@dataclass(frozen=True) +class OwningContextRef(Generic[R_co]): + """A reference to a part of a node's context that "owns" it. + + Ownership is used to identify what scopes are tied to a particular node + such that they cease to exist or become irrelevant after the node exits, + and thus any values stored in locations within those scopes may be erased + from the state beyond that point to simplify the state. 
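+
+    For example (illustrative), a job context may hold an owning reference to a job-local
+    filesystem scope: once the job node exits, values at locations in that scope can be dropped
+    from the state, whereas a non-owning reference leaves the referenced scope's values intact.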
+    """
+
+    ref: R_co
+
+    def get_non_owned(self) -> NonOwningContextRef[R_co]:
+        """Return a non-owning reference to the same object."""
+        return NonOwningContextRef(self.ref)
+
+
+@dataclass(frozen=True)
+class NonOwningContextRef(Generic[R_co]):
+    """A reference to a part of a node's context that does not "own" it.
+
+    Ownership is used to identify what scopes are tied to a particular node
+    such that they cease to exist or become irrelevant after the node exits,
+    and thus any values stored in locations within those scopes may be erased
+    from the state beyond that point to simplify the state.
+    """
+
+    ref: R_co
+
+    def get_non_owned(self) -> NonOwningContextRef[R_co]:
+        """Return a non-owning reference to the same object."""
+        return self
+
+
+# A context ref may be owning or non-owning.
+ContextRef = OwningContextRef[R_co] | NonOwningContextRef[R_co]
+
+
+class Context(ABC):
+    """Base class for node contexts.
+
+    Represents the necessary context that influences the analysis of a node,
+    primarily that of identifying the concrete scopes that fill particular
+    roles in the node.
+    """
+
+    @abstractmethod
+    def direct_refs(self) -> Iterator[ContextRef[Context] | ContextRef[facts.Scope]]:
+        """Yield the direct references of the context, either to scopes or to other contexts."""
+
+    def owned_scopes(self) -> Iterator[OwningContextRef[facts.Scope]]:
+        """Yield the scopes that are owned by this context.
+
+        Owned scopes are those reachable purely via owning references: scopes directly referenced
+        by an owning reference, plus the owned scopes of any context that is itself referenced by
+        an owning reference.
+        """
+        for ref in self.direct_refs():
+            if isinstance(ref, OwningContextRef):
+                if isinstance(ref.ref, Context):
+                    yield from ref.ref.owned_scopes()
+                else:
+                    yield ref
+
+
+@dataclass(frozen=True)
+class AnalysisContext(Context):
+    """Outermost context of the analysis.
+
+    Records the path to the repo checkout, to allow the analysis access to files in the repo.
+    """
+
+    repo_path: str | None
+
+    def direct_refs(self) -> Iterator[ContextRef[Context] | ContextRef[facts.Scope]]:
+        """Yield nothing, as the analysis context has no direct references."""
+        yield from []
+
+
+class SimpleSequence(ControlFlowGraphNode):
+    """Control-flow-graph node representing the execution of a sequence of nodes."""
+
+    #: The sequence of nodes to execute.
+    seq: list[Node]
+    #: The control flow graph.
+    _cfg: ControlFlowGraph
+
+    def __init__(self, seq: list[Node]) -> None:
+        """Construct the control flow graph from the given sequence."""
+        super().__init__()
+        self.seq = seq
+        self._cfg = ControlFlowGraph.create_from_sequence(seq)
+
+    def children(self) -> Iterator[Node]:
+        """Yield the nodes in the sequence."""
+        yield from self.seq
+
+    def get_entry(self) -> Node:
+        """Return the entry node, the first in the sequence."""
+        return self.seq[0]
+
+    def get_successors(self, node: Node, exit_type: ExitType) -> set[Node | ExitType]:
+        """Return the successor for a given node (the next in the sequence or the exit in the case of the last node)."""
+        return self._cfg.get_successors(node, exit_type)
+
+
+class SimpleAlternatives(InterpretationNode):
+    """Interpretation node representing a concrete set of alternative nodes."""
+
+    #: The alternatives.
+ alts: list[Node] + + def __init__(self, alts: list[Node]) -> None: + """Initialize node.""" + super().__init__() + self.alts = alts + + def identify_interpretations(self, state: State) -> dict[InterpretationKey, Callable[[], Node]]: + """Return the interpretations of this node, that is, each of the alternatives.""" + + def get_alt(index: int) -> Node: + return self.alts[index] + + return {i: functools.partial(get_alt, i) for i in range(0, len(self.alts))} + + +def get_owned_scopes(context: ContextRef[Context]) -> set[facts.Scope]: + """Return the set of scopes owned via the given reference to a context. + + Returns empty if the given reference is non-owning. + """ + match context: + case OwningContextRef(ref): + return {scope.ref for scope in ref.owned_scopes()} + case NonOwningContextRef(ref): + return set() diff --git a/src/macaron/code_analyzer/dataflow_analysis/evaluation.py b/src/macaron/code_analyzer/dataflow_analysis/evaluation.py new file mode 100644 index 000000000..69d5a022c --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/evaluation.py @@ -0,0 +1,772 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Functions for evaluating and resolving dataflow analysis expressions.""" + +from __future__ import annotations + +import base64 +import os.path +from dataclasses import dataclass +from typing import TypeVar + +from frozendict import frozendict + +from macaron.code_analyzer.dataflow_analysis import bash, core, facts +from macaron.errors import CallGraphError + + +def evaluate(node: core.Node, value: facts.Value) -> set[tuple[facts.Value, ReadBindings]]: + """Evaluate the given value, at the point immediately prior to the execution of the given node. + + Parameters + ---------- + node: core.Node + The node at which to evaluate the value (i.e. in the context of the before state of the node). + value: facts.Value + The value expression to evaluate. + + Returns + ------- + set[tuple[facts.Value, ReadBindings]] + The set of possible resolved values for the value expression, each with a record of the + resolved value chosen for any read expressions. + """ + eval_transformer = EvaluationTransformer(node.before_state) + return eval_transformer.transform_value(value) + + +@dataclass(frozen=True) +class WriteStatement: + """Representation of a write to a given location of a given value.""" + + #: The location to write to. + location: facts.Location + #: The value to write. + value: facts.Value + + def perform_write(self, before_state: core.State) -> tuple[core.State, set[facts.Location]]: + """Return a state containing only the values stored by the write operation, in context of the before state. + + Also returns the set of locations within that state which should be considered to have been overwritten, + erasing any previous values. + """ + eval_transformer = EvaluationTransformer(before_state) + written_state = core.State() + evaluated_writes = eval_transformer.transform_write(self.location, self.value) + for loc, val, _ in evaluated_writes: + written_state.state[loc][val] = core.StateDebugLabel(core.get_debug_sequence_number(), False) + # Currently, never erases previous values. + return (written_state, set()) + + +@dataclass(frozen=True) +class StatementSet: + """Representation of a set of (simultaneous) write operations.""" + + #: The set of writes. 
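+    #: Note that every write in the set is evaluated against the same before state, so the
+    #: writes do not observe one another's effects.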
+    stmts: set[WriteStatement]
+
+    def apply_effects(self, before_state: core.State) -> core.State:
+        """Apply the effect of the set of writes, returning the resulting state."""
+        final_state = core.State()
+        final_overwritten_locs: set[facts.Location] = set()
+        for stmt in self.stmts:
+            written_state, overwritten_locs = stmt.perform_write(before_state)
+            for loc in overwritten_locs:
+                final_overwritten_locs.add(loc)
+            core.transfer_state(written_state, final_state, debug_is_copy=False)
+
+        core.transfer_state(before_state, final_state, core.ExcludedLocsStateTransferFilter(final_overwritten_locs))
+        return final_state
+
+    @staticmethod
+    def union(*stmt_sets: StatementSet) -> StatementSet:
+        """Combine multiple write sets into one."""
+        stmts: set[WriteStatement] = set()
+        for stmt_set in stmt_sets:
+            for stmt in stmt_set.stmts:
+                stmts.add(stmt)
+        return StatementSet(stmts)
+
+
+class ParameterPlaceholderTransformer:
+    """Expression transformer which replaces parameter placeholders with their corresponding bound values."""
+
+    #: Whether to raise an exception if a parameter is found with no provided binding.
+    allow_unbound_params: bool
+    #: Bindings for value parameter placeholders, mapping parameter name to bound value expression.
+    value_parameter_binds: dict[str, facts.Value]
+    #: Bindings for location parameter placeholders, mapping parameter name to bound location expression.
+    location_parameter_binds: dict[str, facts.LocationSpecifier]
+    #: Bindings for scope parameter placeholders, mapping parameter name to bound scope.
+    scope_parameter_binds: dict[str, facts.Scope]
+
+    def __init__(
+        self,
+        allow_unbound_params: bool = True,
+        value_parameter_binds: dict[str, facts.Value] | None = None,
+        location_parameter_binds: dict[str, facts.LocationSpecifier] | None = None,
+        scope_parameter_binds: dict[str, facts.Scope] | None = None,
+    ) -> None:
+        """Initialize transformer with bindings.
+
+        Parameters
+        ----------
+        allow_unbound_params: bool
+            Whether to raise an exception if a parameter is found with no provided binding.
+        value_parameter_binds: dict[str, facts.Value] | None
+            Bindings for value parameter placeholders, mapping parameter name to bound value expression.
+        location_parameter_binds: dict[str, facts.LocationSpecifier] | None
+            Bindings for location parameter placeholders, mapping parameter name to bound location expression.
+        scope_parameter_binds: dict[str, facts.Scope] | None
+            Bindings for scope parameter placeholders, mapping parameter name to bound scope.
+        """
+        self.allow_unbound_params = allow_unbound_params
+        self.value_parameter_binds = value_parameter_binds or {}
+        self.location_parameter_binds = location_parameter_binds or {}
+        self.scope_parameter_binds = scope_parameter_binds or {}
+
+    def transform_value(self, value: facts.Value) -> facts.Value:
+        """Transform given value expression.
+
+        Returns a value expression with any parameter placeholders replaced with their bound values.
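+
+        A minimal usage sketch (binding names hypothetical)::
+
+            transformer = ParameterPlaceholderTransformer(
+                value_parameter_binds={"version": facts.StringLiteral("1.0")}
+            )
+            bound = transformer.transform_value(facts.ParameterPlaceholderValue("version"))
+            # bound == facts.StringLiteral("1.0")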
+        """
+        match value:
+            case facts.StringLiteral(_):
+                return value
+            case facts.Read(loc):
+                new_loc = self.transform_location(loc)
+                if new_loc is loc:
+                    return value
+                return facts.Read(new_loc)
+            case facts.ArbitraryNewData(_):
+                return value
+            case facts.UnaryStringOp(op, operand):
+                new_operand = self.transform_value(operand)
+                if new_operand is operand:
+                    return value
+                return facts.UnaryStringOp(op, new_operand)
+            case facts.BinaryStringOp(op, operand1, operand2):
+                new_operand1 = self.transform_value(operand1)
+                new_operand2 = self.transform_value(operand2)
+
+                if op == facts.BinaryStringOperator.STRING_CONCAT:
+                    return facts.BinaryStringOp.get_string_concat(new_operand1, new_operand2)
+
+                # Preserve structural sharing for any other (future) binary operator.
+                if new_operand1 is operand1 and new_operand2 is operand2:
+                    return value
+                return facts.BinaryStringOp(op, new_operand1, new_operand2)
+            case facts.ParameterPlaceholderValue(name):
+                if name in self.value_parameter_binds:
+                    return self.value_parameter_binds[name]
+                if not self.allow_unbound_params:
+                    raise CallGraphError("unbound value parameter: " + name)
+                return value
+            case facts.InstalledPackage(name, version, distribution, url):
+                new_name = self.transform_value(name)
+                new_version = self.transform_value(version)
+                new_distribution = self.transform_value(distribution)
+                new_url = self.transform_value(url)
+                if new_name is name and new_version is version and new_distribution is distribution and new_url is url:
+                    return value
+                return facts.InstalledPackage(new_name, new_version, new_distribution, new_url)
+            case facts.SingleBashTokenConstraint(val):
+                new_val = self.transform_value(val)
+                if new_val is val:
+                    return value
+                return facts.SingleBashTokenConstraint(new_val)
+            case facts.Symbolic(sym_val):
+                new_sym_val = self.transform_value(sym_val)
+                if new_sym_val is sym_val:
+                    return value
+                return facts.Symbolic(new_sym_val)
+        raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__)
+
+    def transform_location(self, location: facts.Location) -> facts.Location:
+        """Transform given location expression.
+
+        Returns a location expression with any parameter placeholders replaced with their bound values.
+        """
+        new_scope = self.transform_scope(location.scope)
+        new_location_spec = self.transform_location_specifier(location.loc)
+        if new_scope is location.scope and new_location_spec is location.loc:
+            return location
+        return facts.Location(new_scope, new_location_spec)
+
+    def transform_location_specifier(self, location: facts.LocationSpecifier) -> facts.LocationSpecifier:
+        """Transform given location specifier expression.
+
+        Returns a location specifier expression with any parameter placeholders replaced with their bound values.
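+
+        For example, a ``facts.ParameterPlaceholderLocation("target")`` (name hypothetical) is
+        replaced by the specifier bound to ``"target"`` in ``location_parameter_binds`` if
+        present; otherwise it is returned unchanged, or an error is raised when
+        ``allow_unbound_params`` is false.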
+ """ + match location: + case facts.Filesystem(path): + new_path = self.transform_value(path) + if new_path is path: + return location + return facts.Filesystem(new_path) + case facts.Variable(name): + new_name = self.transform_value(name) + if new_name is name: + return location + return facts.Variable(new_name) + case facts.Artifact(name, file): + new_name = self.transform_value(name) + new_file = self.transform_value(file) + if new_name is name and new_file is file: + return location + return facts.Artifact(new_name, new_file) + case facts.FilesystemAnyUnderDir(path): + new_path = self.transform_value(path) + if new_path is path: + return location + return facts.FilesystemAnyUnderDir(new_path) + case facts.ArtifactAnyFilename(name): + new_name = self.transform_value(name) + if new_name is name: + return location + return facts.ArtifactAnyFilename(new_name) + case facts.ParameterPlaceholderLocation(name): + if name in self.location_parameter_binds: + return self.location_parameter_binds[name] + if not self.allow_unbound_params: + raise CallGraphError("unbound location parameter: " + name) + return location + case facts.Console(): + return location + case facts.Installed(name): + new_name = self.transform_value(name) + if new_name is name: + return location + return facts.Installed(new_name) + raise CallGraphError("unknown location type: " + location.__class__.__name__) + + def transform_scope(self, scope: facts.Scope) -> facts.Scope: + """Transform given scope. + + Returns a scope with any parameter placeholders replaced with their bound values. + """ + if isinstance(scope, facts.ParameterPlaceholderScope): + if scope.name in self.scope_parameter_binds: + return self.scope_parameter_binds[scope.name] + if not self.allow_unbound_params: + raise CallGraphError("unbound scope parameter: " + scope.name) + return scope + + def transform_statement(self, statement: WriteStatement) -> WriteStatement: + """Transform given write statement. + + Returns a write statement with any parameter placeholders replaced with their bound values. + """ + new_location = self.transform_location(statement.location) + new_value = self.transform_value(statement.value) + if new_location is statement.location and new_value is statement.value: + return statement + return WriteStatement(new_location, new_value) + + def transform_statement_set(self, statement_set: StatementSet) -> StatementSet: + """Transform given write statement set. + + Returns a write statement set with any parameter placeholders replaced with their bound values. + """ + changed = False + new_stmts: set[WriteStatement] = set() + for stmt in statement_set.stmts: + new_stmt = self.transform_statement(stmt) + if new_stmt is not stmt: + changed = True + new_stmts.add(new_stmt) + + if not changed: + return statement_set + return StatementSet(new_stmts) + + +T = TypeVar("T") + + +def is_singleton(s: set[T], e: T) -> bool: + """Return whether the given set contains only the single given element.""" + return len(s) == 1 and next(iter(s)) == e + + +def is_singleton_no_bindings(s: set[tuple[T, ReadBindings]], e: T) -> bool: + """Return whether the given set contains only the single given element with no read bindings.""" + return len(s) == 1 and next(iter(s)) == (e, READBINDINGS_EMPTY) + + +def scope_matches(read_scope: facts.Scope, stored_scope: facts.Scope) -> bool: + """Return whether the given read scope matches the given stored scope. + + Matching means that a read of the read scope may return values from the stored scope. 
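+
+    For example (names illustrative), given ``job_env = facts.Scope("env", outer_scope=workflow_env)``,
+    a read in ``job_env`` matches values stored in ``workflow_env``, but not the other way around.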
+ """ + cur_scope: facts.Scope | None = read_scope + while cur_scope is not None: + if cur_scope == stored_scope: + return True + cur_scope = cur_scope.outer_scope + return False + + +def location_subsumes(loc: facts.LocationSpecifier, subloc: facts.LocationSpecifier) -> bool: + """Return whether the given location subsumes the given sub location. + + Subsumption means that a read of subloc may be considered to be a read of loc or some part thereof. + """ + if loc == subloc: + return True + + match loc, subloc: + case facts.Filesystem(facts.StringLiteral(loc_path_lit)), facts.Filesystem( + facts.StringLiteral(subloc_path_lit) + ): + # Ignore superficial differences in file path due to "./" relative paths. + if ( + not loc_path_lit.startswith("/") + and not subloc_path_lit.startswith("/") + and loc_path_lit.removeprefix("./") == subloc_path_lit.removeprefix("./") + ): + return True + case facts.FilesystemAnyUnderDir(facts.StringLiteral(dir_lit)), facts.Filesystem( + facts.StringLiteral(subloc_path_lit) + ): + # A file path under the same dir as a FilesystemAnyUnderDir is subsumed. + if subloc_path_lit.startswith(dir_lit.removesuffix("/") + "/"): + return True + return False + + +def get_values_for_subsumed_read( + read_loc: facts.LocationSpecifier, state_loc: facts.LocationSpecifier, state_vals: set[facts.Value] +) -> set[facts.Value]: + """Return the set of values stored in the state location, if relevant for the given read location.""" + match read_loc, state_loc: + case facts.ArtifactAnyFilename(read_artifact_name), facts.Artifact(state_artifact_name, state_artifact_file): + if read_artifact_name == state_artifact_name: + return {state_artifact_file} + + if location_subsumes(state_loc, read_loc): + return state_vals + + return set() + + +class ReadBindings: + """Set of bindings of read expressions to values bound as the result of those read expressions.""" + + #: Mapping of read expressions to bound values. 
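+    #: Once a read is bound to a concrete value, every other use of that read in the same
+    #: evaluation must resolve to the same value; conflicting bindings are rejected.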
+ bindings: frozendict[facts.Read, facts.Value] + + def __init__(self, binds: frozendict[facts.Read, facts.Value] | None = None) -> None: + """Initialize with given bindings.""" + self.bindings = binds or frozendict() + + def __len__(self) -> int: + """Return the number of bindings in the set.""" + return len(self.bindings) + + def with_binding(self, read: facts.Read, value: facts.Value) -> ReadBindings | None: + """Return bindings with the given additional binding, or None if the bindings conflict.""" + if read in self.bindings: + if self.bindings[read] != value: + return None + return self + new_binds = self.bindings.set(read, value) + return ReadBindings(new_binds) + + def with_bindings(self, bindings: ReadBindings) -> ReadBindings | None: + """Return bindings with the given additional bindings, or None if the bindings conflict.""" + if len(bindings) == 0: + return self + if len(self) == 0: + return bindings + + for read, val in bindings.bindings.items(): + if read in self.bindings: + if self.bindings[read] != val: + return None + + combined_bindings = frozendict({**self.bindings, **bindings.bindings}) + return ReadBindings(combined_bindings) + + @staticmethod + def combine_bindings(bindings_list: list[ReadBindings]) -> ReadBindings | None: + """Return bindings combining all bindings in the given list, or None if the bindings conflict.""" + if len(bindings_list) == 0: + return READBINDINGS_EMPTY + + cur_binding: ReadBindings | None = bindings_list[0] + for bindings in bindings_list[1:]: + cur_binding = cur_binding.with_bindings(bindings) if cur_binding is not None else None + if cur_binding is None: + return None + return cur_binding + + def __hash__(self) -> int: + return hash(self.bindings) + + def __eq__(self, other: object) -> bool: + if isinstance(other, ReadBindings): + return self.bindings == other.bindings + return False + + def __repr__(self) -> str: + return str(self.bindings) + + +# Convenience instance of empty bindings. +READBINDINGS_EMPTY = ReadBindings() + + +class EvaluationTransformer: + """Expression transformer which evaluates the expression to produce a set of resolved values. + + The expression is evaluated in the context of a specified abstract storage state. + """ + + #: The state from which to resolve reads. + state: core.State + + def __init__(self, state: core.State) -> None: + """Initialize transformer with state from which to resolve reads.""" + self.state = state + + def transform_write( + self, location: facts.Location, value: facts.Value + ) -> set[tuple[facts.Location, facts.Value, ReadBindings]]: + """Transform a write location and value, returning the set of resolved values with the necessary bindings.""" + evaluated_locations = self.transform_location(location) + evaluated_values = self.transform_value(value) + result: set[tuple[facts.Location, facts.Value, ReadBindings]] = set() + for loc, loc_bindings in evaluated_locations: + for val, val_bindings in evaluated_values: + combined_bindings = loc_bindings.with_bindings(val_bindings) + if combined_bindings is not None: + result.add((loc, val, combined_bindings)) + return result + + def transform_value(self, value: facts.Value) -> set[tuple[facts.Value, ReadBindings]]: + """Transform a value expression, returning the set of resolved values with the necessary bindings.""" + match value: + case facts.StringLiteral(_): + return {(value, READBINDINGS_EMPTY)} + case facts.Read(loc): + # Read values from the state. 
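+                # A read resolves both to its symbolic form and to any stored values whose
+                # location is subsumed by the (resolved) read location. Each concrete value is
+                # recorded as a read binding so that repeated reads stay mutually consistent.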
+                new_locs = self.transform_location(loc)
+                read_vals: set[tuple[facts.Value, ReadBindings]] = set()
+                for new_loc, new_loc_bindings in new_locs:
+                    read_vals.add((facts.Symbolic(facts.Read(new_loc)), new_loc_bindings))
+
+                    for state_loc, state_vals in self.state.state.items():
+                        if scope_matches(new_loc.scope, state_loc.scope):
+                            for read_val in get_values_for_subsumed_read(
+                                new_loc.loc, state_loc.loc, set(state_vals.keys())
+                            ):
+                                combined_bindings = new_loc_bindings.with_binding(value, read_val)
+                                if combined_bindings is not None:
+                                    read_vals.add((read_val, combined_bindings))
+                return read_vals
+            case facts.ArbitraryNewData(_):
+                return {(value, READBINDINGS_EMPTY)}
+            case facts.UnaryStringOp(op, operand):
+                new_operands = self.transform_value(operand)
+                if op == facts.UnaryStringOperator.BASENAME:
+                    # Concretely evaluate basename operator for string literal.
+                    basename_result: set[tuple[facts.Value, ReadBindings]] = set()
+                    for new_operand, new_operand_bindings in new_operands:
+                        if isinstance(new_operand, facts.StringLiteral):
+                            basename_result.add(
+                                (facts.StringLiteral(os.path.basename(new_operand.literal)), new_operand_bindings)
+                            )
+                    return basename_result
+                if op == facts.UnaryStringOperator.BASE64DECODE:
+                    # Concretely evaluate base64 decode operator for string literal.
+                    base64_decode_result: set[tuple[facts.Value, ReadBindings]] = set()
+                    for new_operand, new_operand_bindings in new_operands:
+                        if isinstance(new_operand, facts.StringLiteral):
+                            base64_decode_result.add(
+                                (
+                                    facts.StringLiteral(base64.b64decode(new_operand.literal).decode("utf-8")),
+                                    new_operand_bindings,
+                                )
+                            )
+                    return base64_decode_result
+                return set()
+            case facts.BinaryStringOp(op, operand1, operand2):
+                new_operand1s = self.transform_value(operand1)
+                new_operand2s = self.transform_value(operand2)
+                if op == facts.BinaryStringOperator.STRING_CONCAT:
+                    # Concretely evaluate string concatenation for concat of 2 string literals.
+                    concat_result: set[tuple[facts.Value, ReadBindings]] = set()
+                    for new_operand1, new_operand1_bindings in new_operand1s:
+                        for new_operand2, new_operand2_bindings in new_operand2s:
+                            if isinstance(new_operand1, facts.StringLiteral) and isinstance(
+                                new_operand2, facts.StringLiteral
+                            ):
+                                combined_bindings = new_operand1_bindings.with_bindings(new_operand2_bindings)
+                                if combined_bindings is not None:
+                                    # TODO Have some truncated symbolic representation for
+                                    # excessively long strings rather than just dropping them.
+                                    if len(new_operand1.literal) + len(new_operand2.literal) < 10000:
+                                        concat_result.add(
+                                            (
+                                                facts.StringLiteral(new_operand1.literal + new_operand2.literal),
+                                                combined_bindings,
+                                            )
+                                        )
+                    return concat_result
+
+                # No concrete evaluation for other binary operators.
+                return set()
+            case facts.SingleBashTokenConstraint(operand):
+                # For single bash token constraint, to evaluate a string literal, the literal is parsed
+                # as a bash expression; if that results in a single element, then the constraint
+                # is met and the unmodified literal is returned, while if it parses as multiple
+                # elements, no resolved values are produced for that literal.
+                #
+                # Otherwise returns the constrained expression as is, while simplifying redundant
+                # multiply-nested constraints.
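+                #
+                # For example, the literal "foo" parses as a single bash token and satisfies the
+                # constraint, whereas "foo bar" parses as two tokens and yields no resolved value.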
+ # + new_operands = self.transform_value(operand) + constraint_result: set[tuple[facts.Value, ReadBindings]] = set() + for new_operand, new_operand_bindings in new_operands: + match new_operand: + case facts.StringLiteral(lit): + parsed_bash_expr = bash.parse_bash_expr(lit) + if parsed_bash_expr is not None and len(parsed_bash_expr) == 1: + constraint_result.add((new_operand, new_operand_bindings)) + + case facts.SingleBashTokenConstraint(suboperand): + constraint_result.add((facts.SingleBashTokenConstraint(suboperand), new_operand_bindings)) + case _: + constraint_result.add((facts.SingleBashTokenConstraint(new_operand), new_operand_bindings)) + return constraint_result + case facts.ParameterPlaceholderValue(name): + return set() + case facts.InstalledPackage(name, version, distribution, url): + # Resolve parameters and return every combination. + new_names = self.transform_value(name) + new_versions = self.transform_value(version) + new_distributions = self.transform_value(distribution) + new_urls = self.transform_value(url) + if ( + is_singleton_no_bindings(new_names, name) + and is_singleton_no_bindings(new_versions, version) + and is_singleton_no_bindings(new_distributions, distribution) + and is_singleton_no_bindings(new_urls, url) + ): + return {(value, READBINDINGS_EMPTY)} + result: set[tuple[facts.Value, ReadBindings]] = set() + for new_name, new_name_bindings in new_names: + for new_version, new_version_bindings in new_versions: + version_combined_bindings = new_name_bindings.with_bindings(new_version_bindings) + if version_combined_bindings is None: + continue + for new_distribution, new_distribution_bindings in new_distributions: + distribution_combined_bindings = version_combined_bindings.with_bindings( + new_distribution_bindings + ) + if distribution_combined_bindings is None: + continue + for new_url, new_url_bindings in new_urls: + url_combined_bindings = distribution_combined_bindings.with_bindings(new_url_bindings) + if url_combined_bindings is not None: + result.add( + ( + facts.InstalledPackage(new_name, new_version, new_distribution, new_url), + url_combined_bindings, + ) + ) + return result + case facts.Symbolic(_): + return {(value, READBINDINGS_EMPTY)} + raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__) + + def transform_location(self, location: facts.Location) -> set[tuple[facts.Location, ReadBindings]]: + """Transform a location expression, returning the set of resolved values with the necessary bindings.""" + new_location_specs = self.transform_location_specifier(location.loc) + if is_singleton_no_bindings(new_location_specs, location.loc): + return {(location, READBINDINGS_EMPTY)} + return { + (facts.Location(location.scope, new_location_spec), new_location_spec_bindings) + for new_location_spec, new_location_spec_bindings in new_location_specs + } + + def transform_location_specifier( + self, location: facts.LocationSpecifier + ) -> set[tuple[facts.LocationSpecifier, ReadBindings]]: + """Transform a location specifier expression, returning the set of resolved values with the necessary bindings.""" + match location: + case facts.Filesystem(path): + new_paths = self.transform_value(path) + if is_singleton_no_bindings(new_paths, path): + return {(location, READBINDINGS_EMPTY)} + return {(facts.Filesystem(new_path), new_path_bindings) for new_path, new_path_bindings in new_paths} + case facts.Variable(name): + new_names = self.transform_value(name) + if is_singleton_no_bindings(new_names, name): + return {(location, 
READBINDINGS_EMPTY)}
+                return {(facts.Variable(new_name), new_name_bindings) for new_name, new_name_bindings in new_names}
+            case facts.Artifact(name, file):
+                new_names = self.transform_value(name)
+                new_files = self.transform_value(file)
+                if is_singleton_no_bindings(new_names, name) and is_singleton_no_bindings(new_files, file):
+                    return {(location, READBINDINGS_EMPTY)}
+                artifact_result: set[tuple[facts.LocationSpecifier, ReadBindings]] = set()
+                for new_name, new_name_bindings in new_names:
+                    for new_file, new_file_bindings in new_files:
+                        combined_bindings = new_name_bindings.with_bindings(new_file_bindings)
+                        if combined_bindings is not None:
+                            artifact_result.add((facts.Artifact(new_name, new_file), combined_bindings))
+                return artifact_result
+            case facts.FilesystemAnyUnderDir(path):
+                new_paths = self.transform_value(path)
+                if is_singleton_no_bindings(new_paths, path):
+                    return {(location, READBINDINGS_EMPTY)}
+                return {
+                    (facts.FilesystemAnyUnderDir(new_path), new_path_bindings)
+                    for new_path, new_path_bindings in new_paths
+                }
+            case facts.ArtifactAnyFilename(name):
+                new_names = self.transform_value(name)
+                if is_singleton_no_bindings(new_names, name):
+                    return {(location, READBINDINGS_EMPTY)}
+                return {
+                    (facts.ArtifactAnyFilename(new_name), new_name_bindings)
+                    for new_name, new_name_bindings in new_names
+                }
+            case facts.ParameterPlaceholderLocation(name):
+                return {(location, READBINDINGS_EMPTY)}
+            case facts.Console():
+                return {(location, READBINDINGS_EMPTY)}
+            case facts.Installed(name):
+                new_names = self.transform_value(name)
+                return {(facts.Installed(new_name), new_name_bindings) for new_name, new_name_bindings in new_names}
+        raise CallGraphError("unknown location type: " + location.__class__.__name__)
+
+
+# TODO generalise visitors
+class ContainsSymbolicVisitor:
+    """Visitor to determine whether a given expression contains any symbolic expressions."""
+
+    def visit_value(self, value: facts.Value) -> bool:
+        """Search value expression for symbolic expressions and return whether any were found."""
+        match value:
+            case facts.StringLiteral(_):
+                return False
+            case facts.Read(loc):
+                return self.visit_location(loc)
+            case facts.ArbitraryNewData(_):
+                return False
+            case facts.UnaryStringOp(_, operand):
+                return self.visit_value(operand)
+            case facts.BinaryStringOp(_, operand1, operand2):
+                return self.visit_value(operand1) or self.visit_value(operand2)
+            case facts.ParameterPlaceholderValue(name):
+                return False
+            case facts.InstalledPackage(name, version, distribution, url):
+                return (
+                    self.visit_value(name)
+                    or self.visit_value(version)
+                    or self.visit_value(distribution)
+                    or self.visit_value(url)
+                )
+            case facts.SingleBashTokenConstraint(operand):
+                return self.visit_value(operand)
+            case facts.Symbolic(_):
+                return True
+        raise CallGraphError("unknown facts.Value type: " + value.__class__.__name__)
+
+    def visit_location(self, location: facts.Location) -> bool:
+        """Search location expression for symbolic expressions and return whether any were found."""
+        return self.visit_location_specifier(location.loc)
+
+    def visit_location_specifier(self, location: facts.LocationSpecifier) -> bool:
+        """Search location specifier expression for symbolic expressions and return whether any were found."""
+        match location:
+            case facts.Filesystem(path):
+                return self.visit_value(path)
+            case facts.Variable(name):
+                return self.visit_value(name)
+            case facts.Artifact(name, file):
+                return self.visit_value(name) or self.visit_value(file)
+            case 
facts.FilesystemAnyUnderDir(path): + return self.visit_value(path) + case facts.ArtifactAnyFilename(name): + return self.visit_value(name) + case facts.ParameterPlaceholderLocation(name): + return False + case facts.Console(): + return False + case facts.Installed(name): + return self.visit_value(name) + raise CallGraphError("unknown location type: " + location.__class__.__name__) + + +def filter_symbolic_values(values: set[tuple[facts.Value, ReadBindings]]) -> set[tuple[facts.Value, ReadBindings]]: + """Filter out symbolic values. + + Returns a set containing all elements from the given set that do not contain any symbolic expressions. + """ + return {val for val in values if not ContainsSymbolicVisitor().visit_value(val[0])} + + +def filter_symbolic_locations( + locs: set[tuple[facts.Location, ReadBindings]], +) -> set[tuple[facts.Location, ReadBindings]]: + """Filter out symbolic locations. + + Returns a set containing all elements from the given set that do not contain any symbolic expressions. + """ + return {loc for loc in locs if not ContainsSymbolicVisitor().visit_location(loc[0])} + + +def filter_symbolic_location_specifiers( + locs: set[tuple[facts.LocationSpecifier, ReadBindings]], +) -> set[tuple[facts.LocationSpecifier, ReadBindings]]: + """Filter out symbolic location specifiers. + + Returns a set containing all elements from the given set that do not contain any symbolic expressions. + """ + return {loc for loc in locs if not ContainsSymbolicVisitor().visit_location_specifier(loc[0])} + + +def get_single_resolved_str(resolved_values: set[tuple[facts.Value, ReadBindings]]) -> str | None: + """If the given set contains only a single string literal value, return that string, or else None.""" + resolved_values = filter_symbolic_values(resolved_values) + if len(resolved_values) == 1: + val = next(iter(resolved_values))[0] + if isinstance(val, facts.StringLiteral): + return val.literal + return None + + +def get_single_resolved_str_with_default( + resolved_values: set[tuple[facts.Value, ReadBindings]], default_value: str +) -> str: + """If the given set contains only a single string literal value, return that string, else return default value.""" + result = get_single_resolved_str(resolved_values) + if result is not None: + return result + return default_value + + +def parse_str_expr_split(str_expr: facts.Value, delimiter_char: str, maxsplit: int = -1) -> list[facts.Value]: + """Split a string expression on the appearance of the delimiter char in literal parts of the expression.""" + if len(delimiter_char) != 1: + raise CallGraphError("delimiter_char must be single char") + + match str_expr: + case facts.StringLiteral(literal): + split_str = literal.split(delimiter_char, maxsplit=maxsplit) + return [facts.StringLiteral(s) for s in split_str] + case facts.BinaryStringOp(facts.BinaryStringOperator.STRING_CONCAT, o1, o2): + split_lhs = parse_str_expr_split(o1, delimiter_char, maxsplit) + split_rhs = parse_str_expr_split( + o2, delimiter_char, -1 if maxsplit == -1 else maxsplit - (len(split_lhs) - 1) + ) + if len(split_lhs) == 1 and len(split_rhs) == 1: + return [str_expr] + return ( + split_lhs[:-1] + [facts.BinaryStringOp.get_string_concat(split_lhs[-1], split_rhs[0])] + split_rhs[1:] + ) + return [str_expr] diff --git a/src/macaron/code_analyzer/dataflow_analysis/facts.py b/src/macaron/code_analyzer/dataflow_analysis/facts.py new file mode 100644 index 000000000..28d0f869d --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/facts.py @@ -0,0 +1,702 @@ +# Copyright (c) 
2023 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""Definitions of the dataflow analysis representation for value expressions and abstract storage locations.
+
+Also includes an incomplete implementation of serialization/deserialization to a Souffle-datalog-compatible
+representation. This is a remnant of an earlier prototype that involved the datalog engine in the analysis. It is
+retained here because the serialization is useful for producing a human-readable string representation for debugging,
+and it may be necessary in future to make these expressions available to the policy engine (which uses datalog).
+Deserialization is currently non-functional, primarily due to the inability to deserialize scope identity, but may
+be revisited in future, so it is left here for posterity.
+"""
+
+from __future__ import annotations
+
+import abc
+from dataclasses import dataclass
+from enum import Enum, auto
+
+from macaron.errors import CallGraphError, ParseError
+
+
+class Value(abc.ABC):
+    """Base class for value expressions.
+
+    Subclasses should be comparable by structural equality.
+    """
+
+    @abc.abstractmethod
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+
+    def __str__(self) -> str:
+        return self.to_datalog_fact_string()
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+class LocationSpecifier(abc.ABC):
+    """Base class for location expressions.
+
+    Subclasses should be comparable by structural equality.
+    """
+
+    @abc.abstractmethod
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+
+    def __str__(self) -> str:
+        return self.to_datalog_fact_string()
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+# Sequence number to automatically give scopes unique names.
+# Note: not thread-safe.
+SCOPE_SEQUENCE_NUMBER = 0
+
+
+class Scope:
+    """Representation of a scope in which a location may exist.
+
+    This allows for distinct locations with the same name/path/expression to exist separately in different namespaces.
+
+    A scope may have an outer scope, such that a read from a scope may return values from
+    the outer scope(s).
+
+    Unlike other expression classes, scopes are distinguished by object identity and not
+    structural equality (TODO now that scopes have names, maybe should revisit this since
+    it makes serialization/deserialization difficult).
+    """
+
+    #: Name for display purposes.
+    identifier: str
+    #: Outer scope, if any.
+    outer_scope: Scope | None
+
+    def __init__(self, name: str, outer_scope: Scope | None = None) -> None:
+        """Initialize scope.
+
+        Parameters
+        ----------
+        name: str
+            Name for display purposes (a sequence number is automatically prepended to make it unique).
+        outer_scope: Scope | None
+            Outer scope, if any.
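+
+        Example (illustrative)::
+
+            workflow_env = Scope("workflow_env")
+            job_env = Scope("job_env", outer_scope=workflow_env)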
+        """
+        self.outer_scope = outer_scope
+        global SCOPE_SEQUENCE_NUMBER  # pylint: disable=global-statement
+        self.identifier = str(SCOPE_SEQUENCE_NUMBER) + "_" + name
+        SCOPE_SEQUENCE_NUMBER += 1
+
+    def __hash__(self) -> int:
+        return id(self)
+
+    def __eq__(self, other: object) -> bool:
+        return self is other
+
+    def to_datalog_fact_string(self, include_outer_scope: bool = False) -> str:
+        """Return string representation of scope (in datalog serialized format)."""
+        return (
+            "$Scope("
+            + enquote_datalog_string_literal(self.identifier)
+            + (
+                ", " + self.outer_scope.to_datalog_fact_string()
+                if include_outer_scope and self.outer_scope is not None
+                else ""
+            )
+            + ")"
+        )
+
+    def __str__(self) -> str:
+        return self.to_datalog_fact_string()
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+class ParameterPlaceholderScope(Scope):
+    """Special scope placeholder to allow generic parameterized expressions.
+
+    TODO This is not really a proper subclass of Scope, should revisit type relationship.
+    """
+
+    #: Parameter name.
+    name: str
+
+    def __init__(self, name: str) -> None:  # pylint: disable=super-init-not-called
+        """Initialize placeholder scope with given parameter name."""
+        self.identifier = "param_" + name
+        self.name = name
+
+    def __hash__(self) -> int:
+        return hash(self.name)
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, ParameterPlaceholderScope) and other.name == self.name
+
+    def to_datalog_fact_string(self, include_outer_scope: bool = False) -> str:
+        """Return string representation of scope (in datalog serialized format)."""
+        return "$ParameterPlaceholderScope(" + enquote_datalog_string_literal(self.name) + ")"
+
+    def __str__(self) -> str:
+        return self.to_datalog_fact_string()
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+@dataclass(frozen=True, repr=False)
+class Location:
+    """A location expression qualified with the scope it resides in."""
+
+    #: Scope the location resides in.
+    scope: Scope
+    #: Location expression.
+    loc: LocationSpecifier
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "[" + self.scope.to_datalog_fact_string() + ", " + self.loc.to_datalog_fact_string() + "]"
+
+    def __str__(self) -> str:
+        return self.to_datalog_fact_string()
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+@dataclass(frozen=True, repr=False)
+class StringLiteral(Value):
+    """Value expression representing a string literal."""
+
+    #: String literal.
+    literal: str
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$StringLiteral(" + enquote_datalog_string_literal(self.literal) + ")"
+
+
+@dataclass(frozen=True, repr=False)
+class Read(Value):
+    """Value expression representing a read of the value stored at a location."""
+
+    #: Read value location.
+    loc: Location
+
+    def to_datalog_fact_string(self) -> str:
+        """Return string representation of expression (in datalog serialized format)."""
+        return "$Read(" + self.loc.to_datalog_fact_string() + ")"
+
+
+@dataclass(frozen=True, repr=False)
+class ArbitraryNewData(Value):
+    """Value expression representing some arbitrary data."""
+
+    #: Name distinguishing the origin of the data.
+ at: str + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$ArbitraryNewData(" + enquote_datalog_string_literal(self.at) + ")" + + +@dataclass(frozen=True, repr=False) +class InstalledPackage(Value): + """Value expression representing an installed package, with identifying metadata (name, version, etc.).""" + + #: Package name. + name: Value + #: Package version. + version: Value + #: Package distribution. + distribution: Value + #: URL of the package. + url: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return ( + "$InstalledPackage(" + + self.name.to_datalog_fact_string() + + ", " + + self.version.to_datalog_fact_string() + + ", " + + self.distribution.to_datalog_fact_string() + + ", " + + self.url.to_datalog_fact_string() + + ")" + ) + + +class UnaryStringOperator(Enum): + """Unary operators.""" + + BASENAME = auto() + BASE64_ENCODE = auto() + BASE64DECODE = auto() + + +def un_op_to_datalog_fact_string(op: UnaryStringOperator) -> str: + """Return string representation of operator (in datalog serialized format).""" + if op == UnaryStringOperator.BASENAME: + return "$BaseName" + if op == UnaryStringOperator.BASE64_ENCODE: + return "$Base64Encode" + if op == UnaryStringOperator.BASE64DECODE: + return "$Base64Decode" + raise CallGraphError("unknown UnaryStringOperator") + + +class BinaryStringOperator(Enum): + """Binary operators.""" + + STRING_CONCAT = auto() + + +def bin_op_to_datalog_fact_string(op: BinaryStringOperator) -> str: + """Return string representation of operator (in datalog serialized format).""" + if op == BinaryStringOperator.STRING_CONCAT: + return "$StringConcat" + raise CallGraphError("unknown BinaryStringOperator") + + +@dataclass(frozen=True, repr=False) +class UnaryStringOp(Value): + """Value expression representing a unary operator.""" + + #: Operator. + op: UnaryStringOperator + #: Operand value. + operand: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return ( + "$UnaryStringOp(" + + un_op_to_datalog_fact_string(self.op) + + ", " + + self.operand.to_datalog_fact_string() + + ")" + ) + + +@dataclass(frozen=True, repr=False) +class BinaryStringOp(Value): + """Value expression representing a binary operator.""" + + #: Operator. + op: BinaryStringOperator + #: First operand value. + operand1: Value + #: Second operand value. + operand2: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return ( + "$BinaryStringOp(" + + bin_op_to_datalog_fact_string(self.op) + + ", " + + self.operand1.to_datalog_fact_string() + + ", " + + self.operand2.to_datalog_fact_string() + + ")" + ) + + @staticmethod + def get_string_concat(operand1: Value, operand2: Value) -> Value: + """Construct a string concatenation operator. + + Applies some simple constant-folding simplifications. 
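+
+        For example::
+
+            BinaryStringOp.get_string_concat(StringLiteral("a"), StringLiteral("b"))
+            # -> StringLiteral("ab")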
+ """ + match operand1, operand2: + # "a" + "b" = "ab" + case StringLiteral(op1_lit), StringLiteral(op2_lit): + return StringLiteral(op1_lit + op2_lit) + # "" + x = x + case StringLiteral(""), _: + return operand2 + # x + "" = x + case _, StringLiteral(""): + return operand1 + # (x + "a") + "b" = x + "ab" + case BinaryStringOp(BinaryStringOperator.STRING_CONCAT, subop1, StringLiteral(subop2_lit)), StringLiteral( + op2_lit + ): + return BinaryStringOp(BinaryStringOperator.STRING_CONCAT, subop1, StringLiteral(subop2_lit + op2_lit)) + # "a" + ("b" + x) = "ab" + x + case StringLiteral(op1_lit), BinaryStringOp( + BinaryStringOperator.STRING_CONCAT, StringLiteral(subop1_lit), subop2 + ): + return BinaryStringOp(BinaryStringOperator.STRING_CONCAT, StringLiteral(op1_lit + subop1_lit), subop2) + + return BinaryStringOp(BinaryStringOperator.STRING_CONCAT, operand1, operand2) + + +@dataclass(frozen=True, repr=False) +class ParameterPlaceholderValue(Value): + """Special placeholder value to allow generic parameterized expressions.""" + + #: Parameter name. + name: str + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$ParameterPlaceholderValue(" + enquote_datalog_string_literal(self.name) + ")" + + +@dataclass(frozen=True, repr=False) +class Symbolic(Value): + """Value expression representing a symbolic expression. + + Represents an expression that has been "frozen" in symbolic form rather than evaluated concretely. + """ + + #: Symbolic expression. + val: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Symbolic(" + self.val.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class SingleBashTokenConstraint(Value): + """Value expression representing a constraint that the underlying value does not parse as multiple Bash tokens.""" + + #: Constrained expression. + val: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$SingleBashTokenConstraint(" + self.val.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class Filesystem(LocationSpecifier): + """Location expression representing a filesystem location at a particular file path.""" + + #: Filepath value. + path: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Filesystem(" + self.path.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class Variable(LocationSpecifier): + """Location expression representing a variable.""" + + #: Variable name. + name: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Variable(" + self.name.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class Artifact(LocationSpecifier): + """Location expression representing a file stored within some named artifact storage location.""" + + #: Artifact name. + name: Value + #: File name within artifact. 
+ file: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Artifact(" + self.name.to_datalog_fact_string() + ", " + self.file.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class FilesystemAnyUnderDir(LocationSpecifier): + """Location expression representing any file under a particular directory.""" + + #: Directory file path. + path: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$FilesystemAnyUnderDir(" + self.path.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class ArtifactAnyFilename(LocationSpecifier): + """Location expression representing any file contained with a named artifact storage location.""" + + #: Artifact name. + name: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$ArtifactAnyFilename(" + self.name.to_datalog_fact_string() + ")" + + +@dataclass(frozen=True, repr=False) +class ParameterPlaceholderLocation(LocationSpecifier): + """Special placeholder location expression to allow generic parameterized expressions.""" + + #: Parameter name. + name: str + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$ParameterPlaceholderLocation(" + enquote_datalog_string_literal(self.name) + ")" + + +@dataclass(frozen=True, repr=False) +class Console(LocationSpecifier): + """Location expression representing a console, pipe or other text stream.""" + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Console" + + +@dataclass(frozen=True, repr=False) +class Installed(LocationSpecifier): + """Location expression representing an installed package.""" + + #: Package name. + name: Value + + def to_datalog_fact_string(self) -> str: + """Return string representation of expression (in datalog serialized format).""" + return "$Installed(" + self.name.to_datalog_fact_string() + ")" + + +def enquote_datalog_string_literal(literal: str) -> str: + """Enquote a datalog string literal, with appropriate escaping.""" + return '"' + literal.replace("\\", "\\\\").replace('"', '\\"') + '"' + + +class FactParseError(Exception): + """Happens when an error occurs during fact parsing.""" + + +def consume_whitespace(text: str) -> str: + """Consume leading whitespace, returning the remainder to the text.""" + text_end_idx = len(text) + space_end_idx = text_end_idx + idx = 0 + while idx < text_end_idx: + if text[idx].isspace(): + idx = idx + 1 + else: + space_end_idx = idx + break + return text[space_end_idx:text_end_idx] + + +def consume(text: str, token: str) -> str: + """Consume the leading token from the text. + + Raises exception if text does not start with the token. + """ + if text.startswith(token): + return text[len(token) :] + raise FactParseError(text) + + +def parse_qualified_name(text: str) -> tuple[str, str]: + """Parse a qualified name, returning the name and the remainder of the text.""" + text = consume_whitespace(text) + text_end_idx = len(text) + name_end_idx = text_end_idx + idx = 0 + while idx < text_end_idx: + if text[idx].isalnum() or text[idx] == "_" or text[idx] == "?" 
or text[idx] == ".": + idx = idx + 1 + else: + name_end_idx = idx + break + return text[0:name_end_idx], text[name_end_idx:text_end_idx] + + +def parse_symbol(text: str) -> tuple[str, str]: + """Parse datalog-serialized string literal.""" + text = consume(text, '"') + text_end_idx = len(text) + str_end_idx = text_end_idx + idx = 0 + in_escape = False + char_list = [] + while idx < text_end_idx: + if text[idx] == "\\": + if not in_escape: + in_escape = True + else: + char_list.append("\\") + in_escape = False + elif text[idx] == '"': + if not in_escape: + str_end_idx = idx + break + char_list.append('"') + in_escape = False + else: + char_list.append(text[idx]) + idx = idx + 1 + + lit = "".join(char_list) + text = text[str_end_idx:] + text = consume(text, '"') + return lit, text + + +def parse_location_specifier(text: str) -> tuple[LocationSpecifier, str]: + """Deserialize location specifier from string representation (in datalog serialized format).""" + text = consume(text, "$") + kind, text = parse_qualified_name(text) + match kind: + case "Filesystem": + text = consume(text, "(") + path_val, text = parse_value(text) + text = consume_whitespace(text) + text = consume(text, ")") + return Filesystem(path_val), text + case "Variable": + text = consume(text, "(") + name_val, text = parse_value(text) + text = consume_whitespace(text) + text = consume(text, ")") + return Variable(name_val), text + case "Artifact": + text = consume(text, "(") + name_val, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + file_val, text = parse_value(text) + text = consume(text, ")") + return Artifact(name_val, file_val), text + case "FilesystemAnyUnderDir": + text = consume(text, "(") + path_val, text = parse_value(text) + text = consume_whitespace(text) + text = consume(text, ")") + return FilesystemAnyUnderDir(path_val), text + case "ArtifactAnyFilename": + text = consume(text, "(") + name_val, text = parse_value(text) + text = consume_whitespace(text) + text = consume(text, ")") + return ArtifactAnyFilename(name_val), text + case "Console": + return Console(), text + case "Installed": + text = consume(text, "(") + name_val, text = parse_value(text) + text = consume_whitespace(text) + text = consume(text, ")") + return Installed(name_val), text + + raise FactParseError() + + +def parse_location(text: str) -> tuple[Location, str]: + """Deserialize location from string representation (in datalog serialized format). + + Currently non-functional primarily due to the inability to deserialize scope identity. 
+ """ + raise ParseError("cannot parse, need fix") + + +def parse_value(text: str) -> tuple[Value, str]: + """Deserialize value expression from string representation (in datalog serialized format).""" + text = consume(text, "$") + kind, text = parse_qualified_name(text) + match kind: + case "StringLiteral": + text = consume(text, "(") + lit, text = parse_symbol(text) + text = consume_whitespace(text) + text = consume(text, ")") + return StringLiteral(lit), text + case "Read": + text = consume(text, "(") + loc, text = parse_location(text) + text = consume_whitespace(text) + text = consume(text, ")") + return Read(loc), text + case "ArbitraryNewData": + text = consume(text, "(") + at, text = parse_symbol(text) + text = consume_whitespace(text) + text = consume(text, ")") + return ArbitraryNewData(at), text + case "UnaryStringOp": + text = consume(text, "(") + un_operator, text = parse_un_op(text) + text = consume(text, ",") + text = consume_whitespace(text) + operand_val, text = parse_value(text) + text = consume(text, ")") + return UnaryStringOp(un_operator, operand_val), text + case "BinaryStringOp": + text = consume(text, "(") + bin_operator, text = parse_bin_op(text) + text = consume(text, ",") + text = consume_whitespace(text) + operand1, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + operand2, text = parse_value(text) + text = consume(text, ")") + return BinaryStringOp(bin_operator, operand1, operand2), text + case "ParameterPlaceholderValue": + text = consume(text, "(") + name, text = parse_symbol(text) + text = consume_whitespace(text) + text = consume(text, ")") + return ParameterPlaceholderValue(name), text + case "SingleBashTokenConstraint": + text = consume(text, "(") + operand, text = parse_value(text) + text = consume(text, ")") + return SingleBashTokenConstraint(operand), text + case "InstalledPackage": + text = consume(text, "(") + name_val, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + version_val, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + distribution_val, text = parse_value(text) + text = consume(text, ",") + text = consume_whitespace(text) + url_val, text = parse_value(text) + text = consume(text, ")") + return InstalledPackage(name_val, version_val, distribution_val, url_val), text + raise FactParseError() + + +def parse_un_op(text: str) -> tuple[UnaryStringOperator, str]: + """Deserialize unary operator from string representation (in datalog serialized format).""" + text = consume(text, "$") + name, text = parse_qualified_name(text) + match name: + case "BaseName": + return UnaryStringOperator.BASENAME, text + case "Base64Encode": + return UnaryStringOperator.BASE64_ENCODE, text + case "Base64Decode": + return UnaryStringOperator.BASE64DECODE, text + raise FactParseError() + + +def parse_bin_op(text: str) -> tuple[BinaryStringOperator, str]: + """Deserialize binary operator from string representation (in datalog serialized format).""" + text = consume(text, "$") + name, text = parse_qualified_name(text) + match name: + case "StringConcat": + return BinaryStringOperator.STRING_CONCAT, text + raise FactParseError() diff --git a/src/macaron/code_analyzer/dataflow_analysis/github.py b/src/macaron/code_analyzer/dataflow_analysis/github.py new file mode 100644 index 000000000..6da30e745 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/github.py @@ -0,0 +1,1314 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. 
All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Dataflow analysis implementation for analysing GitHub Actions Workflow build pipelines.""" + +from __future__ import annotations + +from collections import defaultdict +from collections.abc import Callable, Iterator +from dataclasses import dataclass +from graphlib import TopologicalSorter + +from macaron.code_analyzer.dataflow_analysis import bash, core, evaluation, facts, github_expr, models, printing +from macaron.errors import CallGraphError +from macaron.parsers import github_workflow_model + + +@dataclass(frozen=True) +class GitHubActionsWorkflowContext(core.Context): + """Context for the top-level scope of a GitHub Actions Workflow.""" + + #: Outer analysis context. + analysis_context: core.ContextRef[core.AnalysisContext] + #: Scope for artifact storage within the pipeline execution (for upload/download artifact). + artifacts: core.ContextRef[facts.Scope] + #: Scope for artifacts published as GitHub releases by the pipeline. + releases: core.ContextRef[facts.Scope] + #: Scope for environment variables (env block at top-level of workflow). + env: core.ContextRef[facts.Scope] + #: Scope for variables within the workflow. + workflow_variables: core.ContextRef[facts.Scope] + #: Scope for console output. + console: core.ContextRef[facts.Scope] + #: Filepath of workflow file. + source_filepath: str + + @staticmethod + def create( + analysis_context: core.ContextRef[core.AnalysisContext], source_filepath: str + ) -> GitHubActionsWorkflowContext: + """Create a new workflow context and its associated scopes. + + Parameters + ---------- + analysis_context: core.ContextRef[core.AnalysisContext] + Outer analysis context. + source_filepath: str + Filepath of workflow file. + + Returns + ------- + GitHubActionsWorkflowContext + The new workflow context. + """ + return GitHubActionsWorkflowContext( + analysis_context=analysis_context.get_non_owned(), + artifacts=core.OwningContextRef(facts.Scope("artifacts")), + releases=core.OwningContextRef(facts.Scope("releases")), + env=core.OwningContextRef(facts.Scope("env")), + workflow_variables=core.OwningContextRef(facts.Scope("workflow_vars")), + console=core.OwningContextRef(facts.Scope("console")), + source_filepath=source_filepath, + ) + + def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]: + """Yield the direct references of the context, either to scopes or to other contexts.""" + yield self.analysis_context + yield self.artifacts + yield self.releases + yield self.env + yield self.workflow_variables + yield self.console + + +@dataclass(frozen=True) +class GitHubActionsJobContext(core.Context): + """Context for a job within a GitHub Actions Workflow.""" + + #: Outer workflow context. + workflow_context: core.ContextRef[GitHubActionsWorkflowContext] + #: Scope for filesystem used by the job and its steps. + filesystem: core.ContextRef[facts.Scope] + #: Scope for environment variables (env block at job level). + env: core.ContextRef[facts.Scope] + #: Scope for variables within the job (step output variables, etc.). + job_variables: core.ContextRef[facts.Scope] + + @staticmethod + def create(workflow_context: core.ContextRef[GitHubActionsWorkflowContext]) -> GitHubActionsJobContext: + """Create a new job context and its associated scopes. + + Env and job variables scopes inherit from outer context. 
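+
+        The filesystem scope is created fresh for each job, since each job runs on its own runner.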
+
+        Parameters
+        ----------
+        workflow_context: core.ContextRef[GitHubActionsWorkflowContext]
+            Outer workflow context.
+
+        Returns
+        -------
+        GitHubActionsJobContext
+            The new job context.
+        """
+        return GitHubActionsJobContext(
+            workflow_context=workflow_context.get_non_owned(),
+            filesystem=core.OwningContextRef(facts.Scope("filesystem")),
+            env=core.OwningContextRef(facts.Scope("env", workflow_context.ref.env.ref)),
+            job_variables=core.OwningContextRef(facts.Scope("job_vars", workflow_context.ref.workflow_variables.ref)),
+        )
+
+    def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]:
+        """Yield the direct references of the context, either to scopes or to other contexts."""
+        yield self.workflow_context
+        yield self.filesystem
+        yield self.env
+        yield self.job_variables
+
+
+@dataclass(frozen=True)
+class GitHubActionsStepContext(core.Context):
+    """Context for a step within a job within a GitHub Actions Workflow."""
+
+    #: Outer job context.
+    job_context: core.ContextRef[GitHubActionsJobContext]
+    #: Scope for environment variables (env block at step level).
+    env: core.ContextRef[facts.Scope]
+    #: Name prefix for step output variables (stored in the job variables)
+    #: belonging to this step (e.g. "steps.step_id.outputs.").
+    output_var_prefix: str | None
+
+    @staticmethod
+    def create(job_context: core.ContextRef[GitHubActionsJobContext], step_id: str | None) -> GitHubActionsStepContext:
+        """Create a new step context and its associated scopes.
+
+        Env scope inherits from outer context. Output var prefix is derived from step_id.
+
+        Parameters
+        ----------
+        job_context: core.ContextRef[GitHubActionsJobContext]
+            Outer job context.
+        step_id: str | None
+            Step id. If provided, used to derive the name prefix for step output variables.
+
+        Returns
+        -------
+        GitHubActionsStepContext
+            The new step context.
+        """
+        return GitHubActionsStepContext(
+            job_context=job_context.get_non_owned(),
+            env=core.OwningContextRef(facts.Scope("env", job_context.ref.env.ref)),
+            output_var_prefix=("steps." + step_id + ".outputs.") if step_id is not None else None,
+        )
+
+    def direct_refs(self) -> Iterator[core.ContextRef[core.Context] | core.ContextRef[facts.Scope]]:
+        """Yield the direct references of the context, either to scopes or to other contexts."""
+        yield self.job_context
+        yield self.env
+
+
+class RawGitHubActionsWorkflowNode(core.InterpretationNode):
+    """Interpretation node representing a GitHub Actions Workflow.
+
+    Defines how to interpret a parsed workflow and generate its analysis representation.
+    """
+
+    #: Parsed workflow AST.
+    definition: github_workflow_model.Workflow
+
+    #: Workflow context.
+    context: core.ContextRef[GitHubActionsWorkflowContext]
+
+    def __init__(
+        self, definition: github_workflow_model.Workflow, context: core.ContextRef[GitHubActionsWorkflowContext]
+    ) -> None:
+        """Initialize node.
+
+        Typically, construction should be done via the create function rather than using this constructor directly.
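+
+        Parameters
+        ----------
+        definition: github_workflow_model.Workflow
+            Parsed workflow AST.
+        context: core.ContextRef[GitHubActionsWorkflowContext]
+            Workflow context.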
+ """ + super().__init__() + self.definition = definition + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret the workflow AST to generate control flow representation.""" + + def build_workflow_node() -> core.Node: + return GitHubActionsWorkflowNode.create(self.definition, self.context.get_non_owned()) + + return {"default": build_workflow_node} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the workflow name and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "name" in self.definition: + result["workflow name"] = {(None, self.definition["name"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + @staticmethod + def create( + workflow: github_workflow_model.Workflow, + analysis_context: core.ContextRef[core.AnalysisContext], + source_filepath: str, + ) -> RawGitHubActionsWorkflowNode: + """Create workflow node and its associated context. + + Parameters + ---------- + workflow: github_workflow_model.Workflow + Parsed workflow AST. + analysis_context: core.ContextRef[core.AnalysisContext] + Outer analysis context. + source_filepath: str + Filepath of workflow file. + + Returns + ------- + RawGitHubActionsWorkflowNode + The new workflow node. + """ + workflow_context = GitHubActionsWorkflowContext.create(analysis_context, source_filepath) + + return RawGitHubActionsWorkflowNode(workflow, core.OwningContextRef(workflow_context)) + + +class GitHubActionsWorkflowNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a GitHub Actions Workflow. + + Control flow structure executes each job in an arbitrary linear sequence + (by default a topological sort satsifying the job dependencies). If an env block exists, + it is applied beforehand. + """ + + #: Parsed workflow AST. + definition: github_workflow_model.Workflow + #: Workflow context. + context: core.ContextRef[GitHubActionsWorkflowContext] + #: Node to apply effects of env block, if any. + env_block: RawGitHubActionsEnvNode | None + #: Job nodes, identified by their job id. + jobs: dict[str, RawGitHubActionsJobNode] + #: List of job ids specifying job execution order. + order: list[str] + #: Control flow graph. + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: github_workflow_model.Workflow, + context: core.ContextRef[GitHubActionsWorkflowContext], + env_block: RawGitHubActionsEnvNode | None, + jobs: dict[str, RawGitHubActionsJobNode], + order: list[str], + ) -> None: + """Initialize workflow node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: github_workflow_model.Workflow + Parsed workflow AST. + context: core.ContextRef[GitHubActionsWorkflowContext] + Workflow context. + env_block: RawGitHubActionsEnvNode | None + Node to apply effects of env block, if any. + jobs: dict[str, RawGitHubActionsJobNode] + List of job ids specifying job execution order. + order: list[str] + List of job ids specifying job execution order. 
+ """ + super().__init__() + self.definition = definition + self.context = context + self.env_block = env_block + self.jobs = jobs + self.order = order + + self._cfg = core.ControlFlowGraph.create_from_sequence( + list(filter(core.node_is_not_none, [self.env_block] + [self.jobs[job_id] for job_id in self.order])) + ) + + def children(self) -> Iterator[core.Node]: + """Yield the child nodes of this node.""" + if self.env_block is not None: + yield self.env_block + for job_id in self.order: + yield self.jobs[job_id] + + def get_entry(self) -> core.Node: + """Return the entry node.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successors for a particular exit of a particular node.""" + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the workflow name and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "name" in self.definition: + result["workflow name"] = {(None, self.definition["name"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + @staticmethod + def create( + workflow: github_workflow_model.Workflow, context: core.NonOwningContextRef[GitHubActionsWorkflowContext] + ) -> GitHubActionsWorkflowNode: + """Create workflow node from workflow AST. + + Also creates a job node for each job, and performs a topological sort of the job dependency graph + to choose an arbitrary valid sequential execution order. + + Parameters + ---------- + workflow: github_workflow_model.Workflow + Parsed workflow AST. + context: core.NonOwningContextRef[GitHubActionsWorkflowContext] + Workflow context. + + Returns + ------- + GitHubActionsWorkflowNode + The new workflow node. + """ + jobs: dict[str, RawGitHubActionsJobNode] = {} + + for job_id, job in workflow["jobs"].items(): + job_node = RawGitHubActionsJobNode( + job, job_id, core.OwningContextRef(GitHubActionsJobContext.create(context)) + ) + jobs[job_id] = job_node + + dependency_graph: dict[str, list[str]] = {} + for job_id, job_node in jobs.items(): + edges: list[str] = [] + if "needs" in job_node.definition: + needs = job_node.definition["needs"] + if isinstance(needs, list): + for need in needs: + # TODO invalid needs id? + edges.append(need) + elif isinstance(needs, str): + edges.append(needs) + dependency_graph[job_id] = edges + + ts = TopologicalSorter(dependency_graph) + order = list(ts.static_order()) + + env_block = None + if "env" in workflow: + env_block = RawGitHubActionsEnvNode(workflow["env"], context) + + return GitHubActionsWorkflowNode(workflow, context, env_block, jobs, order) + + +class RawGitHubActionsJobNode(core.InterpretationNode): + """Interpretation node representing a GitHub Actions Job. + + Defines how to interpret the different kinds of jobs (normal jobs, reusable workflow call jobs), + and generate their analysis representation. + """ + + #: Parsed job AST. + definition: github_workflow_model.Job + #: Job id. + job_id: str + #: Job context. 
+    context: core.ContextRef[GitHubActionsJobContext]
+
+    def __init__(
+        self, definition: github_workflow_model.Job, job_id: str, context: core.ContextRef[GitHubActionsJobContext]
+    ) -> None:
+        """Initialize node."""
+        super().__init__()
+        self.definition = definition
+        self.job_id = job_id
+        self.context = context
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret job AST to generate representation for either a normal job or a reusable workflow call job."""
+        if github_workflow_model.is_normal_job(self.definition):
+            normal_job_definition = self.definition
+
+            def build_normal_job() -> core.Node:
+                return GitHubActionsNormalJobNode.create(
+                    normal_job_definition, self.job_id, self.context.get_non_owned()
+                )
+
+            return {"default": build_normal_job}
+        if github_workflow_model.is_reusable_workflow_call_job(self.definition):
+            raw_with_params = self.definition.get("with", {})
+            call_def = self.definition
+            if isinstance(raw_with_params, dict):
+
+                def build_reusable_workflow_call_job() -> core.Node:
+                    uses_name, uses_sep, uses_version = call_def["uses"].rpartition("@")
+                    if uses_sep == "":
+                        # rpartition places the whole string in the last component when there
+                        # is no "@" separator (e.g. a local reusable workflow path), so treat
+                        # the whole string as the name with no version.
+                        uses_name, uses_version = call_def["uses"], ""
+
+                    with_parameters: dict[str, facts.Value] = {}
+                    for key, val in raw_with_params.items():
+                        if isinstance(val, str):
+                            parsed_val = github_expr.extract_value_from_expr_string(
+                                val, self.context.ref.job_variables.ref
+                            )
+                            if parsed_val is not None:
+                                with_parameters[key] = parsed_val
+                        elif isinstance(val, bool):
+                            with_parameters[key] = facts.StringLiteral("true") if val else facts.StringLiteral("false")
+                        else:
+                            with_parameters[key] = facts.StringLiteral(str(val))
+
+                    return GitHubActionsReusableWorkflowCallNode(
+                        call_def,
+                        self.job_id,
+                        self.context.get_non_owned(),
+                        uses_name,
+                        uses_version if uses_version != "" else None,
+                        with_parameters,
+                    )
+
+                return {"default": build_reusable_workflow_call_job}
+
+            def build_noop() -> core.Node:
+                return core.NoOpStatementNode()
+
+            return {"default": build_noop}
+
+        raise CallGraphError("invalid job")
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the job id and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        result["job id"] = {(None, self.job_id)}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+
+        return result
+
+
+class GitHubActionsNormalJobNode(core.ControlFlowGraphNode):
+    """Control-flow-graph node representing a GitHub Actions Normal Job.
+
+    Control flow structure executes each step in the order defined by the job,
+    preceded by applying the effects of the matrix and env blocks if they exist
+    and followed by applying the effects of the output block if it exists.
+    (TODO generating output block not yet implemented).
+    """
+
+    #: Parsed job AST.
+    definition: github_workflow_model.NormalJob
+    #: Job id.
+    job_id: str
+    #: Node to apply effects of matrix block, if any.
+    matrix_block: RawGitHubActionsMatrixNode | None
+    #: Node to apply effects of env block, if any.
+    env_block: RawGitHubActionsEnvNode | None
+    #: Step nodes, in execution order.
+    steps: list[RawGitHubActionsStepNode]
+    #: Node to apply effects of output block, if any.
+    output_block: core.Node | None  # TODO More specific
+    #: Job context.
+    context: core.ContextRef[GitHubActionsJobContext]
+    #: Control flow graph.
+    _cfg: core.ControlFlowGraph
+
+    def __init__(
+        self,
+        definition: github_workflow_model.NormalJob,
+        job_id: str,
+        matrix_block: RawGitHubActionsMatrixNode | None,
+        env_block: RawGitHubActionsEnvNode | None,
+        steps: list[RawGitHubActionsStepNode],
+        output_block: core.Node | None,
+        context: core.ContextRef[GitHubActionsJobContext],
+    ) -> None:
+        """Initialize job node.
+
+        Typically, construction should be done via the create function rather than using this constructor directly.
+
+        Parameters
+        ----------
+        definition: github_workflow_model.NormalJob
+            Parsed job AST.
+        job_id: str
+            Job id.
+        matrix_block: RawGitHubActionsMatrixNode | None
+            Node to apply effects of matrix block, if any.
+        env_block: RawGitHubActionsEnvNode | None
+            Node to apply effects of env block, if any.
+        steps: list[RawGitHubActionsStepNode]
+            Step nodes, in execution order.
+        output_block: core.Node | None
+            Node to apply effects of output block, if any.
+        context: core.ContextRef[GitHubActionsJobContext]
+            Job context.
+        """
+        super().__init__()
+        self.definition = definition
+        self.job_id = job_id
+        self.matrix_block = matrix_block
+        self.env_block = env_block
+        self.steps = steps
+        self.output_block = output_block
+        self.context = context
+
+        self._cfg = core.ControlFlowGraph.create_from_sequence(
+            list(filter(core.node_is_not_none, [self.matrix_block, self.env_block] + self.steps + [self.output_block]))
+        )
+
+    def children(self) -> Iterator[core.Node]:
+        """Yield the child nodes of this node."""
+        if self.matrix_block is not None:
+            yield self.matrix_block
+        if self.env_block is not None:
+            yield self.env_block
+        yield from self.steps
+        if self.output_block is not None:
+            yield self.output_block
+
+    def get_entry(self) -> core.Node:
+        """Return the entry node."""
+        return self._cfg.get_entry()
+
+    def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]:
+        """Return the successors for a particular exit of a particular node."""
+        return self._cfg.get_successors(node, core.DEFAULT_EXIT)
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the job id and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        result["job id"] = {(None, self.job_id)}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+        return result
+
+    @staticmethod
+    def create(
+        job: github_workflow_model.NormalJob, job_id: str, context: core.NonOwningContextRef[GitHubActionsJobContext]
+    ) -> GitHubActionsNormalJobNode:
+        """Create normal job node from job AST. Also creates a step node for each step.
+
+        Parameters
+        ----------
+        job: github_workflow_model.NormalJob
+            Parsed job AST.
+        job_id: str
+            Job id.
+        context: core.NonOwningContextRef[GitHubActionsJobContext]
+            Job context.
+
+        Returns
+        -------
+        GitHubActionsNormalJobNode
+            The new job node.
+ """ + # TODO output block + + matrix_block = None + if "strategy" in job and "matrix" in job["strategy"]: + matrix_block = RawGitHubActionsMatrixNode(job["strategy"]["matrix"], context) + + env_block = None + if "env" in job: + env_block = RawGitHubActionsEnvNode(job["env"], context) + + steps = [ + RawGitHubActionsStepNode( + step, core.OwningContextRef(GitHubActionsStepContext.create(context, step.get("id"))) + ) + for step in job.get("steps", []) + ] + + return GitHubActionsNormalJobNode(job, job_id, matrix_block, env_block, steps, None, context) + + +class GitHubActionsReusableWorkflowCallNode(core.InterpretationNode): + """Interpretation node representing a GitHub Actions Reusable Workflow Call Job. + + Defines how to interpret the semantics of different supported reusable workflows that may + be invoked (TODO currently none are supported). + """ + + #: Parsed reusable workflow call AST. + definition: github_workflow_model.ReusableWorkflowCallJob + #: Job id. + job_id: str + #: Job context. + context: core.ContextRef[GitHubActionsJobContext] + + #: Name of the reusable workflow being invoked (without version component). + uses_name: str + #: Version of the reusable workflow being invoked (if specified). + uses_version: str | None + + #: Input parameters specified for reusable workflow. + with_parameters: dict[str, facts.Value] + + def __init__( + self, + definition: github_workflow_model.ReusableWorkflowCallJob, + job_id: str, + context: core.ContextRef[GitHubActionsJobContext], + uses_name: str, + uses_version: str | None, + with_parameters: dict[str, facts.Value], + ) -> None: + """Initialize reusable workflow call node. + + Parameters + ---------- + definition: github_workflow_model.ReusableWorkflowCallJob + Parsed reusable workflow call AST. + job_id: str + Job id. + context: core.ContextRef[GitHubActionsJobContext] + Job context. + uses_name: str + Name of the reusable workflow being invoked (without version component). + uses_version: str | None + Version of the reusable workflow being invoked (if specified). + with_parameters: dict[str, facts.Value] + Input parameters specified for reusable workflow. + """ + super().__init__() + self.definition = definition + self.job_id = job_id + self.context = context + self.uses_name = uses_name + self.uses_version = uses_version + self.with_parameters = with_parameters + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Intepret the semantics of the different supported reusable workflows. + + (TODO currently none are supported). + """ + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table. + + Contains the job id, reusable workflow name, and scopes. + """ + result: dict[str, set[tuple[str | None, str]]] = {} + result["job id"] = {(None, self.job_id)} + result["uses"] = {(None, self.definition["uses"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + +class RawGitHubActionsStepNode(core.InterpretationNode): + """Interpretation node representing a GitHub Actions Step. 
+
+    Defines how to interpret the different kinds of steps (run steps, action steps),
+    and generate their analysis representation.
+    """
+
+    #: Parsed step AST.
+    definition: github_workflow_model.Step
+    #: Step context.
+    context: core.ContextRef[GitHubActionsStepContext]
+
+    def __init__(
+        self, definition: github_workflow_model.Step, context: core.ContextRef[GitHubActionsStepContext]
+    ) -> None:
+        """Initialize node."""
+        super().__init__()
+        self.definition = definition
+        self.context = context
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret step AST to generate representation depending on whether it is a run step or an action step."""
+        if github_workflow_model.is_action_step(self.definition):
+            action_step_definition = self.definition
+
+            def build_action_step() -> core.Node:
+                return RawGitHubActionsActionStepNode(action_step_definition, self.context.get_non_owned())
+
+            return {"default": build_action_step}
+        if github_workflow_model.is_run_step(self.definition):
+            run_step_definition = self.definition
+
+            def build_run_step() -> core.Node:
+                return GitHubActionsRunStepNode.create(run_step_definition, self.context.get_non_owned())
+
+            return {"default": build_run_step}
+        raise CallGraphError("invalid step")
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table.
+
+        Contains the step id, name, action name (if action step), and scopes.
+        """
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        if "id" in self.definition:
+            result["step id"] = {(None, self.definition["id"])}
+        elif "name" in self.definition:
+            result["step name"] = {(None, self.definition["name"])}
+        if github_workflow_model.is_action_step(self.definition):
+            result["step uses"] = {(None, self.definition["uses"])}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+
+        return result
+
+
+class RawGitHubActionsActionStepNode(core.InterpretationNode):
+    """Interpretation node representing a GitHub Actions Action Step.
+
+    Defines how to extract the name, version and parameters used to invoke the action,
+    and generate a node with those details resolved for further interpretation.
+    """
+
+    #: Parsed step AST.
+    definition: github_workflow_model.ActionStep
+    #: Step context.
+    context: core.ContextRef[GitHubActionsStepContext]
+
+    def __init__(
+        self, definition: github_workflow_model.ActionStep, context: core.ContextRef[GitHubActionsStepContext]
+    ) -> None:
+        """Initialize node."""
+        super().__init__()
+        self.definition = definition
+        self.context = context
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret action step AST to extract the name, version and parameters."""
+        raw_with_params = self.definition.get("with", {})
+        if isinstance(raw_with_params, dict):
+
+            def build_action() -> core.Node:
+                uses_name, uses_sep, uses_version = self.definition["uses"].rpartition("@")
+                if uses_sep == "":
+                    # rpartition places the whole string in the last component when there is
+                    # no "@" separator (e.g. a local action path), so treat the whole string
+                    # as the name with no version.
+                    uses_name, uses_version = self.definition["uses"], ""
+
+                with_parameters: dict[str, facts.Value] = {}
+                for key, val in raw_with_params.items():
+                    if isinstance(val, str):
+                        parsed_val = github_expr.extract_value_from_expr_string(
+                            val, self.context.ref.job_context.ref.job_variables.ref
+                        )
+                        if parsed_val is not None:
+                            with_parameters[key] = parsed_val
+                    elif isinstance(val, bool):
+                        with_parameters[key] = facts.StringLiteral("true") if val else facts.StringLiteral("false")
+                    else:
+                        with_parameters[key] = facts.StringLiteral(str(val))
+
+                return GitHubActionsActionStepNode(
+                    self.definition,
+                    self.context.get_non_owned(),
+                    uses_name,
+                    uses_version if uses_version != "" else None,
+                    with_parameters,
+                )
+
+            return {"default": build_action}
+
+        def build_noop() -> core.Node:
+            return core.NoOpStatementNode()
+
+        return {"default": build_noop}
+
+    def get_exit_state_transfer_filter(self) -> core.StateTransferFilter:
+        """Return state transfer filter to clear scopes owned by this node after this node exits."""
+        return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context))
+
+    def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]:
+        """Return a properties table containing the step id, name, action name, and scopes."""
+        result: dict[str, set[tuple[str | None, str]]] = {}
+        if "id" in self.definition:
+            result["step id"] = {(None, self.definition["id"])}
+        elif "name" in self.definition:
+            result["step name"] = {(None, self.definition["name"])}
+        result["step uses"] = {(None, self.definition["uses"])}
+
+        printing.add_context_owned_scopes_to_properties_table(result, self.context)
+
+        return result
+
+
+class GitHubActionsActionStepNode(core.InterpretationNode):
+    """Interpretation node representing a GitHub Actions Action Step.
+
+    Defines how to interpret the semantics of different supported actions that may
+    be invoked.
+    """
+
+    #: Parsed step AST.
+    definition: github_workflow_model.ActionStep
+    #: Step context.
+    context: core.ContextRef[GitHubActionsStepContext]
+
+    #: Name of the action being invoked (without version component).
+    uses_name: str
+    #: Version of the action being invoked (if specified).
+    uses_version: str | None
+
+    #: Input parameters specified for action.
+    with_parameters: dict[str, facts.Value]
+
+    def __init__(
+        self,
+        definition: github_workflow_model.ActionStep,
+        context: core.ContextRef[GitHubActionsStepContext],
+        uses_name: str,
+        uses_version: str | None,
+        with_parameters: dict[str, facts.Value],
+    ) -> None:
+        """Initialize action step node.
+
+        Parameters
+        ----------
+        definition: github_workflow_model.ActionStep
+            Parsed step AST.
+        context: core.ContextRef[GitHubActionsStepContext]
+            Step context.
+        uses_name: str
+            Name of the action being invoked (without version component).
+        uses_version: str | None
+            Version of the action being invoked (if specified).
+        with_parameters: dict[str, facts.Value]
+            Input parameters specified for action.
+        """
+        super().__init__()
+        self.definition = definition
+        self.context = context
+        self.uses_name = uses_name
+        self.uses_version = uses_version
+        self.with_parameters = with_parameters
+
+    def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]:
+        """Interpret the semantics of the different supported actions."""
+        match self.uses_name:
+            case "actions/checkout":
+
+                def build_checkout() -> core.Node:
+                    return models.GitHubActionsGitCheckoutModelNode()
+
+                return {"default": build_checkout}
+            case "actions/setup-java":
+                # Installs Java toolchain
+                def build_setup_java() -> core.Node:
+                    return models.InstallPackageNode(
+                        install_scope=self.context.ref.job_context.ref.filesystem.ref,
+                        name=facts.StringLiteral("java"),
+                        version=self.with_parameters.get("java-version", facts.StringLiteral("")),
+                        distribution=self.with_parameters.get("distribution", facts.StringLiteral("")),
+                        url=facts.StringLiteral("https://github.com/actions/setup-java"),
+                    )
+
+                return {"default": build_setup_java}
+            case "graalvm/setup-graalvm":
+                # Installs Java toolchain
+                def build_setup_graalvm() -> core.Node:
+                    return models.InstallPackageNode(
+                        install_scope=self.context.ref.job_context.ref.filesystem.ref,
+                        name=facts.StringLiteral("java"),
+                        version=self.with_parameters.get("java-version", facts.StringLiteral("")),
+                        distribution=self.with_parameters.get("distribution", facts.StringLiteral("graalvm")),
+                        url=facts.StringLiteral("https://github.com/graalvm/setup-graalvm"),
+                    )
+
+                return {"default": build_setup_graalvm}
+
+            case "oracle-actions/setup-java":
+                # Installs Java toolchain
+                def build_setup_oracle_java() -> core.Node:
+                    return models.InstallPackageNode(
+                        install_scope=self.context.ref.job_context.ref.filesystem.ref,
+                        name=facts.StringLiteral("java"),
+                        version=self.with_parameters.get("release", facts.StringLiteral("")),
+                        distribution=self.with_parameters.get("website", facts.StringLiteral("oracle.com")),
+                        url=facts.StringLiteral("https://github.com/oracle-actions/setup-java"),
+                    )
+
+                return {"default": build_setup_oracle_java}
+            case "actions/setup-python":
+                # Installs Python toolchain
+                def build_setup_python() -> core.Node:
+                    return models.InstallPackageNode(
+                        install_scope=self.context.ref.job_context.ref.filesystem.ref,
+                        name=facts.StringLiteral("python"),
+                        version=self.with_parameters.get("python-version", facts.StringLiteral("")),
+                        distribution=facts.StringLiteral(""),
+                        url=facts.StringLiteral(""),
+                    )
+
+                return {"default": build_setup_python}
+            case "actions/upload-artifact":
+                # Uploads artifact to pipeline artifact storage.
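+                # The "path" parameter may list multiple files, one per line; each non-empty
+                # path is modelled as a separate upload of that file's content into the
+                # workflow's artifact storage scope.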
+ if "name" in self.with_parameters and "path" in self.with_parameters: + split = evaluation.parse_str_expr_split(self.with_parameters["path"], "\n") + if len(split) == 1: + + def build_upload_artifact() -> core.Node: + return models.GitHubActionsUploadArtifactModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.artifacts.ref, + artifact_name=self.with_parameters["name"], + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, split[0]), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=split[0], + ) + + return {"default": build_upload_artifact} + + def build_multiple_upload_artifact() -> core.Node: + seq: list[core.Node] = [ + models.GitHubActionsUploadArtifactModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.artifacts.ref, + artifact_name=self.with_parameters["name"], + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, path), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=path, + ) + for path in [x for x in split if x != facts.StringLiteral("")] + ] + if len(seq) == 0: + return core.NoOpStatementNode() + return core.SimpleSequence(seq) + + return {"default": build_multiple_upload_artifact} + + case "actions/download-artifact": + # Downloads artifact from pipeline artifact storage. + if "name" in self.with_parameters: + + def build_download_artifact() -> core.Node: + return models.GitHubActionsDownloadArtifactModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.artifacts.ref, + artifact_name=self.with_parameters["name"], + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + ) + + return {"default": build_download_artifact} + case "softprops/action-gh-release": + # Creates a GitHub release. 
+ if "files" in self.with_parameters: + split = evaluation.parse_str_expr_split(self.with_parameters["files"], "\n") + if len(split) == 1: + + def build_upload_release() -> core.Node: + return models.GitHubActionsReleaseModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.releases.ref, + artifact_name=facts.StringLiteral(str(id(self))), + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, split[0]), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=split[0], + ) + + return {"default": build_upload_release} + + def build_multiple_upload_release() -> core.Node: + return core.SimpleSequence( + [ + models.GitHubActionsReleaseModelNode( + artifacts_scope=self.context.ref.job_context.ref.workflow_context.ref.releases.ref, + artifact_name=facts.StringLiteral(str(id(self))), + artifact_file=facts.UnaryStringOp(facts.UnaryStringOperator.BASENAME, path), + filesystem_scope=self.context.ref.job_context.ref.filesystem.ref, + path=path, + ) + for path in [x for x in split if x != facts.StringLiteral("")] + ] + ) + + return {"default": build_multiple_upload_release} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the step id, name, action name, with parameters, and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "id" in self.definition: + result["step id"] = {(None, self.definition["id"])} + elif "name" in self.definition: + result["step_name"] = {(None, self.definition["name"])} + result["step uses"] = {(None, self.definition["uses"])} + + for key, val in self.with_parameters.items(): + result["with(" + key + ")"] = {(None, val.to_datalog_fact_string())} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + + return result + + +class GitHubActionsRunStepNode(core.ControlFlowGraphNode): + """Control-flow-graph node representing a GitHub Actions Run Step. + + Control flow structure executes the shell script defined by the step. + If an env block exists, it is applied beforehand. + """ + + #: Parsed step AST. + definition: github_workflow_model.RunStep + #: Node to apply effects of env block, if any. + env_block: RawGitHubActionsEnvNode | None + #: Shell script to be run. + shell_block: bash.RawBashScriptNode + #: Step context. + context: core.ContextRef[GitHubActionsStepContext] + #: Control flow graph + _cfg: core.ControlFlowGraph + + def __init__( + self, + definition: github_workflow_model.RunStep, + env_block: RawGitHubActionsEnvNode | None, + shell_block: bash.RawBashScriptNode, + context: core.ContextRef[GitHubActionsStepContext], + ) -> None: + """Initialize run step node. + + Typically, construction should be done via the create function rather than using this constructor directly. + + Parameters + ---------- + definition: github_workflow_model.RunStep + Parsed step AST. + env_block: RawGitHubActionsEnvNode | None + Node to apply effects of env block, if any. + shell_block: bash.RawBashScriptNode + Shell script to be run. + context: core.ContextRef[GitHubActionsStepContext] + Step context. 
+ """ + super().__init__() + self.definition = definition + self.env_block = env_block + self.shell_block = shell_block + self.context = context + + self._cfg = core.ControlFlowGraph.create_from_sequence( + list(filter(core.node_is_not_none, [self.env_block, self.shell_block])) + ) + + def children(self) -> Iterator[core.Node]: + """Yield the child nodes of this node.""" + if self.env_block is not None: + yield self.env_block + yield self.shell_block + + def get_entry(self) -> core.Node: + """Return the entry node.""" + return self._cfg.get_entry() + + def get_successors(self, node: core.Node, exit_type: core.ExitType) -> set[core.Node | core.ExitType]: + """Return the successors for a particular exit of a particular node.""" + return self._cfg.get_successors(node, core.DEFAULT_EXIT) + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the step id, name, and scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + if "id" in self.definition: + result["step id"] = {(None, self.definition["id"])} + elif "name" in self.definition: + result["step name"] = {(None, self.definition["name"])} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + @staticmethod + def create( + run_step: github_workflow_model.RunStep, context: core.NonOwningContextRef[GitHubActionsStepContext] + ) -> GitHubActionsRunStepNode: + """Create run step node from step AST. + + Parameters + ---------- + run_step: github_workflow_model.RunStep + Parsed step AST. + context: core.NonOwningContextRef[GitHubActionsStepContext] + Step context. + + Returns + ------- + GitHubActionsRunStepNode + The new run step node. + """ + env_block = None + if "env" in run_step: + env_block = RawGitHubActionsEnvNode(run_step["env"], context) + script_node = bash.RawBashScriptNode( + facts.StringLiteral(run_step["run"]), + core.OwningContextRef(bash.BashScriptContext.create_from_run_step(context, "")), + ) + return GitHubActionsRunStepNode(run_step, env_block, script_node, context) + + +class RawGitHubActionsEnvNode(core.InterpretationNode): + """Interpretation node representing an env block in a GitHub Actions Workflow/Job/Step. + + Defines how to interpret the declarative env block to generate imperative constructs to + write the values to the env variables. + """ + + #: Parsed env block AST. + definition: github_workflow_model.Env + #: Outer context. + context: core.ContextRef[GitHubActionsWorkflowContext | GitHubActionsJobContext | GitHubActionsStepContext] + + def __init__( + self, + definition: github_workflow_model.Env, + context: core.ContextRef[GitHubActionsWorkflowContext | GitHubActionsJobContext | GitHubActionsStepContext], + ) -> None: + """Initialize env block node. + + Parameters + ---------- + definition: github_workflow_model.Env + Parsed env block AST. + context: core.ContextRef[GitHubActionsWorkflowContext | GitHubActionsJobContext | GitHubActionsStepContext] + Outer context. 
+ """ + super().__init__() + self.definition = definition + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret declarative env block to generate imperative constructs to write to the env vars.""" + env = self.definition + if isinstance(env, dict): + + def build_env_writes() -> core.Node: + env_writes: dict[str, facts.Value] = {} + for key, val in env.items(): + if isinstance(val, str): + var_scope = ( + self.context.ref.job_context.ref.job_variables.ref + if isinstance(self.context.ref, GitHubActionsStepContext) + else ( + self.context.ref.job_variables.ref + if isinstance(self.context.ref, GitHubActionsJobContext) + else None + ) + ) + parsed_val = github_expr.extract_value_from_expr_string(val, var_scope) + if parsed_val is not None: + env_writes[key] = parsed_val + elif isinstance(val, bool): + env_writes[key] = facts.StringLiteral("true") if val else facts.StringLiteral("false") + else: + env_writes[key] = facts.StringLiteral(str(val)) + + if len(env_writes) == 0: + return core.NoOpStatementNode() + + return core.SimpleSequence( + [ + models.VarAssignNode( + models.VarAssignKind.GITHUB_ENV_VAR, self.context.ref.env.ref, facts.StringLiteral(var), val + ) + for var, val in env_writes.items() + ] + ) + + return {"default": build_env_writes} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result + + +class RawGitHubActionsMatrixNode(core.InterpretationNode): + """Interpretation node representing a matrix block in a GitHub Actions Job. + + Defines how to interpret the declarative matrix block to generate imperative constructs to + write the values to the matrix variables. + """ + + #: Parsed matrix block AST. + definition: github_workflow_model.Matrix + #: Outer job context. + context: core.ContextRef[GitHubActionsJobContext] + + def __init__( + self, + definition: github_workflow_model.Matrix, + context: core.ContextRef[GitHubActionsJobContext], + ) -> None: + """Initialize matrix node. + + Parameters + ---------- + definition: github_workflow_model.Matrix + Parsed matrix block AST. + context: core.ContextRef[GitHubActionsJobContext] + Outer job context. 
+ """ + super().__init__() + self.definition = definition + self.context = context + + def identify_interpretations(self, state: core.State) -> dict[core.InterpretationKey, Callable[[], core.Node]]: + """Interpret declarative matrix block to generate imperative constructs to write to the matrix variables.""" + matrix = self.definition + if isinstance(matrix, dict): + + def build_matrix_writes() -> core.Node: + matrix_writes: dict[str, list[facts.Value]] = defaultdict(list) + if isinstance(matrix, dict): + for key, vals in matrix.items(): + if isinstance(vals, list): + var_scope = self.context.ref.job_variables.ref + + for val in vals: + if isinstance(val, str): + parsed_val = github_expr.extract_value_from_expr_string(val, var_scope) + if parsed_val is not None: + matrix_writes[key].append(parsed_val) + elif isinstance(val, bool): + matrix_writes[key].append( + facts.StringLiteral("true") if val else facts.StringLiteral("false") + ) + else: + matrix_writes[key].append(facts.StringLiteral(str(val))) + + if len(matrix_writes) == 0: + return core.NoOpStatementNode() + + return core.SimpleSequence( + [ + core.SimpleAlternatives( + [ + models.VarAssignNode( + models.VarAssignKind.GITHUB_JOB_VAR, + self.context.ref.job_variables.ref, + facts.StringLiteral("matrix." + key), + val, + ) + for val in vals + ] + ) + for key, vals in matrix_writes.items() + ] + ) + + return {"default": build_matrix_writes} + + def build_noop() -> core.Node: + return core.NoOpStatementNode() + + return {"default": build_noop} + + def get_exit_state_transfer_filter(self) -> core.StateTransferFilter: + """Return state transfer filter to clear scopes owned by this node after this node exits.""" + return core.ExcludedScopesStateTransferFilter(core.get_owned_scopes(self.context)) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties table containing the scopes.""" + result: dict[str, set[tuple[str | None, str]]] = {} + + printing.add_context_owned_scopes_to_properties_table(result, self.context) + return result diff --git a/src/macaron/code_analyzer/dataflow_analysis/github_expr.py b/src/macaron/code_analyzer/dataflow_analysis/github_expr.py new file mode 100644 index 000000000..8961750a4 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/github_expr.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Parser for GitHub Actions expression language.""" + +from typing import cast + +from lark import Lark, Token, Tree + +from macaron.code_analyzer.dataflow_analysis import facts + +# Parser for GitHub Actions expression language grammar. +github_expr_parser = Lark( + r""" + _expr: literal + | identifier + | _operator_expr + | function_call + + literal: BOOLEAN_LITERAL + | NULL_LITERAL + | NUMBER_LITERAL + | STRING_LITERAL + + BOOLEAN_LITERAL: "true" | "false" + + NULL_LITERAL: "null" + + NUMBER_LITERAL: SIGNED_NUMBER + + STRING_LITERAL: "'" STRING_INNER + "'" + + STRING_INNER: /.*?/s + + CNAMEWITHDASH: ("_"|LETTER) ("_"|"-"|LETTER|DIGIT)* + + identifier: CNAMEWITHDASH + + _operator_expr: paren_expr + | property_deref + | property_deref_object_filter + | index_expr + | not_expr + | and_expr + | or_expr + | less_than_expr + | less_than_equal_expr + | greater_than_expr + | greater_than_equal_expr + | equal_expr + | not_equal_expr + + paren_expr: "(" _expr ")" + property_deref: _expr "." 
identifier + property_deref_object_filter: _expr "." "*" + index_expr: _expr "[" _expr "]" + not_expr: "!" _expr + and_expr: _expr "&&" _expr + or_expr: _expr "||" _expr + less_than_expr: _expr "<" _expr + less_than_equal_expr: _expr "<=" _expr + greater_than_expr: _expr ">" _expr + greater_than_equal_expr: _expr ">=" _expr + equal_expr: _expr "==" _expr + not_equal_expr: _expr "!=" _expr + + function_call: identifier "(" _expr ("," _expr)* ")" + + %import common.SIGNED_NUMBER + %import common.WS + %import common.LETTER + %import common.DIGIT + %import common._STRING_INNER + %ignore WS + """, + start="_expr", +) + + +def extract_expr_variable_name(node: Token | Tree[Token]) -> str | None: + """Return variable access path for token. + + If the given node is a variable access or sequence of property accesses, return the + access path as a string, otherwise return None. + """ + if isinstance(node, Tree) and node.data == "property_deref": + rest = extract_expr_variable_name(node.children[0]) + property_identifier = cast(Tree, node.children[1]) + if rest is not None: + identifier = cast(Token, property_identifier.children[0]) + return rest + "." + identifier + elif isinstance(node, Tree) and node.data == "identifier": + identifier = cast(Token, node.children[0]) + return cast(str, identifier.value) + + return None + + +def extract_value_from_expr_string(s: str, var_scope: facts.Scope | None) -> facts.Value | None: + """Return a value expression representation of a string containing GitHub Actions expressions. + + GitHub Action expressions within the string are denoted by "${{ }}". + + Returns None if it is unrepresentable. + """ + cur_idx = 0 + cur_expr_begin = s.find("${{") + values: list[facts.Value] = [] + while cur_expr_begin != -1: + cur_str = s[cur_idx:cur_expr_begin] + values.append(facts.StringLiteral(cur_str)) + cur_expr_end = s.find("}}", cur_expr_begin) + cur_expr = s[cur_expr_begin + 3 : cur_expr_end] + parse_tree = github_expr_parser.parse(cur_expr) + + node = parse_tree.children[0] + + var_str = extract_expr_variable_name(node) + if var_str is not None and var_scope is not None: + values.append( + facts.Read( + loc=facts.Location(scope=var_scope, loc=facts.Variable(name=facts.StringLiteral(literal=var_str))) + ) + ) + else: + return None + + cur_idx = cur_expr_end + 2 + cur_expr_begin = s.find("${{", cur_idx) + last_str = s[cur_idx:] + + values.append(facts.StringLiteral(last_str)) + + if len(values) == 1: + return values[0] + + cur_concat = facts.BinaryStringOp.get_string_concat(values[0], values[1]) + + for val in values[2:]: + cur_concat = facts.BinaryStringOp.get_string_concat(cur_concat, val) + return cur_concat diff --git a/src/macaron/code_analyzer/dataflow_analysis/models.py b/src/macaron/code_analyzer/dataflow_analysis/models.py new file mode 100644 index 000000000..4528c1bc1 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/models.py @@ -0,0 +1,679 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Models of supported commands, actions, etc. that may be invoked by build pipelines. + +Defines how they are modelled by the dataflow analysis in terms of their effect on the abstract state. 
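+
+Each model is a parameterised statement set (see BoundParameterisedStatementSet below) that is
+instantiated with concrete value, location and scope bindings where it is used.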
+""" + +from __future__ import annotations + +from enum import Enum, auto +from functools import cache + +from macaron.code_analyzer.dataflow_analysis import core, evaluation, facts + + +class BoundParameterisedStatementSet: + """Representation of a set of (simultaneous) write operations. + + Defined as a reference to a set of generic parameterised statements, along with a set of parameter bindings + that instantiate the parameterised statements with concrete subexpressions. + """ + + #: Set of generic parameterised statements. + parameterised_stmts: evaluation.StatementSet + #: Parameter bindings for values. + value_parameter_binds: dict[str, facts.Value] + #: Parameter bindings for locations. + location_parameter_binds: dict[str, facts.LocationSpecifier] + #: Parameter bindings for scopes. + scope_parameter_binds: dict[str, facts.Scope] + #: Instantiated statements. + instantiated_statements: evaluation.StatementSet + + def __init__( + self, + parameterised_stmts: evaluation.StatementSet, + value_parameter_binds: dict[str, facts.Value] | None = None, + location_parameter_binds: dict[str, facts.LocationSpecifier] | None = None, + scope_parameter_binds: dict[str, facts.Scope] | None = None, + ) -> None: + """Initialize bound parameterised statement set. + + Parameters + ---------- + parameterised_stmts: evaluation.StatementSet + Set of generic parameterised statements. + value_parameter_binds: dict[str, facts.Value] | None + Parameter bindings for value. + location_parameter_binds: dict[str, facts.LocationSpecifier] | None + Parameter bindings for locations. + scope_parameter_binds: dict[str, facts.Scope] | None + Parameter bindings for scopes. + """ + self.parameterised_stmts = parameterised_stmts + self.value_parameter_binds = value_parameter_binds or {} + self.location_parameter_binds = location_parameter_binds or {} + self.scope_parameter_binds = scope_parameter_binds or {} + + transformer = evaluation.ParameterPlaceholderTransformer( + allow_unbound_params=False, + value_parameter_binds=self.value_parameter_binds, + location_parameter_binds=self.location_parameter_binds, + scope_parameter_binds=self.scope_parameter_binds, + ) + self.instantiated_statements = transformer.transform_statement_set(parameterised_stmts) + + def get_statements(self) -> evaluation.StatementSet: + """Return instantiated statement set.""" + return self.instantiated_statements + + +class BoundParameterisedModelNode(core.StatementNode): + """Statement node that applies effects as defined in a provided model. + + Subclasses will define a statement node with a specific model. + """ + + #: Statement effects model. + stmts: BoundParameterisedStatementSet + + def __init__(self, stmts: BoundParameterisedStatementSet) -> None: + """Initialise model statement node.""" + super().__init__() + + self.stmts = stmts + + def apply_effects(self, before_state: core.State) -> dict[core.ExitType, core.State]: + """Apply effects as defined in a provided model.""" + return {core.DEFAULT_EXIT: self.stmts.get_statements().apply_effects(before_state)} + + +class InstallPackageNode(BoundParameterisedModelNode): + """Model for package installation. + + Stores a representation of the installed package into the abstract "installed packages" location. 
+ """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return the model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("install_scope"), + facts.Installed(name=facts.ParameterPlaceholderValue("name")), + ), + facts.InstalledPackage( + name=facts.ParameterPlaceholderValue("name"), + version=facts.ParameterPlaceholderValue("version"), + distribution=facts.ParameterPlaceholderValue("distribution"), + url=facts.ParameterPlaceholderValue("url"), + ), + ) + } + ) + + #: Scope into which to install. + install_scope: facts.Scope + #: Package name. + name: facts.Value + #: Package version. + version: facts.Value + #: Package distribution. + distribution: facts.Value + #: URL of package. + url: facts.Value + + def __init__( + self, + install_scope: facts.Scope, + name: facts.Value, + version: facts.Value, + distribution: facts.Value, + url: facts.Value, + ) -> None: + """Initialize install package node. + + Parameters + ---------- + install_scope: facts.Scope + Scope into which to install. + name: facts.Value + Package name. + version: facts.Value + Package version. + distribution: facts.Value + Package distribution. + url: facts.Value + URL of package. + """ + self.install_scope = install_scope + self.name = name + self.version = version + self.distribution = distribution + self.url = url + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"name": name, "version": version, "distribution": distribution, "url": url}, + scope_parameter_binds={"install_scope": install_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "install_scope": {(None, self.install_scope.to_datalog_fact_string())}, + "name": {(None, self.name.to_datalog_fact_string())}, + "version": {(None, self.version.to_datalog_fact_string())}, + "distribution": {(None, self.distribution.to_datalog_fact_string())}, + "url": {(None, self.url.to_datalog_fact_string())}, + } + + +class VarAssignKind(Enum): + """Kind of variable assignment.""" + + #: Bash environment variable. + BASH_ENV_VAR = auto() + #: Bash function declaration. + BASH_FUNC_DECL = auto() + #: GitHub job variable. + GITHUB_JOB_VAR = auto() + #: GitHub environment variable. + GITHUB_ENV_VAR = auto() + #: Other uncategorized variable. + OTHER = auto() + + +class VarAssignNode(BoundParameterisedModelNode): + """Model for variable assignment. + + Stores the assigned value to the variable location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return the model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("var_scope"), + facts.Variable(facts.ParameterPlaceholderValue("var_name")), + ), + facts.ParameterPlaceholderValue("value"), + ) + } + ) + + #: The kind of variable. + kind: VarAssignKind + #: The scope in which the variable is stored. + var_scope: facts.Scope + #: The name of the variable. + var_name: facts.Value + #: The value to assign to the variable. + value: facts.Value + + def __init__(self, kind: VarAssignKind, var_scope: facts.Scope, var_name: facts.Value, value: facts.Value) -> None: + """Initialize variable assignment node. + + Parameters + ---------- + kind: VarAssignKind + The kind of variable. 
+ var_scope: facts.Scope + The scope in which the variable is stored. + var_name: facts.Value + The name of the variable. + value: facts.Value + The value to assign to the variable. + """ + self.kind = kind + self.var_scope = var_scope + self.var_name = var_name + self.value = value + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"var_name": var_name, "value": value}, + scope_parameter_binds={"var_scope": var_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "kind": {(None, self.kind.name)}, + "var_scope": {(None, self.var_scope.to_datalog_fact_string())}, + "var_name": {(None, self.var_name.to_datalog_fact_string())}, + "value": {(None, self.value.to_datalog_fact_string())}, + } + + +class GitHubActionsGitCheckoutModelNode(core.StatementNode): + """Model for GitHub git checkout operation. + + Currently modelled as a no-op. + """ + + def apply_effects(self, before_state: core.State) -> dict[core.ExitType, core.State]: + """Apply effects for git checkout (currently nothing).""" + state = core.State() + core.transfer_state(before_state, state) + return {core.DEFAULT_EXIT: state} + + +class GitHubActionsUploadArtifactModelNode(BoundParameterisedModelNode): + """Model for uploading artifacts to GitHub pipeline artifact storage. + + Stores the content read from a file to the artifact storage location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return the model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.Artifact( + name=facts.ParameterPlaceholderValue("artifact_name"), + file=facts.ParameterPlaceholderValue("artifact_file"), + ), + ), + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("filesystem_scope"), + facts.Filesystem(facts.ParameterPlaceholderValue("path")), + ) + ), + ) + } + ) + + #: Scope for pipeline artifact storage. + artifacts_scope: facts.Scope + #: Artifact name. + artifact_name: facts.Value + #: Artifact filename. + artifact_file: facts.Value + #: Scope for filesystem from which to read file. + filesystem_scope: facts.Scope + #: File path to read artifact content from. + path: facts.Value + + def __init__( + self, + artifacts_scope: facts.Scope, + artifact_name: facts.Value, + artifact_file: facts.Value, + filesystem_scope: facts.Scope, + path: facts.Value, + ) -> None: + """Initialize upload artifacts node. + + Parameters + ---------- + artifacts_scope: facts.Scope + Scope for pipeline artifact storage. + artifact_name: facts.Value + Artifact name. + artifact_file: facts.Value + Artifact filename. + filesystem_scope: facts.Scope + Scope for filesystem from which to read file. + path: facts.Value + File path to read artifact content from. 
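+
+        For illustration, an ``actions/upload-artifact`` step with ``name: dist`` and
+        ``path: target/app.jar`` might be modelled as (the scope objects are placeholders)::
+
+            GitHubActionsUploadArtifactModelNode(
+                artifacts_scope=pipeline_artifacts_scope,
+                artifact_name=facts.StringLiteral("dist"),
+                artifact_file=facts.StringLiteral("app.jar"),
+                filesystem_scope=runner_filesystem_scope,
+                path=facts.StringLiteral("target/app.jar"),
+            )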
+ """ + self.artifacts_scope = artifacts_scope + self.artifact_name = artifact_name + self.artifact_file = artifact_file + self.filesystem_scope = filesystem_scope + self.path = path + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"artifact_name": artifact_name, "artifact_file": artifact_file, "path": path}, + scope_parameter_binds={"artifacts_scope": artifacts_scope, "filesystem_scope": filesystem_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "artifacts_scope": {(None, self.artifacts_scope.to_datalog_fact_string())}, + "artifact_name": {(None, self.artifact_name.to_datalog_fact_string())}, + "artifact_file": {(None, self.artifact_file.to_datalog_fact_string())}, + "filesystem_scope": {(None, self.filesystem_scope.to_datalog_fact_string())}, + "path": {(None, self.path.to_datalog_fact_string())}, + } + + +class GitHubActionsDownloadArtifactModelNode(BoundParameterisedModelNode): + """Model for downloading artifacts from GitHub pipeline artifact storage. + + For each file in the artifact, reads the content of that artifact and + stores it to the filesystem under the same filename. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("filesystem_scope"), + facts.Filesystem( + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.ArtifactAnyFilename(facts.ParameterPlaceholderValue("artifact_name")), + ) + ) + ), + ), + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.Artifact( + name=facts.ParameterPlaceholderValue("artifact_name"), + file=facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("artifacts_scope"), + facts.ArtifactAnyFilename(facts.ParameterPlaceholderValue("artifact_name")), + ) + ), + ), + ) + ), + ) + } + ) + + #: Scope for pipeline artifact storage. + artifacts_scope: facts.Scope + #: Artifact name. + artifact_name: facts.Value + #: Scope for filesystem to store artifacts to. + filesystem_scope: facts.Scope + + def __init__(self, artifacts_scope: facts.Scope, artifact_name: facts.Value, filesystem_scope: facts.Scope) -> None: + """Initialize download artifacts node. + + Parameters + ---------- + artifacts_scope: facts.Scope + Scope for pipeline artifact storage. + artifact_name: facts.Value + Artifact name. + filesystem_scope: facts.Scope + Scope for filesystem to store artifacts to. 
+ """ + self.artifacts_scope = artifacts_scope + self.artifact_name = artifact_name + self.filesystem_scope = filesystem_scope + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"artifact_name": artifact_name}, + scope_parameter_binds={"artifacts_scope": artifacts_scope, "filesystem_scope": filesystem_scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "artifacts_scope": {(None, self.artifacts_scope.to_datalog_fact_string())}, + "artifact_name": {(None, self.artifact_name.to_datalog_fact_string())}, + "filesystem_scope": {(None, self.filesystem_scope.to_datalog_fact_string())}, + } + + +class GitHubActionsReleaseModelNode(GitHubActionsUploadArtifactModelNode): + """Model for uploading artifacts to a GitHub release. + + Modelled in the same way as artifact upload. + """ + + +class BashEchoNode(BoundParameterisedModelNode): + """Model for Bash echo command, which writes the echoed value to some location.""" + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("out_loc_scope"), + facts.ParameterPlaceholderLocation("out_loc_spec"), + ), + facts.ParameterPlaceholderValue("value"), + ) + } + ) + + #: Output location. + out_loc: facts.Location + #: Value written. + value: facts.Value + + def __init__(self, out_loc: facts.Location, value: facts.Value) -> None: + """Initialize echo node. + + Parameters + ---------- + out_loc: facts.Location + Output location. + value: facts.Value + Value written. + """ + self.out_loc = out_loc + self.value = value + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + value_parameter_binds={"value": value}, + location_parameter_binds={"out_loc_spec": out_loc.loc}, + scope_parameter_binds={"out_loc_scope": out_loc.scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "out_loc": {(None, self.out_loc.to_datalog_fact_string())}, + "value": {(None, self.value.to_datalog_fact_string())}, + } + + +class Base64EncodeNode(BoundParameterisedModelNode): + """Model for Base64 encode operation. + + Reads a value from some location, Base64-encodes it and writes the result to another location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("out_loc_scope"), + facts.ParameterPlaceholderLocation("out_loc_spec"), + ), + facts.UnaryStringOp( + facts.UnaryStringOperator.BASE64_ENCODE, + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("in_loc_scope"), + facts.ParameterPlaceholderLocation("in_loc_spec"), + ) + ), + ), + ) + } + ) + + #: Location to read input from. + in_loc: facts.Location + #: Location to write encoded output to. + out_loc: facts.Location + + def __init__(self, in_loc: facts.Location, out_loc: facts.Location) -> None: + """Initialize Base64 encode node. + + Parameters + ---------- + in_loc: facts.Location + Location to read input from. + out_loc: facts.Location + Location to write encoded output to. 
+ """ + self.in_loc = in_loc + self.out_loc = out_loc + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + location_parameter_binds={"out_loc_spec": out_loc.loc, "in_loc_spec": in_loc.loc}, + scope_parameter_binds={"out_loc_scope": out_loc.scope, "in_loc_scope": in_loc.scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "in_loc": {(None, self.in_loc.to_datalog_fact_string())}, + "out_loc": {(None, self.out_loc.to_datalog_fact_string())}, + } + + +class Base64DecodeNode(BoundParameterisedModelNode): + """Model for Base64 decode operation. + + Reads a value from some location, Base64-decodes it and writes the result to another location. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("out_loc_scope"), + facts.ParameterPlaceholderLocation("out_loc_spec"), + ), + facts.UnaryStringOp( + facts.UnaryStringOperator.BASE64DECODE, + facts.Read( + facts.Location( + facts.ParameterPlaceholderScope("in_loc_scope"), + facts.ParameterPlaceholderLocation("in_loc_spec"), + ) + ), + ), + ) + } + ) + + #: Location to read input from. + in_loc: facts.Location + #: Location to write decoded output to. + out_loc: facts.Location + + def __init__(self, in_loc: facts.Location, out_loc: facts.Location) -> None: + """Initialize Base64 decode node. + + Parameters + ---------- + in_loc: facts.Location + Location to read input from. + out_loc: facts.Location + Location to write decoded output to. + """ + self.in_loc = in_loc + self.out_loc = out_loc + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), + location_parameter_binds={"out_loc_spec": out_loc.loc, "in_loc_spec": in_loc.loc}, + scope_parameter_binds={"out_loc_scope": out_loc.scope, "in_loc_scope": in_loc.scope}, + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return { + "in_loc": {(None, self.in_loc.to_datalog_fact_string())}, + "out_loc": {(None, self.out_loc.to_datalog_fact_string())}, + } + + +class MavenBuildModelNode(BoundParameterisedModelNode): + """Model for Maven build commands. + + Maven build behaviour is approximated as writing some files under the target directory. + """ + + @staticmethod + @cache + def get_model() -> evaluation.StatementSet: + """Return model.""" + return evaluation.StatementSet( + { + evaluation.WriteStatement( + facts.Location( + facts.ParameterPlaceholderScope("filesystem_scope"), + facts.FilesystemAnyUnderDir(facts.StringLiteral("./target")), + ), + facts.ArbitraryNewData("mvn"), # TODO something better? + ) + } + ) + + #: Scope for filesystem written to. + filesystem_scope: facts.Scope + + def __init__(self, filesystem_scope: facts.Scope) -> None: + """Initialize Maven build node. + + Parameters + ---------- + filesystem_scope: facts.Scope + Scope for filesystem written to. 
+ """ + self.filesystem_scope = filesystem_scope + + bound_stmts = BoundParameterisedStatementSet( + parameterised_stmts=self.get_model(), scope_parameter_binds={"filesystem_scope": filesystem_scope} + ) + + super().__init__(bound_stmts) + + def get_printable_properties_table(self) -> dict[str, set[tuple[str | None, str]]]: + """Return a properties tables with the model parameters.""" + return {"filesystem_scope": {(None, self.filesystem_scope.to_datalog_fact_string())}} diff --git a/src/macaron/code_analyzer/dataflow_analysis/printing.py b/src/macaron/code_analyzer/dataflow_analysis/printing.py new file mode 100644 index 000000000..0ffd61813 --- /dev/null +++ b/src/macaron/code_analyzer/dataflow_analysis/printing.py @@ -0,0 +1,681 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Functions for printing/displaying dataflow analysis nodes in the form of graphviz (dot) output. + +Allows the analysis representation and results to be rendered as a human-readable node-link graph. + +Makes use of graphviz's html-like label feature to add detailed information to each node. +Tables are specified in the form of a dict[str, set[tuple[str | None, str]], which is rendered as +a two-column table, with the first column containing each of the keys of the dict, and the second +column containing the corresponding set of values, as a nested vertical table, with each value having +an optional label that, if present, will be rendered in a visually distinguished manner alongside the +value. +""" + +from __future__ import annotations + +import dataclasses +from dataclasses import dataclass +from typing import TextIO + +from macaron.code_analyzer.dataflow_analysis import core + + +def print_as_dot_graph(node: core.Node, out: TextIO, include_properties: bool, include_states: bool) -> None: + """Print root node as dot graph. + + Parameters + ---------- + node: core.Node + The root node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller). + """ + out.write("digraph {\n") + out.write('node [style="filled", fillcolor="white"]\n') + print_as_dot_string(node, out, include_properties=include_properties, include_states=include_states) + out.write("}\n") + + +def get_printable_table_for_state( + state: core.State, state_filter: core.StateTransferFilter | None = None +) -> dict[str, set[tuple[str | None, str]]]: + """Return a table of the stringified representation of the state. + + Consists of a mapping of storage locations to the set of values they may contain + (see module comment for description of the return type). + + Values are additionally labeled with whether they were new and not copied, and whether + they will be excluded by the given filter. + """ + result: dict[str, set[tuple[str | None, str]]] = {} + for key, vals in state.state.items(): + vals_strs: set[tuple[str | None, str]] = { + ( + str(label.sequence_number) + + ("*" if not label.copied else "") + + ("!" 
if state_filter is not None and not state_filter.should_transfer(key) else ""), + val.to_datalog_fact_string(), + ) + for val, label in vals.items() + } + key_str = key.to_datalog_fact_string() + result[key_str] = vals_strs + return result + + +def print_as_dot_string(node: core.Node, out: TextIO, include_properties: bool, include_states: bool) -> None: + """Print node as dot representation (to be embedded within a dot graph). + + Parameters + ---------- + node: core.Node + The node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller). + """ + match node: + case core.ControlFlowGraphNode(): + print_cfg_node_as_dot_string(node, out, include_properties, include_states) + case core.StatementNode(): + print_statement_node_as_dot_string(node, out, include_properties, include_states) + case core.InterpretationNode(): + print_interpretation_node_as_dot_string(node, out, include_properties, include_states) + + +def print_cfg_node_as_dot_string( + cfg_node: core.ControlFlowGraphNode, out: TextIO, include_properties: bool, include_states: bool +) -> None: + """Print control-flow-graph node as dot representation (to be embedded within a dot graph). + + Parameters + ---------- + cfg_node: core.ControlFlowGraphNode + The control-flow-graph node to print. + out: TextIO + Output stream to print to. + include_properties: bool + Whether to include detail on the properties of each node (disable to make nodes simpler/smaller). + include_states: bool + Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller). 
+ """ + out.write("subgraph cluster_n" + str(id(cfg_node)) + "{\n") + out.write("style=filled\n") + out.write('fillcolor="#fdf3e4ff"\n') + + subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]] = [] + if include_properties: + properties_table = cfg_node.get_printable_properties_table() + if len(properties_table) > 0: + subtables.append( + ( + "Properties", + cfg_node.get_printable_properties_table(), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + + if include_states: + subtables.append( + ( + "Before State", + get_printable_table_for_state(cfg_node.before_state), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + if core.DEFAULT_EXIT in cfg_node.exit_states: + subtables.append( + ( + "Exit State", + get_printable_table_for_state( + cfg_node.exit_states[core.DEFAULT_EXIT], cfg_node.get_exit_state_transfer_filter() + ), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + for exit_type, exit_state in cfg_node.exit_states.items(): + if not isinstance(exit_type, core.DefaultExit): + subtables.append( + ( + "Exit State (" + exit_type.__class__.__name__ + ")", + get_printable_table_for_state(exit_state, cfg_node.get_exit_state_transfer_filter()), + DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES, + ) + ) + + out.write( + produce_node_dot_def( + node_id=("n" + str(id(cfg_node))), + node_kind="ControlFlowGraph", + node_type=cfg_node.__class__.__name__, + node_label=( + "[" + + ", ".join( + [str(cfg_node.created_debug_sequence_num)] + + ["(" + str(b) + "-" + str(e) + ")" for b, e in cfg_node.processed_log] + ) + + "]" + if include_states + else None + ), + config=DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE, + subtables=subtables, + ) + + "\n" + ) + + i = 0 + out.write("n" + str(id(cfg_node)) + " -> " + "c" + str(id(cfg_node.get_entry())) + ' [label="entry"]\n') + + for child_node in cfg_node.children(): + out.write( + "c" + + str(id(child_node)) + + ' [label="' + + str(i) + + '", shape=circle, fontcolor="#ffffffff", fillcolor="#aa643bff"]\n' + ) + out.write( + "e" + + str(id(cfg_node)) + + '_exit [label="exit", shape=circle, fontcolor="#ffffffff", fillcolor="#aa643bff"]\n' + ) + next_alt_exit_id = 0 + alt_exit_ids: dict[core.ExitType, int] = {} + + for exit_type in child_node.exit_states: + successors = cfg_node.get_successors(child_node, exit_type) + for successor in successors: + if isinstance(successor, core.Node): + out.write("c" + str(id(child_node)) + " -> " + "c" + str(id(successor)) + ' [label=""]\n') + elif isinstance(successor, core.DefaultExit): + out.write("c" + str(id(child_node)) + " -> " + "e" + str(id(cfg_node)) + "_exit" + ' [label=""]\n') + else: + if successor not in alt_exit_ids: + alt_exit_ids[successor] = next_alt_exit_id + next_alt_exit_id = next_alt_exit_id + 1 + alt_exit_id = alt_exit_ids[successor] + out.write( + "c" + + str(id(child_node)) + + " -> " + + "e" + + str(id(cfg_node)) + + "_alt_exit_" + + str(alt_exit_id) + + ' [label=""]\n' + ) + + for alt_exit_id in alt_exit_ids.values(): + out.write( + "e" + + str(id(cfg_node)) + + "_alt_exit_" + + str(alt_exit_id) + + ' [label="alt-exit", shape=circle, fontcolor="#ffffffff", fillcolor="#aa643bff"]\n' + ) + i = i + 1 + out.write("}\n") + + for child_node in cfg_node.children(): + out.write("c" + str(id(child_node)) + " -> " + "n" + str(id(child_node)) + ' [label=""]\n') + + for child_node in cfg_node.children(): + print_as_dot_string(child_node, out, 
include_properties=include_properties, include_states=include_states)
+
+
+def print_statement_node_as_dot_string(
+    node: core.StatementNode, out: TextIO, include_properties: bool, include_states: bool
+) -> None:
+    """Print statement node as dot representation (to be embedded within a dot graph).
+
+    Parameters
+    ----------
+    node: core.StatementNode
+        The statement node to print.
+    out: TextIO
+        Output stream to print to.
+    include_properties: bool
+        Whether to include detail on the properties of each node (disable to make nodes simpler/smaller).
+    include_states: bool
+        Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller).
+    """
+    subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]] = []
+
+    if include_properties:
+        properties_table = node.get_printable_properties_table()
+        if len(properties_table) > 0:
+            subtables.append(
+                (
+                    "Properties",
+                    node.get_printable_properties_table(),
+                    DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES,
+                )
+            )
+
+    if include_states:
+        subtables.append(
+            (
+                "Before State",
+                get_printable_table_for_state(node.before_state),
+                DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES,
+            )
+        )
+        if core.DEFAULT_EXIT in node.exit_states:
+            subtables.append(
+                (
+                    "Exit State",
+                    get_printable_table_for_state(
+                        node.exit_states[core.DEFAULT_EXIT], node.get_exit_state_transfer_filter()
+                    ),
+                    DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES,
+                )
+            )
+        for exit_type, exit_state in node.exit_states.items():
+            if not isinstance(exit_type, core.DefaultExit):
+                subtables.append(
+                    (
+                        "Exit State (" + exit_type.__class__.__name__ + ")",
+                        get_printable_table_for_state(exit_state, node.get_exit_state_transfer_filter()),
+                        DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES,
+                    )
+                )
+
+    out.write(
+        produce_node_dot_def(
+            node_id=("n" + str(id(node))),
+            node_kind="Statement",
+            node_type=node.__class__.__name__,
+            node_label=(
+                "["
+                + ", ".join(
+                    [str(node.created_debug_sequence_num)]
+                    + ["(" + str(b) + "-" + str(e) + ")" for b, e in node.processed_log]
+                )
+                + "]"
+                if include_states
+                else None
+            ),
+            config=DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE,
+            subtables=subtables,
+        )
+        + "\n"
+    )
+
+
+def print_interpretation_node_as_dot_string(
+    node: core.InterpretationNode, out: TextIO, include_properties: bool, include_states: bool
+) -> None:
+    """Print interpretation node as dot representation (to be embedded within a dot graph).
+
+    Parameters
+    ----------
+    node: core.InterpretationNode
+        The interpretation node to print.
+    out: TextIO
+        Output stream to print to.
+    include_properties: bool
+        Whether to include detail on the properties of each node (disable to make nodes simpler/smaller).
+    include_states: bool
+        Whether to include detail on the abstract state at each node (disable to make nodes simpler/smaller).
+    """
+    subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]] = []
+
+    if include_properties:
+        properties_table = node.get_printable_properties_table()
+        if len(properties_table) > 0:
+            subtables.append(
+                (
+                    "Properties",
+                    node.get_printable_properties_table(),
+                    DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES,
+                )
+            )
+
+    if include_states:
+        subtables.append(
+            (
+                "Before State",
+                get_printable_table_for_state(node.before_state),
+                DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES,
+            )
+        )
+        if core.DEFAULT_EXIT in node.exit_states:
+            subtables.append(
+                (
+                    "Exit State",
+                    get_printable_table_for_state(
+                        node.exit_states[core.DEFAULT_EXIT], node.get_exit_state_transfer_filter()
+                    ),
+                    DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES,
+                )
+            )
+        for exit_type, exit_state in node.exit_states.items():
+            if not isinstance(exit_type, core.DefaultExit):
+                subtables.append(
+                    (
+                        "Exit State (" + exit_type.__class__.__name__ + ")",
+                        get_printable_table_for_state(exit_state, node.get_exit_state_transfer_filter()),
+                        DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES,
+                    )
+                )
+
+    out.write(
+        produce_node_dot_def(
+            node_id=("n" + str(id(node))),
+            node_kind="Interpretation",
+            node_type=node.__class__.__name__,
+            node_label=(
+                "["
+                + ", ".join(
+                    [str(node.created_debug_sequence_num)]
+                    + ["(" + str(b) + "-" + str(e) + ")" for b, e in node.processed_log]
+                )
+                + "]"
+                if include_states
+                else None
+            ),
+            config=DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE,
+            subtables=subtables,
+        )
+        + "\n"
+    )
+    for child_node in node.interpretations.values():
+        out.write("n" + str(id(node)) + " -> " + "n" + str(id(child_node)) + ' [label="interpretation"]\n')
+    for child_node in node.interpretations.values():
+        print_as_dot_string(child_node, out, include_properties=include_properties, include_states=include_states)
+
+
+def escape_for_dot_html_like_label(s: str) -> str:
+    """Return the string escaped for inclusion in a dot html-like label."""
+    return s.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")
+
+
+@dataclass(frozen=True)
+class DotHtmlLikeTableConfiguration:
+    """Configuration for rendering of dot html-like table."""
+
+    #: Background colour for table header.
+    header_colour: str
+    #: Font colour for table header.
+    header_font_colour: str
+    #: Font size for table header.
+    header_font_size: int
+    #: Whether font of table header should be bold.
+    header_font_bold: bool
+    #: Background colour for table body.
+    body_colour: str
+    #: Font colour for table body.
+    body_font_colour: str
+    #: Font size for table body.
+    body_font_size: int
+
+
+DARK_BLUE = "#6f757eff"
+LIGHT_BLUE = "#dae2efff"
+DARK_BROWN = "#aa643bff"
+LIGHT_BROWN = "#f5debdff"
+DARK_PINK = "#a36472ff"
+LIGHT_PINK = "#f6dae1ff"
+LIGHT_TEXT = "#ffffffff"
+DARK_TEXT = "#161513ff"
+DARK_GREY = "#7a736eff"
+LIGHT_GREY = "#e4e1dcff"
+
+
+DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_PINK,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=24,
+    header_font_bold=True,
+    body_colour=LIGHT_PINK,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE_PROPERTIES = dataclasses.replace(
+    DOT_HTML_LIKE_TABLE_CONFIG_INTERPRETATION_NODE, header_font_size=12
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_BROWN,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=24,
+    header_font_bold=True,
+    body_colour=LIGHT_BROWN,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE_PROPERTIES = dataclasses.replace(
+    DOT_HTML_LIKE_TABLE_CONFIG_CONTROL_FLOW_GRAPH_NODE, header_font_size=12
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_BLUE,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=24,
+    header_font_bold=True,
+    body_colour=LIGHT_BLUE,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE_PROPERTIES = dataclasses.replace(
+    DOT_HTML_LIKE_TABLE_CONFIG_STATEMENT_NODE, header_font_size=12
+)
+
+DOT_HTML_LIKE_TABLE_CONFIG_STATE = DotHtmlLikeTableConfiguration(
+    header_colour=DARK_GREY,
+    header_font_colour=LIGHT_TEXT,
+    header_font_size=12,
+    header_font_bold=True,
+    body_colour=LIGHT_GREY,
+    body_font_colour=DARK_TEXT,
+    body_font_size=6,
+)
+
+
+def truncate_long_strings_for_display(s: str) -> str:
+    """Truncate a long string if necessary for display."""
+    if len(s) > 100:
+        return s[:100] + "..."
+    return s
+
+
+def produce_dot_html_like_table(
+    header: str, data: dict[str, set[tuple[str | None, str]]], config: DotHtmlLikeTableConfiguration
+) -> str:
+    """Return the given data table rendered as a dot html-like label table.
+
+    See module comment for description of how data tables are rendered.
+    """
+    # The markup below uses graphviz's html-like label syntax: a two-column table with a
+    # header row, where each value set is rendered as a nested vertical table.
+    lines: list[str] = []
+    lines.append(f'<table border="0" cellborder="1" cellspacing="0" bgcolor="{config.body_colour}">')
+    lines.append(
+        f'    <tr><td colspan="2" bgcolor="{config.header_colour}">'
+        f'<font color="{config.header_font_colour}" point-size="{config.header_font_size}">'
+        + ("<b>" if config.header_font_bold else "")
+        + escape_for_dot_html_like_label(header)
+        + ("</b>" if config.header_font_bold else "")
+        + "</font></td></tr>"
+    )
+
+    for key, vals in data.items():
+        lines.append(
+            f'    <tr><td><font color="{config.body_font_colour}" point-size="{config.body_font_size}">'
+            + escape_for_dot_html_like_label(key)
+            + '</font></td><td><table border="0" cellborder="0" cellspacing="0">'
+        )
+        if len(vals) > 0:
+            for val in vals:
+                label_part = (
+                    (
+                        f'<font color="{config.header_colour}"><b>['
+                        + escape_for_dot_html_like_label(val[0])
+                        + "]</b></font> "
+                    )
+                    if val[0] is not None
+                    else ""
+                )
+                lines.append(
+                    "        <tr><td>"
+                    + label_part
+                    + f'<font color="{config.body_font_colour}" point-size="{config.body_font_size}">'
+                    + escape_for_dot_html_like_label(truncate_long_strings_for_display(val[1]))
+                    + "</font></td></tr>"
+                )
+        else:
+            lines.append("        <tr><td></td></tr>")

+        lines.append("    </table></td></tr>")
+
+    lines.append("</table>")
+
+    return "\n".join(lines)
+
+
+def produce_node_dot_html_like_label(
+    node_kind: str,
+    node_type: str,
+    node_label: str | None,
+    config: DotHtmlLikeTableConfiguration,
+    subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]],
+) -> str:
+    """Return the given node table data rendered as a dot html-like label table.
+
+    Contains nested tables for each subtable (see module comment for description of how data tables are rendered).
+    """
+    lines: list[str] = []
+    lines.append('<<table border="0" cellborder="0" cellspacing="0">')
+    lines.append(
+        "    <tr><td>"
+        + f'<font color="{config.header_colour}" point-size="{config.header_font_size}">'
+        + ("<b>" if config.header_font_bold else "")
+        + escape_for_dot_html_like_label(node_kind)
+        + ("</b>" if config.header_font_bold else "")
+        + "</font></td></tr>"
+    )
+    lines.append(
+        f'    <tr><td><font color="{config.header_colour}" point-size="{config.header_font_size}">'
+        + ("<b>" if config.header_font_bold else "")
+        + escape_for_dot_html_like_label(node_type)
+        + ("</b>" if config.header_font_bold else "")
+        + "</font></td></tr>"
+    )
+    if node_label is not None:
+        lines.append(
+            "    <tr><td>"
+            + (
+                (
+                    f'<font color="{config.body_font_colour}" point-size="{config.body_font_size}">'
+                    + "<i>"
+                    + escape_for_dot_html_like_label(node_label)
+                    + "</i></font>"
+                )
+                if node_label is not None
+                else ""
+            )
+            + "</td></tr>"
+        )
+
+    for subtable in subtables:
+        subtable_header, subtable_data, subtable_config = subtable
+        lines.append(
+            "    <tr><td>"
+            + produce_dot_html_like_table(subtable_header, subtable_data, subtable_config)
+            + "</td></tr>"
+        )
+
+    lines.append("</table>>")
+
+    return "\n".join(lines)
+
+
+def produce_node_dot_def(
+    node_id: str,
+    node_kind: str,
+    node_type: str,
+    node_label: str | None,
+    config: DotHtmlLikeTableConfiguration,
+    subtables: list[tuple[str, dict[str, set[tuple[str | None, str]]], DotHtmlLikeTableConfiguration]],
+) -> str:
+    """Return the given node table data rendered as a dot node containing a html-like label table.
+
+    Contains nested tables for each subtable (see module comment for description of how data tables
+    are rendered).
+    """
+    return (
+        '"'
+        + node_id
+        + '" [shape=rectangle, fillcolor="'
+        + config.body_colour
+        + '" fontname="Oracle Sans Tab", label='
+        + produce_node_dot_html_like_label(node_kind, node_type, node_label, config, subtables)
+        + "]"
+    )
+
+
+def add_context_owned_scopes_to_properties_table(
+    table: dict[str, set[tuple[str | None, str]]], context: core.ContextRef[core.Context]
+) -> None:
+    """Add an entry to the given data table listing the scopes owned by the given context."""
+    owned_scopes = core.get_owned_scopes(context)
+    if len(owned_scopes) > 0:
+        table["scopes"] = {(None, scope.to_datalog_fact_string(include_outer_scope=True)) for scope in owned_scopes}
diff --git a/src/macaron/code_analyzer/dataflow_analysis/run_analysis_standalone.py b/src/macaron/code_analyzer/dataflow_analysis/run_analysis_standalone.py
new file mode 100644
index 000000000..faaf084ea
--- /dev/null
+++ b/src/macaron/code_analyzer/dataflow_analysis/run_analysis_standalone.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""Module providing an entry point to run the dataflow analysis independently of the Macaron command.
+
+For experimentation and debugging purposes only.
+""" + +import sys + +from macaron.code_analyzer.dataflow_analysis import analysis, bash, core, github, printing +from macaron.slsa_analyzer.build_tool import Maven + + +def main() -> None: + """Entry point for running standalone analysis.""" + raw_workflow_node = analysis.analyse_github_workflow_file(sys.argv[1], None) + with open("dot", "w", encoding="utf-8") as f: + printing.print_as_dot_graph(raw_workflow_node, f, include_properties=True, include_states=True) + + nodes: list[core.Node] = [raw_workflow_node] + while len(nodes) > 0: + node = nodes.pop() + + if isinstance(node, github.GitHubActionsActionStepNode): + print("Action {") # noqa: T201 + print(" name: " + node.uses_name) # noqa: T201 + print(" version: " + node.uses_version if node.uses_version is not None else "") # noqa: T201 + print(" with {") # noqa: T201 + for key, val in node.with_parameters.items(): + print(" " + key + ": " + val.to_datalog_fact_string()) # noqa: T201 + print(" }") # noqa: T201 + print("}") # noqa: T201 + if isinstance(node, bash.BashSingleCommandNode): + print("REACHABLE SECRETS: " + str(analysis.get_reachable_secrets(node))) # noqa: T201 + for child in node.children(): + nodes.append(child) + + build_tool = Maven() + + for build_cmd in analysis.get_build_tool_commands(core.NodeForest([raw_workflow_node]), build_tool): + print("build command: " + str(build_cmd["command"])) # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/src/macaron/parsers/bashparser.py b/src/macaron/parsers/bashparser.py index 0d5cd66c1..ac2ceed68 100644 --- a/src/macaron/parsers/bashparser.py +++ b/src/macaron/parsers/bashparser.py @@ -13,65 +13,16 @@ import logging import os import subprocess # nosec B404 -from enum import Enum -from typing import Any +from typing import cast -from macaron.code_analyzer.call_graph import BaseNode from macaron.config.defaults import defaults from macaron.config.global_config import global_config -from macaron.errors import CallGraphError, ParseError -from macaron.parsers.actionparser import get_run_step -from macaron.parsers.github_workflow_model import Step +from macaron.errors import ParseError +from macaron.parsers.bashparser_model import File, Word logger: logging.Logger = logging.getLogger(__name__) -class BashScriptType(Enum): - """This class is used for different bash script types.""" - - NONE = "None" - INLINE = "inline" # Inline bash script. - FILE = "file" # Bash script file. - - -class BashNode(BaseNode): - """This class represents a callgraph node for bash commands.""" - - def __init__( - self, - name: str, - node_type: BashScriptType, - source_path: str, - parsed_step_obj: Step | None, - parsed_bash_obj: dict, - **kwargs: Any, - ) -> None: - """Initialize instance. - - Parameters - ---------- - name : str - Name of the bash script file or the step name if the script is inlined. - node_type : BashScriptType - The type of the script. - source_path : str - The path of the script. - parsed_step_obj : Step | None - The parsed step object. - parsed_bash_obj : dict - The parsed bash script object. - """ - super().__init__(**kwargs) - self.name = name - self.node_type: BashScriptType = node_type - self.source_path = source_path - self.parsed_step_obj = parsed_step_obj - self.parsed_bash_obj = parsed_bash_obj - - def __str__(self) -> str: - return f"BashNode({self.name},{self.node_type})" - - def parse_file(file_path: str, macaron_path: str | None = None) -> dict: """Parse a bash script file. 
@@ -157,111 +108,107 @@ def parse(bash_content: str, macaron_path: str | None = None) -> dict:
         raise ParseError("Error while loading the parsed bash script.") from error
 
 
-def create_bash_node(
-    name: str,
-    node_id: str | None,
-    node_type: BashScriptType,
-    source_path: str,
-    ci_step_ast: Step | None,
-    repo_path: str,
-    caller: BaseNode,
-    recursion_depth: int,
-    macaron_path: str | None = None,
-) -> BashNode:
-    """Create a callgraph node for a bash script.
+def parse_raw(bash_content: str, macaron_path: str | None = None) -> File:
+    """Parse a bash script's content.
+
+    Parameters
+    ----------
+    bash_content : str
+        Bash script content.
+    macaron_path : str | None
+        Macaron's root path (optional).
+
+    Returns
+    -------
+    bashparser_model.File
+        The parsed bash script AST in typed JSON (dict) format.
+
+    Raises
+    ------
+    ParseError
+        When parsing fails with errors.
+    """
+    if not macaron_path:
+        macaron_path = global_config.macaron_path
+    cmd = [
+        os.path.join(macaron_path, "bin", "bashparser"),
+        "-input",
+        bash_content,
+        "-raw",
+    ]
+
+    try:
+        result = subprocess.run(  # nosec B603
+            cmd,
+            capture_output=True,
+            check=True,
+            cwd=macaron_path,
+            timeout=defaults.getint("bashparser", "timeout", fallback=30),
+        )
+    except (
+        subprocess.CalledProcessError,
+        subprocess.TimeoutExpired,
+        FileNotFoundError,
+    ) as error:
+        raise ParseError("Error while parsing bash script.") from error
+
+    try:
+        if result.returncode == 0:
+            return cast(File, json.loads(result.stdout.decode("utf-8")))
+
+        raise ParseError(f"Bash script parser failed: {result.stderr.decode('utf-8')}")
+
+    except json.JSONDecodeError as error:
+        raise ParseError("Error while loading the parsed bash script.") from error
 
-    A bash node can have the following types:
-    * :class:`BashScriptType.INLINE` when it is inlined in a CI workflow.
-    * :class:`BashScriptType.FILE` when it is a bash script file.
 
+def parse_expr(bash_expr_content: str, macaron_path: str | None = None) -> list[Word]:
+    """Parse a bash expression's content.
+
     Parameters
     ----------
-    name: str
-        A name to be used as the identifier of the node.
-    node_id: str | None
-        The node ID if defined.
-    node_type: BashScriptType
-        The type of the node.
-    source_path: str
-        The file that contains the bash script.
-    ci_step_ast: Step | None
-        The AST of the CI step that runs a bash script.
-    repo_path: str
-        The path to the target repo.
-    caller: BaseNode
-        The caller node.
-    recursion_depth: int
-        The number of times this function is called recursively.
-    macaron_path=None
-        The path to the Macaron module.
+    bash_expr_content : str
+        Bash expression content.
+    macaron_path : str | None
+        Macaron's root path (optional).
 
     Returns
     -------
-    BashNode
-        A bash node object.
+    list[bashparser_model.Word]
+        The parsed bash expression AST in typed JSON (dict) format.
 
     Raises
     ------
-    CallGraphError
-        When unable to create a bash node.
+    ParseError
+        When parsing fails with errors.
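+
+    For illustration, ``parse_expr("$HOME/bin")`` would typically return a single ``Word``
+    whose parts are a ``ParamExp`` (for ``$HOME``) followed by a ``Lit`` (for ``/bin``),
+    mirroring the JSON emitted by the ``bashexprparser`` binary.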
""" - if recursion_depth > defaults.getint("bashparser", "recursion_depth", fallback=3): - raise CallGraphError(f"The analysis has reached maximum recursion depth {recursion_depth} at {source_path}.") - parsed_bash_script = {} - working_dir = None - match node_type: - case BashScriptType.INLINE: - if ci_step_ast is None: - raise CallGraphError(f"Unable to find the parsed AST for the CI step at {source_path}.") - working_dir = ci_step_ast.get("working-directory") - run_script = get_run_step(ci_step_ast) - if run_script is None: - raise CallGraphError(f"Invalid run step at {source_path}.") - try: - parsed_bash_script = parse(run_script, macaron_path=macaron_path) - except ParseError as error: - logger.debug(error) - case BashScriptType.FILE: - try: - parsed_bash_script = parse_file(source_path, macaron_path=macaron_path) - except ParseError as error: - logger.debug(error) - bash_node = BashNode( - name, - node_type, - source_path, - parsed_step_obj=ci_step_ast, - parsed_bash_obj=parsed_bash_script, - node_id=node_id, - caller=caller, - ) - caller_commands = parsed_bash_script.get("commands", []) - - # Parse the bash script files called from the current script. - if caller_commands and repo_path: - for cmd in caller_commands: - # Parse the scripts that end with `.sh`. - # TODO: parse Makefiles for bash commands. - if not cmd or not cmd[0] or not cmd[0].endswith(".sh"): - continue - - # Check for path traversal patterns before analyzing a bash file. - bash_file_path = os.path.realpath(os.path.join(repo_path, working_dir or "", cmd[0])) - if os.path.exists(bash_file_path) and bash_file_path.startswith(repo_path): - try: - callee = create_bash_node( - name=cmd[0], - node_id=node_id, - node_type=BashScriptType.FILE, - source_path=bash_file_path, - ci_step_ast=None, - repo_path=repo_path, - caller=bash_node, - recursion_depth=recursion_depth + 1, - macaron_path=macaron_path, - ) - except CallGraphError as error: - raise error - bash_node.add_callee(callee) - return bash_node + if not macaron_path: + macaron_path = global_config.macaron_path + cmd = [ + os.path.join(macaron_path, "bin", "bashexprparser"), + "-input", + bash_expr_content, + ] + try: + result = subprocess.run( # nosec B603 + cmd, + capture_output=True, + check=True, + cwd=macaron_path, + timeout=defaults.getint("bashparser", "timeout", fallback=30), + ) + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + FileNotFoundError, + ) as error: + raise ParseError("Error while parsing bash expr.") from error + + try: + if result.returncode == 0: + return cast(list[Word], json.loads(result.stdout.decode("utf-8"))) + + raise ParseError(f"Bash script parser failed: {result.stderr.decode('utf-8')}") + + except json.JSONDecodeError as error: + raise ParseError("Error while loading the parsed bash script.") from error diff --git a/src/macaron/parsers/bashparser_model.py b/src/macaron/parsers/bashparser_model.py new file mode 100644 index 000000000..08ab38850 --- /dev/null +++ b/src/macaron/parsers/bashparser_model.py @@ -0,0 +1,836 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. +# Licensed under the Universal Permissive License v 1.0 as shown at https:#oss.oracle.com/licenses/upl/. 
+# pylint: skip-file +# flake8: noqa + +# Type definitions for Bash AST as produced (and json-serialised) by the "mvdan.cc/sh/v3/syntax" bash parser + +from __future__ import annotations + +from enum import Enum +from typing import Any, Literal, NotRequired, Optional, TypedDict, TypeGuard, Union + + +class Pos(TypedDict): + Offset: int + Line: int + Col: int + + +class Comment(TypedDict): + Hash: Pos + Text: str + + +WordPart = Union[ + "Lit", "SglQuoted", "DblQuoted", "ParamExp", "CmdSubst", "ArithmExp", "ProcSubst", "ExtGlob", "BraceExp" +] + +ArithmExpr = Union["BinaryArithm", "UnaryArithm", "ParenArithm", "Word"] + +UnAritOperator = int + + +class UnAritOperators(Enum): + Not = 34 # ! + BitNegation = 35 # ~ + Inc = 36 # ++ + Dec = 37 # -- + Plus = 68 # + + Minus = 70 # - + + +class UnaryArithm(TypedDict): + Type: Literal["UnaryArithm"] + Pos: Pos + End: Pos + OpPos: Pos + Op: UnAritOperator + Post: NotRequired[bool] + X: ArithmExpr + + +def is_unary_arithm(expr: ArithmExpr) -> TypeGuard[UnaryArithm]: + return expr.get("Type", "") == "UnaryArithm" + + +BinAritOperator = int + + +class BinAritOperators(Enum): + Add = 68 # + + Sub = 70 # - + Mul = 38 # * + Quo = 85 # / + Rem = 76 # % + Pow = 39 # ** + Eql = 40 # == + Gtr = 54 # > + Lss = 56 # < + Neq = 41 # != + Leq = 42 # <= + Geq = 43 # >= + And = 9 # & + Or = 12 # | + Xor = 80 # ^ + Shr = 55 # >> + Shl = 61 # << + + AndArit = 10 # && + OrArit = 11 # || + Comma = 82 # , + TernQuest = 72 # ? + TernColon = 87 # : + + Assgn = 74 # = + AddAssgn = 44 # += + SubAssgn = 45 # -= + MulAssgn = 46 # *= + QuoAssgn = 47 # /= + RemAssgn = 48 # %= + AndAssgn = 49 # &= + OrAssgn = 50 # |= + XorAssgn = 51 # ^= + ShlAssgn = 52 # <<= + ShrAssgn = 53 # >>= + + +class BinaryArithm(TypedDict): + Type: Literal["BinaryArithm"] + Pos: Pos + End: Pos + OpPos: Pos + Op: BinAritOperator + X: ArithmExpr + Y: ArithmExpr + + +def is_binary_arithm(expr: ArithmExpr) -> TypeGuard[BinaryArithm]: + return expr.get("Type", "") == "BinaryArithm" + + +class ParenArithm(TypedDict): + Type: Literal["ParenArithm"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + X: ArithmExpr + + +def is_paren_arithm(expr: ArithmExpr) -> TypeGuard[ParenArithm]: + return expr.get("Type", "") == "ParenArithm" + + +def is_word_arithm(expr: ArithmExpr) -> TypeGuard[Word]: + return "Type" not in expr + + +class Lit(TypedDict): + Type: Literal["Lit"] + Pos: Pos + End: Pos + ValuePos: Pos + ValueEnd: Pos + Value: str + + +def is_lit(part: WordPart) -> TypeGuard[Lit]: + return part["Type"] == "Lit" + + +class SglQuoted(TypedDict): + Type: Literal["SglQuoted"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Dollar: NotRequired[bool] + Value: str + + +def is_sgl_quoted(part: WordPart) -> TypeGuard[SglQuoted]: + return part["Type"] == "SglQuoted" + + +class DblQuoted(TypedDict): + Type: Literal["DblQuoted"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Dollar: NotRequired[bool] + Parts: NotRequired[list[WordPart]] + + +def is_dbl_quoted(part: WordPart) -> TypeGuard[DblQuoted]: + return part["Type"] == "DblQuoted" + + +class Slice(TypedDict): + Offset: ArithmExpr + Length: ArithmExpr + + +class Replace(TypedDict): + All: NotRequired[bool] + Orig: Word + With: Word + + +ParNamesOperator = int + + +class ParNamesOperators(Enum): + NamesPrefix = 38 # * + NamesPrefixWords = 84 # @ + + +ParExpOperator = int + + +class ParExpOperators(Enum): + AlternateUnset = 68 # + + AlternateUnsetOrNull = 69 # :+ + DefaultUnset = 70 # - + DefaultUnsetOrNull = 71 # :- + ErrorUnset = 72 # ? + ErrorUnsetOrNull = 73 # :? 
+ AssignUnset = 74 # = + AssignUnsetOrNull = 75 # := + RemSmallSuffix = 76 # % + RemLargeSuffix = 77 # %% + RemSmallPrefix = 78 # # + RemLargePrefix = 79 # ## + UpperFirst = 80 # ^ + UpperAll = 81 # ^^ + LowerFirst = 82 # , + LowerAll = 83 # ,, + OtherParamOps = 84 # @ + + +class Expansion(TypedDict): + Op: ParExpOperator + Word: Word + + +class ParamExp(TypedDict): + Type: Literal["ParamExp"] + Pos: Pos + End: Pos + Dollar: NotRequired[Pos] + Rbrace: NotRequired[Pos] + Short: NotRequired[bool] + Excl: NotRequired[bool] + Length: NotRequired[bool] + Width: NotRequired[bool] + Param: Lit + Index: NotRequired[ArithmExpr] + Slice: NotRequired[Slice] + Repl: NotRequired[Replace] + Names: NotRequired[ParNamesOperator] + Exp: NotRequired[Expansion] + + +def is_param_exp(part: WordPart) -> TypeGuard[ParamExp]: + return part["Type"] == "ParamExp" + + +class CmdSubst(TypedDict): + Type: Literal["CmdSubst"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] + Backquotes: NotRequired[bool] + TempFile: NotRequired[bool] + ReplyVar: NotRequired[bool] + + +def is_cmd_subst(part: WordPart) -> TypeGuard[CmdSubst]: + return part["Type"] == "CmdSubst" + + +class ArithmExp(TypedDict): + Type: Literal["ArithmExp"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Bracket: NotRequired[bool] + Unsigned: NotRequired[bool] + X: ArithmExpr + + +def is_arithm_exp(part: WordPart) -> TypeGuard[ArithmExp]: + return part["Type"] == "ArithmExp" + + +ProcOperator = int + + +class ProcOperators(Enum): + CmdIn = 66 # <( + CmdOut = 67 # >( + + +class ProcSubst(TypedDict): + Type: Literal["ProcSubst"] + Pos: Pos + End: Pos + OpPos: Pos + Rparen: Pos + Op: ProcOperator + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] + + +def is_proc_subst(part: WordPart) -> TypeGuard[ProcSubst]: + return part["Type"] == "ProcSubst" + + +GlobOperator = int + + +class GlobOperators(Enum): + GlobZeroOrOne = 122 # ?( + GlobZeroOrMore = 123 # *( + GlobOneOrMore = 124 # +( + GlobOne = 125 # @( + GlobExcept = 126 # !( + + +class ExtGlob(TypedDict): + Type: Literal["ExtGlob"] + Pos: Pos + End: Pos + OpPos: Pos + Op: GlobOperator + Pattern: Lit + + +def is_ext_glob(part: WordPart) -> TypeGuard[ExtGlob]: + return part["Type"] == "ExtGlob" + + +class BraceExp(TypedDict): + Type: Literal["BraceExp"] + Pos: Pos + End: Pos + Sequence: NotRequired[bool] + Elems: list[Word] + + +def is_brace_exp(part: WordPart) -> TypeGuard[BraceExp]: + return part["Type"] == "BraceExp" + + +class Word(TypedDict): + Parts: list[WordPart] + + +RedirOperator = int + + +class RedirOperators(Enum): + RdrOut = 54 # > + AppOut = 55 # >> + RdrIn = 56 # < + RdrInOut = 57 # <> + DplIn = 58 # <& + DplOut = 59 # >& + ClbOut = 60 # >| + Hdoc = 61 # << + DashHdoc = 62 # <<- + WordHdoc = 63 # <<< + RdrAll = 64 # &> + AppAll = 65 # &>> + + +class Redirect(TypedDict): + Pos: Pos + End: Pos + OpPos: Pos + Op: RedirOperator + N: NotRequired[Lit] + Word: NotRequired[Word] + Hdoc: NotRequired[Word] + + +class ArrayElem(TypedDict): + Pos: Pos + End: Pos + Index: NotRequired[ArithmExpr] + Value: NotRequired[Word] + Comments: NotRequired[list[Comment]] + + +class ArrayExpr(TypedDict): + Pos: Pos + End: Pos + Lparent: Pos + Rparen: Pos + Elems: list[ArrayElem] + Last: NotRequired[list[Comment]] + + +class Assign(TypedDict): + Pos: Pos + End: Pos + Append: NotRequired[bool] + Naked: NotRequired[bool] + Name: Lit + Index: NotRequired[ArithmExpr] + Value: NotRequired[Word] + Array: NotRequired[ArrayExpr] + + +Command = Union[ + "CallExpr", + 
"IfClause", + "WhileClause", + "ForClause", + "CaseClause", + "Block", + "Subshell", + "BinaryCmd", + "FuncDecl", + "ArithmCmd", + "TestClause", + "DeclClause", + "LetClause", + "TimeClause", + "CoprocClause", + "TestDecl", +] + + +class CallExpr(TypedDict): + Type: Literal["CallExpr"] + Pos: Pos + End: Pos + Assigns: NotRequired[list[Assign]] + Args: NotRequired[list[Word]] + + +def is_call_expr(cmd: Command) -> TypeGuard[CallExpr]: + return cmd["Type"] == "CallExpr" + + +class IfClause(TypedDict): + Type: Literal["IfClause"] + Pos: Pos + End: Pos + Position: Pos + ThenPos: NotRequired[Pos] + FiPos: NotRequired[Pos] + Cond: list[Stmt] + CondLast: NotRequired[list[Comment]] + Then: list[Stmt] + ThenLast: NotRequired[list[Comment]] + Else: NotRequired[IfClause | ElseClause] + Last: NotRequired[list[Comment]] + + +def is_if_clause(cmd: Command) -> TypeGuard[IfClause]: + return cmd["Type"] == "IfClause" + + +class ElseClause(TypedDict): + Pos: Pos + End: Pos + Position: Pos + FiPos: NotRequired[Pos] + Then: list[Stmt] + ThenLast: NotRequired[list[Comment]] + Last: NotRequired[list[Comment]] + + +def is_else_clause(clause: IfClause | ElseClause) -> TypeGuard[ElseClause]: + return "Type" not in clause + + +class WhileClause(TypedDict): + Type: Literal["WhileClause"] + Pos: Pos + End: Pos + WhilePos: Pos + DoPos: Pos + DonePos: Pos + Cond: list[Stmt] + CondLast: NotRequired[list[Comment]] + Do: list[Stmt] + DoLast: NotRequired[list[Comment]] + + +def is_while_clause(cmd: Command) -> TypeGuard[WhileClause]: + return cmd["Type"] == "WhileClause" + + +Loop = Union["WordIter", "CStyleLoop"] + + +class WordIter(TypedDict): + Type: Literal["WordIter"] + Pos: Pos + End: Pos + Name: Lit + InPos: Pos + Items: list[Word] + + +def is_word_iter(loop: Loop) -> TypeGuard[WordIter]: + return loop["Type"] == "WordIter" + + +class CStyleLoop(TypedDict): + Type: Literal["CStyleLoop"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + Init: NotRequired[ArithmExpr] + Cond: NotRequired[ArithmExpr] + Post: NotRequired[ArithmExpr] + + +def is_cstyle_loop(loop: Loop) -> TypeGuard[CStyleLoop]: + return loop["Type"] == "CStyleLoop" + + +class ForClause(TypedDict): + Type: Literal["ForClause"] + Pos: Pos + End: Pos + ForPos: Pos + DoPos: Pos + DonePos: Pos + Select: NotRequired[bool] + Braces: NotRequired[bool] + Loop: Loop + Do: list[Stmt] + DoLast: NotRequired[list[Comment]] + + +def is_for_clause(cmd: Command) -> TypeGuard[ForClause]: + return cmd["Type"] == "ForClause" + + +CaseOperator = int + + +class CaseOperators(Enum): + Break = 30 # ;; + Fallthrough = 31 # ;& + Resume = 32 # ;;& + ResumeKorn = 33 # ;| + + +class CaseItem(TypedDict): + Pos: Pos + End: Pos + Op: CaseOperator + OpPos: Pos + Comments: NotRequired[list[Comment]] + Patterns: list[Word] + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] + + +class CaseClause(TypedDict): + Type: Literal["CaseClause"] + Pos: Pos + End: Pos + Case: Pos + In: Pos + Esac: Pos + Braces: NotRequired[bool] + Word: Word + Items: list[CaseItem] + Last: NotRequired[list[Comment]] + + +def is_case_clause(cmd: Command) -> TypeGuard[CaseClause]: + return cmd["Type"] == "CaseClause" + + +class Block(TypedDict): + Type: Literal["Block"] + Pos: Pos + End: Pos + Lbrace: Pos + Rbrace: Pos + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] + + +def is_block(cmd: Command) -> TypeGuard[Block]: + return cmd["Type"] == "Block" + + +class Subshell(TypedDict): + Type: Literal["Subshell"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + Stmts: list[Stmt] + Last: 
NotRequired[list[Comment]] + + +def is_subshell(cmd: Command) -> TypeGuard[Subshell]: + return cmd["Type"] == "Subshell" + + +BinCmdOperator = int + + +class BinCmdOperators(Enum): + AndStmt = 10 # && + OrStmt = 11 # || + Pipe = 12 # | + PipeAll = 13 # |& + + +class BinaryCmd(TypedDict): + Type: Literal["BinaryCmd"] + Pos: Pos + End: Pos + OpPos: Pos + Op: BinCmdOperator + X: Stmt + Y: Stmt + + +def is_binary_cmd(cmd: Command) -> TypeGuard[BinaryCmd]: + return cmd["Type"] == "BinaryCmd" + + +class FuncDecl(TypedDict): + Type: Literal["FuncDecl"] + Pos: Pos + End: Pos + Position: Pos + RsrvWord: NotRequired[bool] + Parens: NotRequired[bool] + Name: Lit + Body: Stmt + + +def is_func_decl(cmd: Command) -> TypeGuard[FuncDecl]: + return cmd["Type"] == "FuncDecl" + + +class ArithmCmd(TypedDict): + Type: Literal["ArithmCmd"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + Unsigned: NotRequired[bool] + X: ArithmExpr + + +def is_arithm_cmd(cmd: Command) -> TypeGuard[ArithmCmd]: + return cmd["Type"] == "ArithmCmd" + + +TestExpr = Union["BinaryTest", "UnaryTest", "ParenTest", "Word"] + +BinTestOperator = int + + +class BinTestOperators(Enum): + TsReMatch = 112 # =~ + TsNewer = 113 # -nt + TsOlder = 114 # -ot + TsDevIno = 115 # -ef + TsEql = 116 # -eq + TsNeq = 117 # -ne + TsLeq = 118 # -le + TsGeq = 119 # -ge + TsLss = 120 # -lt + TsGtr = 121 # -gt + AndTest = 10 # && + OrTest = 11 # || + TsMatchShort = 74 # = + TsMatch = 40 # == + TsNoMatch = 41 # != + TsBefore = 56 # < + TsAfter = 54 # > + + +class BinaryTest(TypedDict): + Type: Literal["BinaryTest"] + Pos: Pos + End: Pos + OpPos: Pos + Op: BinTestOperator + X: TestExpr + Y: TestExpr + + +def is_binary_test(test_expr: TestExpr) -> TypeGuard[BinaryTest]: + return test_expr.get("Type", "") == "BinaryTest" + + +UnTestOperator = int + + +class UnTestOperators(Enum): + TsExists = 88 # -e + TsRegFile = 89 # -f + TsDirect = 90 # -d + TsCharSp = 91 # -c + TsBlckSp = 92 # -b + TsNmPipe = 93 # -p + TsSocket = 94 # -S + TsSmbLink = 95 # -L + TsSticky = 96 # -k + TsGIDSet = 97 # -g + TsUIDSet = 98 # -u + TsGrpOwn = 99 # -G + TsUsrOwn = 100 # -O + TsModif = 101 # -N + TsRead = 102 # -r + TsWrite = 103 # -w + TsExec = 104 # -x + TsNoEmpty = 105 # -s + TsFdTerm = 106 # -t + TsEmpStr = 107 # -z + TsNempStr = 108 # -n + TsOptSet = 109 # -o + TsVarSet = 110 # -v + TsRefVar = 111 # -R + TsNot = 34 # ! 
+ + +class UnaryTest(TypedDict): + Type: Literal["UnaryTest"] + Pos: Pos + End: Pos + OpPos: Pos + Op: UnTestOperator + X: TestExpr + + +def is_unary_test(test_expr: TestExpr) -> TypeGuard[UnaryTest]: + return test_expr.get("Type", "") == "UnaryTest" + + +class ParenTest(TypedDict): + Type: Literal["ParenTest"] + Pos: Pos + End: Pos + Lparen: Pos + Rparen: Pos + X: TestExpr + + +def is_paren_test(test_expr: TestExpr) -> TypeGuard[ParenTest]: + return test_expr.get("Type", "") == "ParenTest" + + +def is_word_test(test_expr: TestExpr) -> TypeGuard[Word]: + return "Type" not in test_expr + + +class TestClause(TypedDict): + Type: Literal["TestClause"] + Pos: Pos + End: Pos + Left: Pos + Right: Pos + X: TestExpr + + +def is_test_clause(cmd: Command) -> TypeGuard[TestClause]: + return cmd["Type"] == "TestClause" + + +class DeclClause(TypedDict): + Type: Literal["DeclClause"] + Pos: Pos + End: Pos + Variant: Lit + Args: list[Assign] + + +def is_decl_clause(cmd: Command) -> TypeGuard[DeclClause]: + return cmd["Type"] == "DeclClause" + + +class LetClause(TypedDict): + Type: Literal["LetClause"] + Pos: Pos + End: Pos + Let: Pos + Exprs: list[ArithmExpr] + + +def is_let_clause(cmd: Command) -> TypeGuard[LetClause]: + return cmd["Type"] == "LetClause" + + +class TimeClause(TypedDict): + Type: Literal["TimeClause"] + Pos: Pos + End: Pos + Time: Pos + PosixFormat: NotRequired[bool] + Stmt: Stmt + + +def is_time_clause(cmd: Command) -> TypeGuard[TimeClause]: + return cmd["Type"] == "TimeClause" + + +class CoprocClause(TypedDict): + Type: Literal["CoprocClause"] + Pos: Pos + End: Pos + Coproc: Pos + Name: Word + Stmt: Stmt + + +def is_coproc_clause(cmd: Command) -> TypeGuard[CoprocClause]: + return cmd["Type"] == "CoprocClause" + + +class TestDecl(TypedDict): + Type: Literal["TestDecl"] + Pos: Pos + End: Pos + Position: Pos + Description: Word + Body: Stmt + + +def is_test_decl(cmd: Command) -> TypeGuard[TestDecl]: + return cmd["Type"] == "TestDecl" + + +class Stmt(TypedDict): + Comments: NotRequired[list[Comment]] + Cmd: Command + Pos: Pos + End: Pos + Position: Pos + Semicolon: NotRequired[Pos] + Negated: NotRequired[bool] + Background: NotRequired[bool] + Coprocess: NotRequired[bool] + Redirs: NotRequired[list[Redirect]] + + +class File(TypedDict): + Type: Literal["File"] + Name: NotRequired[str] + Pos: Pos + End: Pos + Stmts: list[Stmt] + Last: NotRequired[list[Comment]] diff --git a/src/macaron/slsa_analyzer/build_tool/base_build_tool.py b/src/macaron/slsa_analyzer/build_tool/base_build_tool.py index dfc286c4a..511f4c9df 100644 --- a/src/macaron/slsa_analyzer/build_tool/base_build_tool.py +++ b/src/macaron/slsa_analyzer/build_tool/base_build_tool.py @@ -3,6 +3,8 @@ """This module contains the BaseBuildTool class to be inherited by other specific Build Tools.""" +from __future__ import annotations + import glob import itertools import json @@ -14,14 +16,16 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import TypedDict +from typing import TYPE_CHECKING, TypedDict -from macaron.code_analyzer.call_graph import BaseNode from macaron.config.defaults import defaults from macaron.dependency_analyzer.cyclonedx import DependencyAnalyzer, NoneDependencyAnalyzer from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.checks.check_result import Confidence, Evidence, EvidenceWeightMap +if TYPE_CHECKING: + from macaron.code_analyzer.dataflow_analysis.core import Node + logger: logging.Logger = logging.getLogger(__name__) @@ 
-57,7 +61,7 @@ class BuildToolCommand(TypedDict): ci_path: str #: The CI step object that calls the command. - step_node: BaseNode | None + step_node: Node | None #: The list of name of reachable variables that contain secrets.""" reachable_secrets: list[str] diff --git a/src/macaron/slsa_analyzer/checks/build_as_code_check.py b/src/macaron/slsa_analyzer/checks/build_as_code_check.py index fd1260474..bf3693a78 100644 --- a/src/macaron/slsa_analyzer/checks/build_as_code_check.py +++ b/src/macaron/slsa_analyzer/checks/build_as_code_check.py @@ -5,27 +5,26 @@ import logging import os -from typing import cast from sqlalchemy import ForeignKey from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.analysis import get_build_tool_commands, get_containing_github_job +from macaron.code_analyzer.dataflow_analysis.core import traverse_bfs +from macaron.code_analyzer.dataflow_analysis.github import ( + GitHubActionsActionStepNode, + GitHubActionsReusableWorkflowCallNode, + GitHubActionsRunStepNode, +) from macaron.database.table_definitions import CheckFacts from macaron.errors import CallGraphError, ProvenanceError -from macaron.parsers.bashparser import BashNode -from macaron.parsers.github_workflow_model import ActionStep from macaron.provenance.provenance_extractor import ProvenancePredicate from macaron.slsa_analyzer.analyze_context import AnalyzeContext, store_inferred_build_info_results from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService, NoneCIService from macaron.slsa_analyzer.ci_service.circleci import CircleCI -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, -) from macaron.slsa_analyzer.ci_service.gitlab_ci import GitLabCI from macaron.slsa_analyzer.ci_service.travis import Travis from macaron.slsa_analyzer.registry import registry @@ -147,95 +146,94 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: if isinstance(ci_service, NoneCIService): continue + callgraph = ci_info["callgraph"] + trusted_deploy_actions = tool.ci_deploy_kws["github_actions"] or [] # Check for use of a trusted GitHub Actions workflow to publish/deploy. # TODO: verify that deployment is legitimate and not a test if trusted_deploy_actions: - for callee in ci_info["callgraph"].bfs(): - if isinstance(callee, GitHubWorkflowNode) and callee.node_type in [ - GitHubWorkflowType.EXTERNAL, - GitHubWorkflowType.REUSABLE, - ]: - workflow_name = callee.name.split("@")[0] - - if not workflow_name: - logger.debug("Workflow %s is not relevant. Skipping...", callee.name) - continue - if workflow_name in trusted_deploy_actions: - job_id = None - step_id = None - step_name = None - caller_path = "" - job = callee.caller - - # We always expect the caller of the node that calls a third-party - # or Reusable GitHub Action to be a GitHubJobNode. - if not isinstance(job, GitHubJobNode): - continue - - job_id = job.parsed_obj.id - caller_path = job.source_path - - # Only third-party Actions can be called from a step. - # Reusable workflows have to be directly called from the job. 
- # See https://docs.github.com/en/actions/sharing-automations/ \ - # reusing-workflows#calling-a-reusable-workflow - if callee.node_type == GitHubWorkflowType.EXTERNAL: - callee_step_obj = cast(ActionStep, callee.parsed_obj) - if "id" in callee_step_obj: - step_id = callee_step_obj["id"] - if "name" in callee_step_obj: - step_name = callee_step_obj["name"] - - trigger_link = ci_service.api_client.get_file_link( - ctx.component.repository.full_name, - ctx.component.repository.commit_sha, - file_path=( - ci_service.api_client.get_relative_path_of_workflow( - os.path.basename(caller_path) - ) - if caller_path - else "" - ), - ) + for root in ci_info["callgraph"].root_nodes: + for callee in traverse_bfs(root): + if isinstance(callee, (GitHubActionsReusableWorkflowCallNode, GitHubActionsActionStepNode)): + workflow_name = callee.uses_name + + if workflow_name in trusted_deploy_actions: + job_id = None + step_id = None + step_name = None + caller_path = "" + job = ( + get_containing_github_job(callee, callgraph.parents) + if isinstance(callee, GitHubActionsActionStepNode) + else callee + ) - trusted_workflow_confidence = tool.infer_confidence_deploy_workflow( - ci_path=caller_path, provenance_workflow=prov_workflow - ) - # Store or update the inferred build information if the confidence - # for the current check fact is bigger than the maximum score. - if ( - not result_tables - or trusted_workflow_confidence - > max(result_tables, key=lambda item: item.confidence).confidence - ): - store_inferred_build_info_results( - ctx=ctx, - ci_info=ci_info, - ci_service=ci_service, - trigger_link=trigger_link, - job_id=job_id, - step_id=step_id, - step_name=step_name, - callee_node_type=callee.node_type.value, + if not job: + continue + + job_id = job.job_id + caller_path = job.context.ref.workflow_context.ref.source_filepath + + # Only third-party Actions can be called from a step. + # Reusable workflows have to be directly called from the job. + # See https://docs.github.com/en/actions/sharing-automations/ \ + # reusing-workflows#calling-a-reusable-workflow + if isinstance(callee, GitHubActionsActionStepNode): + callee_node_type = "external" + if "id" in callee.definition: + step_id = callee.definition["id"] + if "name" in callee.definition: + step_name = callee.definition["name"] + else: + callee_node_type = "reusable" + + trigger_link = ci_service.api_client.get_file_link( + ctx.component.repository.full_name, + ctx.component.repository.commit_sha, + file_path=( + ci_service.api_client.get_relative_path_of_workflow( + os.path.basename(caller_path) + ) + if caller_path + else "" + ), ) - result_tables.append( - BuildAsCodeFacts( - build_tool_name=tool.name, - ci_service_name=ci_service.name, - build_trigger=trigger_link, - language=tool.language.value, - deploy_command=workflow_name, - confidence=trusted_workflow_confidence, + + trusted_workflow_confidence = tool.infer_confidence_deploy_workflow( + ci_path=caller_path, provenance_workflow=prov_workflow ) - ) - overall_res = CheckResultType.PASSED - try: - for build_command in ci_service.get_build_tool_commands( - callgraph=ci_info["callgraph"], build_tool=tool - ): + # Store or update the inferred build information if the confidence + # for the current check fact is bigger than the maximum score. 
+ if ( + not result_tables + or trusted_workflow_confidence + > max(result_tables, key=lambda item: item.confidence).confidence + ): + store_inferred_build_info_results( + ctx=ctx, + ci_info=ci_info, + ci_service=ci_service, + trigger_link=trigger_link, + job_id=job_id, + step_id=step_id, + step_name=step_name, + callee_node_type=callee_node_type, + ) + result_tables.append( + BuildAsCodeFacts( + build_tool_name=tool.name, + ci_service_name=ci_service.name, + build_trigger=trigger_link, + language=tool.language.value, + deploy_command=workflow_name, + confidence=trusted_workflow_confidence, + ) + ) + overall_res = CheckResultType.PASSED + try: + for build_command in get_build_tool_commands(nodes=callgraph, build_tool=tool): # Yes or no with a confidence score. result, confidence = tool.is_deploy_command( build_command, @@ -256,23 +254,27 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: not result_tables or confidence > max(result_tables, key=lambda item: item.confidence).confidence ): + job_id = None + step_id = None + step_name = None + step_node = build_command["step_node"] + if step_node: + job_node = get_containing_github_job(step_node, callgraph.parents) + if job_node is not None: + job_id = job_node.job_id + + if isinstance(step_node, GitHubActionsRunStepNode): + step_id = step_node.definition.get("id") + step_name = step_node.definition.get("name") + store_inferred_build_info_results( ctx=ctx, ci_info=ci_info, ci_service=ci_service, trigger_link=trigger_link, - job_id=( - build_command["step_node"].caller.name - if build_command["step_node"] - and isinstance(build_command["step_node"].caller, GitHubJobNode) - else None - ), - step_id=build_command["step_node"].node_id if build_command["step_node"] else None, - step_name=( - build_command["step_node"].name - if isinstance(build_command["step_node"], BashNode) - else None - ), + job_id=job_id, + step_id=step_id, + step_name=step_name, ) result_tables.append( BuildAsCodeFacts( diff --git a/src/macaron/slsa_analyzer/checks/build_script_check.py b/src/macaron/slsa_analyzer/checks/build_script_check.py index ccd61cca1..76374eed1 100644 --- a/src/macaron/slsa_analyzer/checks/build_script_check.py +++ b/src/macaron/slsa_analyzer/checks/build_script_check.py @@ -10,6 +10,7 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.analysis import get_build_tool_commands from macaron.database.table_definitions import CheckFacts from macaron.errors import CallGraphError from macaron.slsa_analyzer.analyze_context import AnalyzeContext @@ -114,9 +115,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: if isinstance(ci_service, NoneCIService): continue try: - for build_command in ci_service.get_build_tool_commands( - callgraph=ci_info["callgraph"], build_tool=tool - ): + for build_command in get_build_tool_commands(ci_info["callgraph"], tool): trigger_link = ci_service.api_client.get_file_link( ctx.component.repository.full_name, ctx.component.repository.commit_sha, diff --git a/src/macaron/slsa_analyzer/checks/build_service_check.py b/src/macaron/slsa_analyzer/checks/build_service_check.py index cea689a7c..f2439d55a 100644 --- a/src/macaron/slsa_analyzer/checks/build_service_check.py +++ b/src/macaron/slsa_analyzer/checks/build_service_check.py @@ -10,6 +10,7 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.analysis import get_build_tool_commands 
from macaron.database.table_definitions import CheckFacts from macaron.errors import CallGraphError from macaron.slsa_analyzer.analyze_context import AnalyzeContext, store_inferred_build_info_results @@ -118,9 +119,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: continue try: - for build_command in ci_service.get_build_tool_commands( - callgraph=ci_info["callgraph"], build_tool=tool - ): + for build_command in get_build_tool_commands(nodes=ci_info["callgraph"], build_tool=tool): # Yes or no with a confidence score. result, confidence = tool.is_package_command( build_command, ci_service.get_third_party_configurations() diff --git a/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py b/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py index 967946bf1..48c6d445e 100644 --- a/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py +++ b/src/macaron/slsa_analyzer/checks/github_actions_vulnerability_check.py @@ -9,6 +9,12 @@ from sqlalchemy import ForeignKey, String from sqlalchemy.orm import Mapped, mapped_column +from macaron.code_analyzer.dataflow_analysis.analysis import get_containing_github_job +from macaron.code_analyzer.dataflow_analysis.core import traverse_bfs +from macaron.code_analyzer.dataflow_analysis.github import ( + GitHubActionsActionStepNode, + GitHubActionsReusableWorkflowCallNode, +) from macaron.database.db_custom_types import DBJsonList from macaron.database.table_definitions import CheckFacts from macaron.errors import APIAccessError @@ -16,7 +22,6 @@ from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.checks.base_check import BaseCheck, CheckResultType from macaron.slsa_analyzer.checks.check_result import CheckResultData, Confidence, JustificationType -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import GitHubWorkflowNode, GitHubWorkflowType from macaron.slsa_analyzer.package_registry.osv_dev import OSVDevService from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.slsa_req import ReqName @@ -87,47 +92,47 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: external_workflows: dict[str, list] = {} for ci_info in ci_services: - for callee in ci_info["callgraph"].bfs(): - if isinstance(callee, GitHubWorkflowNode) and callee.node_type in { - GitHubWorkflowType.EXTERNAL, - GitHubWorkflowType.REUSABLE, - }: - workflow_name = workflow_version = "" - if "@" in callee.name: - workflow_name, workflow_version = callee.name.split("@") - else: - # Most likely we have encountered an internal reusable workflow, which - # can be skipped. - logger.debug("GitHub Actions workflow %s misses a version. Skipping...", callee.name) - continue - - caller_path = callee.caller.source_path if callee.caller else None - - # Skip the workflow if `workflow_name` or `workflow_version` are missing, - # or if `callee.name` lacks an '@' which can indicate an internal workflow - # within the same repo . - if not workflow_name or not workflow_version: - logger.debug("Workflow %s is not relevant. 
Skipping...", callee.name) - continue - - ext_workflow: list = external_workflows.get(workflow_name, []) - ext_workflow.append( - { - "version": workflow_version, - "caller_path": ci_info["service"].api_client.get_file_link( - ctx.component.repository.full_name, - ctx.component.repository.commit_sha, - file_path=( - ci_info["service"].api_client.get_relative_path_of_workflow( - os.path.basename(caller_path) - ) - if caller_path - else "" + callgraph = ci_info["callgraph"] + for root in callgraph.root_nodes: + for callee in traverse_bfs(root): + if isinstance(callee, (GitHubActionsReusableWorkflowCallNode, GitHubActionsActionStepNode)): + workflow_name = callee.uses_name + workflow_version = callee.uses_version + if workflow_version is None: + # Most likely we have encountered an internal reusable workflow, which + # can be skipped. + logger.debug("GitHub Actions workflow %s misses a version. Skipping...", workflow_name) + continue + + job = ( + get_containing_github_job(callee, callgraph.parents) + if isinstance(callee, GitHubActionsActionStepNode) + else callee + ) + + if not job: + continue + + caller_path = job.context.ref.workflow_context.ref.source_filepath + + ext_workflow: list = external_workflows.get(workflow_name, []) + ext_workflow.append( + { + "version": workflow_version, + "caller_path": ci_info["service"].api_client.get_file_link( + ctx.component.repository.full_name, + ctx.component.repository.commit_sha, + file_path=( + ci_info["service"].api_client.get_relative_path_of_workflow( + os.path.basename(caller_path) + ) + if caller_path + else "" + ), ), - ), - } - ) - external_workflows[workflow_name] = ext_workflow + } + ) + external_workflows[workflow_name] = ext_workflow # If no external GitHub Actions are found, return passed result. if not external_workflows: diff --git a/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py b/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py index e9f629447..f6ef41014 100644 --- a/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py +++ b/src/macaron/slsa_analyzer/checks/trusted_builder_l3_check.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
@@ -11,16 +11,16 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.sql.sqltypes import String +from macaron.code_analyzer.dataflow_analysis.core import traverse_bfs +from macaron.code_analyzer.dataflow_analysis.github import ( + GitHubActionsActionStepNode, + GitHubActionsReusableWorkflowCallNode, +) from macaron.config.defaults import defaults from macaron.database.table_definitions import CheckFacts from macaron.slsa_analyzer.analyze_context import AnalyzeContext, store_inferred_build_info_results from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.slsa_req import ReqName @@ -114,37 +114,36 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: trusted_builders = defaults.get_list("ci.github_actions", "trusted_builders", fallback=[]) # Look for trusted builders called as GitHub Actions. - for callee in ci_info["callgraph"].bfs(): - if isinstance(callee, GitHubWorkflowNode): - workflow_name = callee.name.split("@")[0] - - # Check if the action is called as a third-party or reusable workflow. - if not workflow_name or callee.node_type not in [ - GitHubWorkflowType.EXTERNAL, - GitHubWorkflowType.REUSABLE, - ]: - logger.debug("Workflow %s is not relevant. Skipping...", callee.name) - continue - if workflow_name in trusted_builders: - caller_path = callee.caller.source_path if isinstance(callee.caller, GitHubJobNode) else "" - caller_link = ci_service.api_client.get_file_link( - ctx.component.repository.full_name, - ctx.component.repository.commit_sha, - ci_service.api_client.get_relative_path_of_workflow(os.path.basename(caller_path)), - ) - - store_inferred_build_info_results( - ctx=ctx, ci_info=ci_info, ci_service=ci_service, trigger_link=caller_link - ) - - found_builder = True - result_values.append( - { - "build_tool_name": callee.name, - "build_trigger": caller_link, - "ci_service_name": ci_service.name, - } - ) + for root in ci_info["callgraph"].root_nodes: + for callee in traverse_bfs(root): + if isinstance(callee, (GitHubActionsReusableWorkflowCallNode, GitHubActionsActionStepNode)): + + workflow_name = callee.uses_name + + if workflow_name in trusted_builders: + if isinstance(callee, GitHubActionsReusableWorkflowCallNode): + caller_path = callee.context.ref.workflow_context.ref.source_filepath + else: + caller_path = callee.context.ref.job_context.ref.workflow_context.ref.source_filepath + + caller_link = ci_service.api_client.get_file_link( + ctx.component.repository.full_name, + ctx.component.repository.commit_sha, + ci_service.api_client.get_relative_path_of_workflow(os.path.basename(caller_path)), + ) + + store_inferred_build_info_results( + ctx=ctx, ci_info=ci_info, ci_service=ci_service, trigger_link=caller_link + ) + + found_builder = True + result_values.append( + { + "build_tool_name": workflow_name, + "build_trigger": caller_link, + "ci_service_name": ci_service.name, + } + ) result_tables = [TrustedBuilderFacts(**result, confidence=Confidence.HIGH) for result in result_values] diff --git a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py b/src/macaron/slsa_analyzer/ci_service/base_ci_service.py index 
adaa3ce95..9df7e8e70 100644 --- a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py +++ b/src/macaron/slsa_analyzer/ci_service/base_ci_service.py @@ -3,15 +3,14 @@ """This module contains the BaseCIService class to be inherited by a CI service.""" +from __future__ import annotations + import logging import os from abc import abstractmethod -from collections.abc import Iterable from datetime import datetime -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.errors import CallGraphError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool, BuildToolCommand +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.git_service.api_client import BaseAPIClient from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService @@ -92,7 +91,7 @@ def is_detected( return exists @abstractmethod - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -104,7 +103,7 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ raise NotImplementedError @@ -245,31 +244,6 @@ def workflow_run_deleted(self, timestamp: datetime) -> bool: """ return False - def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """ - Traverse the callgraph and find all the reachable build tool commands. - - Parameters - ---------- - callgraph: CallGraph - The callgraph reachable from the CI workflows. - build_tool: BaseBuildTool - The corresponding build tool for which shell commands need to be detected. - - Yields - ------ - BuildToolCommand - The object that contains the build command as well useful contextual information. - - Raises - ------ - CallGraphError - Error raised when an error occurs while traversing the callgraph. - """ - # By default we assume that there is no callgraph available for a CI service. - # Each CI service should override this method if a callgraph is generated for it. - raise CallGraphError("There is no callgraph for this CI service.") - def get_third_party_configurations(self) -> list[str]: """Get the list of third-party CI configuration files. @@ -309,7 +283,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -321,33 +295,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ - return CallGraph(BaseNode(), "") - - def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """ - Traverse the callgraph and find all the reachable build tool commands. - - Parameters - ---------- - callgraph: CallGraph - The callgraph reachable from the CI workflows. - build_tool: BaseBuildTool - The corresponding build tool for which shell commands need to be detected. - - Yields - ------ - BuildToolCommand - The object that contains the build command as well useful contextual information. 
- - Raises - ------ - CallGraphError - Error raised when an error occurs while traversing the callgraph. - """ - raise CallGraphError("There is no callgraph for this CI service.") + return NodeForest([]) def has_latest_run_passed( self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str diff --git a/src/macaron/slsa_analyzer/ci_service/circleci.py b/src/macaron/slsa_analyzer/ci_service/circleci.py index 1ac05bd86..72a838218 100644 --- a/src/macaron/slsa_analyzer/ci_service/circleci.py +++ b/src/macaron/slsa_analyzer/ci_service/circleci.py @@ -1,10 +1,11 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module analyze Circle CI.""" +from __future__ import annotations -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import defaults from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService @@ -42,7 +43,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -54,10 +55,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ - return CallGraph(BaseNode(), "") + return NodeForest([]) def has_latest_run_passed( self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str diff --git a/src/macaron/slsa_analyzer/ci_service/github_actions/analyzer.py b/src/macaron/slsa_analyzer/ci_service/github_actions/analyzer.py deleted file mode 100644 index 3c234d755..000000000 --- a/src/macaron/slsa_analyzer/ci_service/github_actions/analyzer.py +++ /dev/null @@ -1,801 +0,0 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
- -"""This module provides the intermediate representations and analysis functions for GitHub Actions.""" - -import logging -import os -import re -from collections.abc import Iterable -from dataclasses import dataclass -from enum import Enum -from typing import Any, TypeGuard, cast - -from macaron.code_analyzer.call_graph import BaseNode -from macaron.config.global_config import global_config -from macaron.errors import CallGraphError, GitHubActionsValueError, ParseError -from macaron.parsers.actionparser import get_step_input -from macaron.parsers.actionparser import parse as parse_action -from macaron.parsers.bashparser import BashNode, BashScriptType, create_bash_node -from macaron.parsers.github_workflow_model import ( - ActionStep, - Identified, - Job, - NormalJob, - ReusableWorkflowCallJob, - Step, - Workflow, - is_action_step, - is_normal_job, - is_reusable_workflow_call_job, -) -from macaron.slsa_analyzer.build_tool.language import BuildLanguage, Language - -logger: logging.Logger = logging.getLogger(__name__) - - -@dataclass(frozen=True) -class ThirdPartyAction: - """The representation for a third-party GitHub Action.""" - - #: The name of the GitHub Action. - action_name: str - - #: The version of the GitHub Action. - action_version: str | None - - -class GitHubWorkflowType(str, Enum): - """This class represents different GitHub Actions workflow types.""" - - INTERNAL = "internal" # Workflows declared in the repo. - EXTERNAL = "external" # Third-party workflows. - REUSABLE = "reusable" # Reusable workflows. - - -class GitHubWorkflowNode(BaseNode): - """This class represents a callgraph node for GitHub Actions workflows.""" - - def __init__( - self, - name: str, - node_type: GitHubWorkflowType, - source_path: str, - parsed_obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep, - model: ThirdPartyAction | None = None, - **kwargs: Any, - ) -> None: - """Initialize instance. - - Parameters - ---------- - name : str - Name of the workflow (or URL for reusable and external workflows). - node_type : GitHubWorkflowType - The type of workflow. - source_path : str - The path of the workflow. - parsed_obj : Workflow | Identified[ReusableWorkflowCallJob] | ActionStep - The parsed Actions workflow object. Actual type must correspond to node type. - (INTERNAL -> Workflow, REUSABLE -> Identified[ReusableWorkflowCallJob], EXTERNAL -> ActionStep) - caller: BaseNode | None - The caller node. - model: ThirdPartyAction | None - The static analysis abstraction for the third-party GitHub Action. - """ - super().__init__(**kwargs) - self.name = name - self.node_type: GitHubWorkflowType = node_type - self.source_path = source_path - self.parsed_obj = parsed_obj - self.model = model - - def __str__(self) -> str: - return f"GitHubWorkflowNode({self.name},{self.node_type})" - - -class GitHubJobNode(BaseNode): - """This class represents a callgraph node for GitHub Actions jobs.""" - - def __init__(self, name: str, source_path: str, parsed_obj: Identified[Job], **kwargs: Any) -> None: - """Initialize instance. - - Parameters - ---------- - name : str - Name of the workflow (or URL for reusable and external workflows). - source_path : str - The path of the workflow. - parsed_obj : Identified[Job] - The parsed Actions workflow object. - caller: BaseNode - The caller node. 
- """ - super().__init__(**kwargs) - self.name = name - self.source_path = source_path - self.parsed_obj = parsed_obj - - def __str__(self) -> str: - return f"GitHubJobNode({self.name})" - - -def is_parsed_obj_workflow( - parsed_obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep, -) -> TypeGuard[Workflow]: - """Type guard for Workflow parsed_obj.""" - return not isinstance(parsed_obj, Identified) and "jobs" in parsed_obj - - -def is_parsed_obj_reusable_workflow_call_job( - obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep, -) -> TypeGuard[Identified[ReusableWorkflowCallJob]]: - """Type guard for ReusableWorkflowCallJob parsed_obj.""" - return isinstance(obj, Identified) - - -def is_parsed_obj_action_step( - parsed_obj: Workflow | Identified[ReusableWorkflowCallJob] | ActionStep, -) -> TypeGuard[ActionStep]: - """Type guard for ActionStep parsed_obj.""" - return not isinstance(parsed_obj, Identified) and "uses" in parsed_obj - - -def find_expression_variables(value: str, exp_var: str) -> Iterable[str]: - """Find all the matching GitHub Actions expression variables in a string value. - - GitHub Actions Expression syntax: ${{ }} - See https://docs.github.com/en/actions/learn-github-actions/expressions#about-expressions - - Parameters - ---------- - value: str - The value in which the expression values are searched. - exp_var: str - The expression variable name. - - Yields - ------ - Iterable[str] - The expression variable names. - - Examples - -------- - >>> list(find_expression_variables("echo ${{ inputs.foo }}", "inputs")) - ['foo'] - >>> list(find_expression_variables("echo ${{ inputs.foo }} ${{ inputs.bar }}", "inputs")) - ['foo', 'bar'] - >>> list(find_expression_variables("echo ${{ inputs.foo }} ${{ inputs.bar }}", "matric")) - [] - """ - expressions = re.findall(r"\$\{\{.*?\}\}", value) - pattern = r"\$\{\{\s+" + exp_var + r"\.(?P(.*?))\s+\}\}" - for exp in expressions: - match = re.match(pattern, exp) - if match: - yield match.group("variable") - - -def resolve_matrix_variable(job_node: GitHubJobNode, var: str) -> Iterable[str]: - """Resolve the value of a GitHub Actions matrix variable. - - For the specification of matrix variables in GitHub Actions see: - https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs - - Parameters - ---------- - job_node: GitHubJobNode - The target GitHub Actions job. - var: str - The matrix variable that needs to be resolved. - - Yields - ------ - str - The possible values of the matrix variable. - - Raises - ------ - GitHubActionsValueError - When the matrix variable cannot be found. - """ - job_obj = job_node.parsed_obj.obj - if "strategy" not in job_obj: - raise GitHubActionsValueError(f"Unable to find `strategy` in {job_node.source_path} GitHub Action.") - if "matrix" not in job_obj["strategy"]: - raise GitHubActionsValueError(f"Unable to find `matrix` in {job_node.source_path} GitHub Action.") - matrix = job_obj["strategy"]["matrix"] - if not isinstance(matrix, dict): - raise GitHubActionsValueError(f"Unable to resolve matrix in {job_node.source_path} GitHub Action.") - - matrix_vals = matrix.get(var) - if matrix_vals is None: - raise GitHubActionsValueError(f"Unable to find variable {var} in {job_node.source_path} GitHub Action.") - - if isinstance(matrix_vals, list): - for val in matrix_vals: - # TODO: type of val permits dict/list, how to handle it? Just return Configuration instead of str - # and let the caller handle it? 
- if isinstance(val, str): - yield val - if isinstance(val, int): - yield str(val) - if isinstance(val, float): - yield str(val) - if isinstance(val, bool): - yield "true" if val else "false" - else: - raise GitHubActionsValueError(f"Unable to resolve matrix in {job_node.source_path} GitHub Action.") - - -def is_expression(value: str) -> bool: - """Determine if a value is a GitHub Actions expression. - - Parameters - ---------- - value: str - The input value. - - Returns - ------- - bool - True if the input value is a GitHub Actions expression. - - Examples - -------- - >>> is_expression("${{ foo }}") - True - >>> is_expression("${{ foo }") - False - >>> is_expression("${ foo }") - False - """ - return re.match(r"\$\{\{.*?\}\}", value) is not None - - -def find_language_setup_action(job_node: GitHubJobNode, lang_name: BuildLanguage) -> Language | None: - """Find the step that calls a language setup GitHub Actions and return the model. - - Parameters - ---------- - job_node: GitHubJobNode - The target GitHub Actions job node. - lang_name: BuildLanguage - The target language used in the build. - - Returns - ------- - Language | None - The language model for the language setup GitHub Action or None. - """ - for callee in job_node.callee: - model = callee.model - # Check if the model implements the Language protocol. - if isinstance(model, Language): - if model.lang_name == lang_name: - return model - return None - - -def build_call_graph_from_node(node: GitHubWorkflowNode, repo_path: str) -> None: - """Analyze the GitHub Actions node to build the call graph. - - Parameters - ---------- - node : GitHubWorkflowNode - The node for a single GitHub Actions workflow. - repo_path: str - The file system path to the repo. - """ - if not is_parsed_obj_workflow(node.parsed_obj): - return - jobs = node.parsed_obj["jobs"] - for job_name, job in jobs.items(): - job_with_id = Identified[Job](job_name, job) - job_node = GitHubJobNode(name=job_name, source_path=node.source_path, parsed_obj=job_with_id, caller=node) - node.add_callee(job_node) - - if is_normal_job(job): - # Add third-party workflows. - steps = job.get("steps") - if steps is None: - continue - for step in steps: - if is_action_step(step): - # TODO: change source_path for external workflows. - action_name = step["uses"] - external_node = GitHubWorkflowNode( - name=action_name, - node_type=GitHubWorkflowType.EXTERNAL, - source_path="", - parsed_obj=step, - caller=job_node, - ) - external_node.model = create_third_party_action_model(external_node) - job_node.add_callee(external_node) - - # Check the shell type configuration. We currently can support `bash`` and `sh`. - # By default `bash`` is used on non-Windows runners, which we support. - # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#defaultsrunshell - # TODO: support Powershell for Windows runners, which is the default shell in GitHub Actions. - # Right now, the script with the default shell is passed to the parser, which will fail - # if the runner is Windows and Powershell is used. But there is no easy way to avoid passing - # the script because that means we need to accurately determine the runner's OS. 
- elif step.get("run") and ("shell" not in step or step["shell"] in {"bash", "sh"}): - try: - name = "UNKNOWN" - node_id = None - if "id" in step: - node_id = step["id"] - if "name" in step: - name = step["name"] - - callee = create_bash_node( - name=name, - node_id=node_id, - node_type=BashScriptType.INLINE, - source_path=node.source_path, - ci_step_ast=step, - repo_path=repo_path, - caller=job_node, - recursion_depth=0, - ) - except CallGraphError as error: - logger.debug(error) - continue - job_node.add_callee(callee) - - elif is_reusable_workflow_call_job(job): - workflow_call_job_with_id = Identified[ReusableWorkflowCallJob](job_name, job) - # Add reusable workflows. - logger.debug("Found reusable workflow: %s.", job["uses"]) - # TODO: change source_path for reusable workflows. - reusable_node = GitHubWorkflowNode( - name=job["uses"], - node_type=GitHubWorkflowType.REUSABLE, - source_path="", - parsed_obj=workflow_call_job_with_id, - caller=job_node, - ) - reusable_node.model = create_third_party_action_model(reusable_node) - job_node.add_callee(reusable_node) - - -def build_call_graph_from_path(root: BaseNode, workflow_path: str, repo_path: str, macaron_path: str = "") -> BaseNode: - """Build the call Graph for GitHub Actions workflows. - - At the moment it does not analyze third-party workflows to include their callees. - - Parameters - ---------- - root : BaseNode - The root call graph node. - workflow_path: str - The path to the CI workflow file. - repo_path: str - The path to the target repository. - macaron_path: str - Macaron's root path (optional). - - Returns - ------- - BaseNode - The callgraph node for the GitHub Actions workflow. - - Raises - ------ - ParseError - When parsing the workflow fails with error. - """ - if not macaron_path: - macaron_path = global_config.macaron_path - - # Parse GitHub Actions workflows. - logger.debug( - "Parsing %s", - workflow_path, - ) - try: - parsed_obj: Workflow = parse_action(workflow_path) - except ParseError as error: - logger.debug("Unable to parse GitHub Actions at the target %s: %s", repo_path, error) - raise ParseError from error - - # Add internal workflows. - workflow_name = os.path.basename(workflow_path) - workflow_node = GitHubWorkflowNode( - name=workflow_name, - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - build_call_graph_from_node(workflow_node, repo_path=repo_path) - - return workflow_node - - -def get_reachable_secrets(step_node: BashNode) -> Iterable[str]: - """Get reachable secrets to a GitHub Actions step. - - Parameters - ---------- - step_node: BashNode - The target GitHub Action step node. - - Yields - ------ - str - The reachable secret variable name. - """ - job_node = step_node.caller - if not isinstance(job_node, GitHubJobNode): - return - - def _find_secret_keys(ast: NormalJob | ReusableWorkflowCallJob | Step | None) -> Iterable[str]: - if ast is None: - return - if "uses" in ast: - return - normal_job = cast(NormalJob, ast) - if "env" in normal_job: - env = normal_job["env"] - if isinstance(env, dict): - for key, val in env.items(): - if isinstance(val, str): - if list(find_expression_variables(value=val, exp_var="secrets")): - yield key - - # Get reachable secrets set as environment variables in the job. - yield from _find_secret_keys(job_node.parsed_obj.obj) - - # Get reachable secrets set as environment variables in the step. 
- if step_node.node_type == BashScriptType.INLINE: - yield from _find_secret_keys(step_node.parsed_step_obj) - - -def get_ci_events(workflow_node: GitHubWorkflowNode) -> list[str] | None: - """Get the CI events that trigger the GitHub Action workflow. - - Parameters - ---------- - workflow_node: GitHubWorkflowNode - The target GitHub Action workflow node. - - Returns - ------- - list[str] | None - The list of event names or None. - """ - result: list[str] = [] - ast = workflow_node.parsed_obj - if not isinstance(ast, dict) or "on" not in ast: - raise GitHubActionsValueError(f"Unable to find `on` event in {workflow_node.source_path} GitHub Action.") - - on = cast(Workflow, ast)["on"] - - if isinstance(on, str): - result.append(on) - elif isinstance(on, list): - for hook in on: - result.append(hook) - else: - for key in on: - result.append(key) - - return result - - -class SetupJava(Language, ThirdPartyAction): - """This class models the official setup-java GitHub Action from GitHub. - - For the table of supported distributions see: - https://github.com/actions/setup-java?tab=readme-ov-file#supported-distributions - """ - - #: Name of the GitHub Action. - action_name = "actions/setup-java" - - #: Version of the GitHub Action. - action_version: None - - def __init__(self, external_node: GitHubWorkflowNode): - """Initialize the setup-java GitHub Action model. - - Parameters - ---------- - external_node: GitHubWorkflowNode - The external GitHub Action workflow node. - """ - # external_node is assumed to be an EXTERNAL node with ActionStep parsed_obj - step = external_node.parsed_obj - if not is_parsed_obj_action_step(step): - raise ValueError("Expected an action step node") - self._lang_name = BuildLanguage.JAVA - self._lang_distributions = None - self._lang_versions = None - self._lang_url = "https://github.com/actions/setup-java" - lang_distribution_exp = None - lang_version_exp = None - if distribution := get_step_input(step, key="distribution"): - if not is_expression(distribution): - self._lang_distributions = [distribution] - else: - lang_distribution_exp = distribution - if java_version := get_step_input(step, key="java-version"): - if not is_expression(java_version): - self._lang_versions = [java_version] - else: - lang_version_exp = java_version - # Handle matrix values. - matrix_values = {} - if lang_distribution_exp and "matrix." in lang_distribution_exp: - matrix_values["lang_distribution_var"] = find_expression_variables( - value=lang_distribution_exp, exp_var="matrix" - ) - if lang_version_exp and "matrix." 
in lang_version_exp: - matrix_values["lang_version_var"] = find_expression_variables(value=lang_version_exp, exp_var="matrix") - - if matrix_values: - job_node = external_node.caller - if job_node is None: - logger.debug("Unable to find the caller GitHub Action job for step %s.", external_node.name) - return - try: - if (variables := matrix_values.get("lang_distribution_var")) is not None: - values: list[str] = [] - for var in variables: - values.extend(resolve_matrix_variable(job_node, var)) - if values: - self._lang_distributions = values - except GitHubActionsValueError as error: - logger.debug(error) - - try: - if (variables := matrix_values.get("lang_version_var")) is not None: - values = [] - for var in variables: - values.extend(resolve_matrix_variable(job_node, var)) - if values: - self._lang_versions = values - except GitHubActionsValueError as error: - logger.debug(error) - - @property - def lang_name(self) -> str: - """Get the name of the language.""" - return self._lang_name - - @property - def lang_versions(self) -> list[str] | None: - """Get the possible version of the language.""" - return self._lang_versions - - @property - def lang_distributions(self) -> list[str] | None: - """Get the possible distributions of the language.""" - return self._lang_distributions - - @property - def lang_url(self) -> str | None: - """Get the URL that provides information about the language distributions and versions.""" - return self._lang_url - - -class OracleSetupJava(Language, ThirdPartyAction): - """This class models the Oracle setup-java GitHub Action. - - For the table of supported distributions see: - # https://github.com/oracle-actions/setup-java?tab=readme-ov-file#input-overview - """ - - #: Name of the GitHub Action. - action_name = "oracle-actions/setup-java" - - #: Version of the GitHub Action. - action_version: None - - def __init__(self, external_node: GitHubWorkflowNode): - """Initialize the Oracle setup-java GitHub Action model. - - Parameters - ---------- - external_node: GitHubWorkflowNode - The external GitHub Action workflow node. - """ - # external_node is assumed to be an EXTERNAL node with ActionStep parsed_obj - step = external_node.parsed_obj - if not is_parsed_obj_action_step(step): - raise ValueError("Expected an action step node") - self._lang_name = BuildLanguage.JAVA - self._lang_distributions = None - self._lang_versions = None - self._lang_url = "https://github.com/oracle-actions/setup-java" - lang_distribution_exp = None - lang_version_exp = None - if website := get_step_input(step, key="website"): - if not is_expression(website): - self._lang_distributions = [website] - else: - lang_distribution_exp = website - if java_release := get_step_input(step, key="release"): - if not is_expression(java_release): - self._lang_versions = [java_release] - else: - lang_version_exp = java_release - # Handle matrix values. - matrix_values = {} - if lang_distribution_exp and "matrix." in lang_distribution_exp: - matrix_values["lang_distribution_var"] = find_expression_variables( - value=lang_distribution_exp, exp_var="matrix" - ) - if lang_version_exp and "matrix." 
in lang_version_exp: - matrix_values["lang_version_var"] = find_expression_variables(value=lang_version_exp, exp_var="matrix") - - if matrix_values: - job_node = external_node.caller - if job_node is None: - logger.debug("Unable to find the caller GitHub Action job for step %s.", external_node.name) - return - try: - if (variables := matrix_values.get("lang_distribution_var")) is not None: - values: list[str] = [] - for var in variables: - values.extend(resolve_matrix_variable(job_node, var)) - if values: - self._lang_distributions = values - except GitHubActionsValueError as error: - logger.debug(error) - - try: - if (variables := matrix_values.get("lang_version_var")) is not None: - values = [] - for var in variables: - values.extend(resolve_matrix_variable(job_node, var)) - if values: - self._lang_versions = values - except GitHubActionsValueError as error: - logger.debug(error) - - @property - def lang_name(self) -> str: - """Get the name of the language.""" - return self._lang_name - - @property - def lang_versions(self) -> list[str] | None: - """Get the possible version of the language.""" - return self._lang_versions - - @property - def lang_distributions(self) -> list[str] | None: - """Get the possible distributions of the language.""" - return self._lang_distributions - - @property - def lang_url(self) -> str | None: - """Get the URL that provides information about the language distributions and versions.""" - return self._lang_url - - -class GraalVMSetup(Language, ThirdPartyAction): - """This class models the GraalVM setup GitHub Action from GitHub. - - For the table of supported distributions see: - https://github.com/graalvm/setup-graalvm - """ - - #: Name of the GitHub Action. - action_name = "graalvm/setup-graalvm" - - #: Version of the GitHub Action. - action_version: None - - def __init__(self, external_node: GitHubWorkflowNode): - """Initialize the setup-java GitHub Action model. - - Parameters - ---------- - external_node: GitHubWorkflowNode - The external GitHub Action workflow node. - """ - # external_node is assumed to be an EXTERNAL node with ActionStep parsed_obj - step = external_node.parsed_obj - if not is_parsed_obj_action_step(step): - raise ValueError("Expected an action step node") - self._lang_name = BuildLanguage.JAVA - self._lang_distributions = None - self._lang_versions = None - self._lang_url = "https://github.com/graalvm/setup-graalvm" - lang_distribution_exp = None - lang_version_exp = None - if distribution := get_step_input(step, key="distribution"): - if not is_expression(distribution): - self._lang_distributions = [distribution] - else: - lang_distribution_exp = distribution - if java_version := get_step_input(step, key="java-version"): - if not is_expression(java_version): - self._lang_versions = [java_version] - else: - lang_version_exp = java_version - # Handle matrix values. - matrix_values = {} - if lang_distribution_exp and "matrix." in lang_distribution_exp: - matrix_values["lang_distribution_var"] = find_expression_variables( - value=lang_distribution_exp, exp_var="matrix" - ) - if lang_version_exp and "matrix." 
in lang_version_exp: - matrix_values["lang_version_var"] = find_expression_variables(value=lang_version_exp, exp_var="matrix") - - if matrix_values: - job_node = external_node.caller - if job_node is None: - logger.debug("Unable to find the caller GitHub Action job for step %s.", external_node.name) - return - try: - if (variables := matrix_values.get("lang_distribution_var")) is not None: - values: list[str] = [] - for var in variables: - values.extend(resolve_matrix_variable(job_node, var)) - if values: - self._lang_distributions = values - except GitHubActionsValueError as error: - logger.debug(error) - - try: - if (variables := matrix_values.get("lang_version_var")) is not None: - values = [] - for var in variables: - values.extend(resolve_matrix_variable(job_node, var)) - if values: - self._lang_versions = values - except GitHubActionsValueError as error: - logger.debug(error) - - @property - def lang_name(self) -> str: - """Get the name of the language.""" - return self._lang_name - - @property - def lang_versions(self) -> list[str] | None: - """Get the possible version of the language.""" - return self._lang_versions - - @property - def lang_distributions(self) -> list[str] | None: - """Get the possible distributions of the language.""" - return self._lang_distributions - - @property - def lang_url(self) -> str | None: - """Get the URL that provides information about the language distributions and versions.""" - return self._lang_url - - -def create_third_party_action_model(external_node: GitHubWorkflowNode) -> ThirdPartyAction: - """Create an instances of third-party model object. - - Parameters - ---------- - external_node: GitHubWorkflowNode - The external GitHub Actions workflow node. - - Returns - ------- - ThirdPartyAction - An instance object for the ThirdPartyAction model. 
- """ - action_name = external_node.name - action_version = None - if "@" in external_node.name: - action_name, action_version = external_node.name.split("@", maxsplit=1) - match action_name: - case "actions/setup-java": - return SetupJava(external_node=external_node) - case "oracle-actions/setup-java": - return OracleSetupJava(external_node=external_node) - case "graalvm/setup-graalvm": - return GraalVMSetup(external_node=external_node) - return ThirdPartyAction(action_name=action_name, action_version=action_version) diff --git a/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py b/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py index c0fd6aa46..b24dc5963 100644 --- a/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py +++ b/src/macaron/slsa_analyzer/ci_service/github_actions/github_actions_ci.py @@ -3,29 +3,19 @@ """This module analyzes GitHub Actions CI.""" +from __future__ import annotations import glob import logging import os -from collections.abc import Iterable from datetime import datetime, timedelta, timezone -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import Node, NodeForest from macaron.config.defaults import defaults from macaron.config.global_config import global_config -from macaron.errors import CallGraphError, GitHubActionsValueError, ParseError -from macaron.parsers.bashparser import BashNode, BashScriptType -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool, BuildToolCommand +from macaron.errors import GitHubActionsValueError, ParseError from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_path, - find_language_setup_action, - get_ci_events, - get_reachable_secrets, -) from macaron.slsa_analyzer.git_service.api_client import GhAPIClient, get_default_gh_client from macaron.slsa_analyzer.git_service.base_git_service import BaseGitService from macaron.slsa_analyzer.git_service.github import GitHub @@ -386,7 +376,7 @@ def workflow_run_in_date_time_range( raise GitHubActionsValueError("GitHub Actions workflow run misses jobs information.") for job in run_jobs["jobs"]: # If the deploy step is a Reusable Workflow, there won't be any steps in the caller job. - if callee_node_type == GitHubWorkflowType.REUSABLE.value: + if callee_node_type == "reusable": if not job["name"].startswith(job_id) or job["conclusion"] != "success": continue started_at = datetime.fromisoformat(job["started_at"]) @@ -576,7 +566,7 @@ def has_kws_in_log(self, latest_run: dict, build_log: list) -> bool: logger.info("No build kw in log file. Continue ...") return False - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for GitHub Actions workflows. At the moment it does not analyze third-party workflows to include their callees. @@ -596,106 +586,18 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: if not macaron_path: macaron_path = global_config.macaron_path - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, repo_path) - # Parse GitHub Actions workflows. 
files = self.get_workflows(repo_path) + nodes: list[Node] = [] for workflow_path in files: try: - callee = build_call_graph_from_path( - root=root, workflow_path=workflow_path, repo_path=repo_path, macaron_path=macaron_path - ) + workflow_node = analyse_github_workflow_file(workflow_path, repo_path) + except ParseError: logger.debug("Skip adding workflow at %s to the callgraph.", workflow_path) continue - root.add_callee(callee) - return gh_cg - - def _get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """Traverse the callgraph and find all the reachable build tool commands.""" - for node in callgraph.bfs(): - # We are just interested in nodes that have bash commands. - if isinstance(node, BashNode): - # We collect useful contextual information for the called BashNode. - caller_node = node.caller - # The GitHub Actions workflow that triggers the path in the callgraph. - workflow_node = None - # The GitHub Actions job that triggers the path in the callgraph. - job_node = None - # The step in GitHub Actions job that triggers the path in the callgraph. - step_node = node if node.node_type == BashScriptType.INLINE else None - - # Walk up the callgraph to find the relevant caller nodes. - # In GitHub Actions a `GitHubWorkflowNode` may call several `GitHubJobNode`s - # and a `GitHubJobNode` may call several steps, which can be external `GitHubWorkflowNode` - # or inlined run nodes. We currently support the run steps that call shell scripts as - # `BashNode`. An inlined `BashNode` can call `BashNode` as bash files. - # TODO: revisit this implementation if analysis of external workflows is supported in - # the future, and decide if setting the caller workflow and job nodes to the nodes in the - # main triggering workflow is still expected. - while caller_node is not None: - match caller_node: - case GitHubWorkflowNode(): - workflow_node = caller_node - case GitHubJobNode(): - job_node = caller_node - case BashNode(node_type=BashScriptType.INLINE): - step_node = caller_node - - caller_node = caller_node.caller - - # Check if there was an issue in finding any of the caller nodes. - if workflow_node is None or job_node is None or step_node is None: - raise CallGraphError("Unable to traverse the call graph to find build commands.") - - # Find the bash commands that call the build tool. - for cmd in node.parsed_bash_obj.get("commands", []): - if build_tool.is_build_command(cmd): - lang_versions = lang_distributions = lang_url = None - if lang_model := find_language_setup_action(job_node, build_tool.language): - lang_versions = lang_model.lang_versions - lang_distributions = lang_model.lang_distributions - lang_url = lang_model.lang_url - yield BuildToolCommand( - ci_path=workflow_node.source_path, - command=cmd, - step_node=step_node, - language=build_tool.language, - language_versions=lang_versions, - language_distributions=lang_distributions, - language_url=lang_url, - reachable_secrets=list(get_reachable_secrets(step_node)), - events=get_ci_events(workflow_node), - ) - - def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]: - """Traverse the callgraph and find all the reachable build tool commands. - - This generator yields sorted build tool command objects to allow a deterministic behavior. - The objects are sorted based on the string representation of the build tool object. - - Parameters - ---------- - callgraph: CallGraph - The callgraph reachable from the CI workflows. 
- build_tool: BaseBuildTool - The corresponding build tool for which shell commands need to be detected. - - Yields - ------ - BuildToolCommand - The object that contains the build command as well useful contextual information. - - Raises - ------ - CallGraphError - Error raised when an error occurs while traversing the callgraph. - """ - yield from sorted( - self._get_build_tool_commands(callgraph=callgraph, build_tool=build_tool), - key=str, - ) + nodes.append(workflow_node) + return NodeForest(nodes) def get_third_party_configurations(self) -> list[str]: """Get the list of third-party CI configuration files. diff --git a/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py b/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py index cd7e3210d..ede49002f 100644 --- a/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py +++ b/src/macaron/slsa_analyzer/ci_service/gitlab_ci.py @@ -1,9 +1,11 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module analyzes GitLab CI.""" -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from __future__ import annotations + +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import defaults from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService @@ -41,7 +43,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -53,10 +55,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. 
""" - return CallGraph(BaseNode(), "") + return NodeForest([]) def has_latest_run_passed( self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str diff --git a/src/macaron/slsa_analyzer/ci_service/jenkins.py b/src/macaron/slsa_analyzer/ci_service/jenkins.py index ebef614ca..c95edb4cb 100644 --- a/src/macaron/slsa_analyzer/ci_service/jenkins.py +++ b/src/macaron/slsa_analyzer/ci_service/jenkins.py @@ -3,21 +3,17 @@ """This module analyzes Jenkins CI.""" +from __future__ import annotations + import glob import logging import os import re -from collections.abc import Iterable -from enum import Enum -from typing import Any -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_bash_script +from macaron.code_analyzer.dataflow_analysis.core import Node, NodeForest from macaron.config.defaults import defaults from macaron.config.global_config import global_config -from macaron.errors import ParseError -from macaron.parsers import bashparser -from macaron.repo_verifier.repo_verifier import BaseBuildTool -from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService logger: logging.Logger = logging.getLogger(__name__) @@ -66,7 +62,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -78,114 +74,36 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest : NodeForest The call graph built for the CI. """ if not macaron_path: macaron_path = global_config.macaron_path - root: BaseNode = BaseNode() - call_graph = CallGraph(root, repo_path) - - # To match lines that start with sh '' or sh ''' ''' (either single or triple quotes) - # TODO: we need to support multi-line cases. + # # To match lines that start with sh '' or sh ''' ''' (either single or triple quotes) + # # TODO: we need to support multi-line cases. pattern = r"^\s*sh\s+'{1,3}(.*?)'{1,3}$" workflow_files = self.get_workflows(repo_path) + nodes: list[Node] = [] + for workflow_path in workflow_files: try: with open(workflow_path, encoding="utf-8") as wf: lines = wf.readlines() except OSError as error: logger.debug("Unable to read Jenkinsfile %s: %s", workflow_path, error) - return call_graph - - # Add internal workflow. - workflow_name = os.path.basename(workflow_path) - workflow_node = JenkinsNode( - name=workflow_name, - node_type=JenkinsNodeType.INTERNAL, - source_path=workflow_path, - caller=root, - ) - root.add_callee(workflow_node) + return NodeForest([]) # Find matching lines. for line in lines: match = re.match(pattern, line) if not match: continue + nodes.append(analyse_bash_script(match[1], workflow_path, repo_path)) - try: - parsed_bash_script = bashparser.parse(match.group(1), macaron_path=macaron_path) - except ParseError as error: - logger.debug(error) - continue - - # TODO: Similar to GitHub Actions, we should enable support for recursive calls to bash scripts - # within Jenkinsfiles. While the implementation should be relatively straightforward, it’s - # recommended to first refactor the bashparser to make it agnostic to GitHub Actions. 
-                bash_node = bashparser.BashNode(
-                    "jenkins_inline_cmd",
-                    bashparser.BashScriptType.INLINE,
-                    workflow_path,
-                    parsed_step_obj=None,
-                    parsed_bash_obj=parsed_bash_script,
-                    node_id=None,
-                    caller=workflow_node,
-                )
-                workflow_node.add_callee(bash_node)
-
-        return call_graph
-
-    def get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
-        """
-        Traverse the callgraph and find all the reachable build tool commands.
-
-        Parameters
-        ----------
-        callgraph: CallGraph
-            The callgraph reachable from the CI workflows.
-        build_tool: BaseBuildTool
-            The corresponding build tool for which shell commands need to be detected.
-
-        Yields
-        ------
-        BuildToolCommand
-            The object that contains the build command as well useful contextual information.
-
-        Raises
-        ------
-        CallGraphError
-            Error raised when an error occurs while traversing the callgraph.
-        """
-        yield from sorted(
-            self._get_build_tool_commands(callgraph=callgraph, build_tool=build_tool),
-            key=str,
-        )
-
-    def _get_build_tool_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
-        """Traverse the callgraph and find all the reachable build tool commands."""
-        for node in callgraph.bfs():
-            # We are just interested in nodes that have bash commands.
-            if isinstance(node, bashparser.BashNode):
-                # The Jenkins configuration that triggers the path in the callgraph.
-                workflow_node = node.caller
-
-                # Find the bash commands that call the build tool.
-                for cmd in node.parsed_bash_obj.get("commands", []):
-                    if build_tool.is_build_command(cmd):
-                        yield BuildToolCommand(
-                            ci_path=workflow_node.source_path if workflow_node else "",
-                            command=cmd,
-                            step_node=None,
-                            language=build_tool.language,
-                            language_versions=None,
-                            language_distributions=None,
-                            language_url=None,
-                            reachable_secrets=[],
-                            events=None,
-                        )
+        return NodeForest(nodes)
 
     def has_latest_run_passed(
         self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str
@@ -214,41 +131,3 @@ def has_latest_run_passed(
         The feed back of the check, or empty if no passing workflow is found.
         """
         return ""
-
-
-class JenkinsNodeType(str, Enum):
-    """This class represents Jenkins node type."""
-
-    INTERNAL = "internal"  # Configurations declared in one file.
-
-
-class JenkinsNode(BaseNode):
-    """This class represents a callgraph node for Jenkinsfile configuration."""
-
-    def __init__(
-        self,
-        name: str,
-        node_type: JenkinsNodeType,
-        source_path: str,
-        **kwargs: Any,
-    ) -> None:
-        """Initialize instance.
-
-        Parameters
-        ----------
-        name : str
-            Name of the workflow.
-        node_type : JenkinsNodeType
-            The type of node.
-        source_path : str
-            The path of the workflow.
-        caller: BaseNode | None
-            The caller node.
-        """
-        super().__init__(**kwargs)
-        self.name = name
-        self.node_type: JenkinsNodeType = node_type
-        self.source_path = source_path
-
-    def __str__(self) -> str:
-        return f"JenkinsNodeType({self.name},{self.node_type})"
diff --git a/src/macaron/slsa_analyzer/ci_service/travis.py b/src/macaron/slsa_analyzer/ci_service/travis.py
index 8b34d27e8..a50936860 100644
--- a/src/macaron/slsa_analyzer/ci_service/travis.py
+++ b/src/macaron/slsa_analyzer/ci_service/travis.py
@@ -1,9 +1,11 @@
-# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module analyzes Travis CI.""" -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from __future__ import annotations + +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import defaults from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService @@ -41,7 +43,7 @@ def load_defaults(self) -> None: def set_api_client(self) -> None: """Set the API client using the personal access token.""" - def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: + def build_call_graph(self, repo_path: str, macaron_path: str = "") -> NodeForest: """Build the call Graph for this CI service. Parameters @@ -53,10 +55,10 @@ def build_call_graph(self, repo_path: str, macaron_path: str = "") -> CallGraph: Returns ------- - CallGraph : CallGraph + NodeForest The call graph built for the CI. """ - return CallGraph(BaseNode(), "") + return NodeForest([]) def has_latest_run_passed( self, repo_full_name: str, branch_name: str | None, commit_sha: str, commit_date: str, workflow: str diff --git a/src/macaron/slsa_analyzer/specs/ci_spec.py b/src/macaron/slsa_analyzer/specs/ci_spec.py index 0f00e5bdb..ad928b792 100644 --- a/src/macaron/slsa_analyzer/specs/ci_spec.py +++ b/src/macaron/slsa_analyzer/specs/ci_spec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the BuildSpec class.""" @@ -6,7 +6,7 @@ from collections.abc import Sequence from typing import TypedDict -from macaron.code_analyzer.call_graph import CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.asset import AssetLocator from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload @@ -19,7 +19,7 @@ class CIInfo(TypedDict): service: BaseCIService """The CI service data.""" - callgraph: CallGraph + callgraph: NodeForest """The call graph for this CI service.""" provenance_assets: list[AssetLocator] diff --git a/tests/conftest.py b/tests/conftest.py index d4ed2ab1b..d3c0405c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,14 +10,13 @@ import pytest from pytest_httpserver import HTTPServer -import macaron from macaron.build_spec_generator.cli_command_parser.gradle_cli_parser import GradleCLICommandParser from macaron.build_spec_generator.cli_command_parser.maven_cli_parser import MavenCLICommandParser -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import create_defaults, defaults, load_defaults from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata, Repository -from macaron.parsers.bashparser import BashScriptType, create_bash_node -from macaron.parsers.github_workflow_model import Identified, Job, NormalJob, RunStep, Workflow +from macaron.parsers.github_workflow_model import NormalJob, RunStep, Workflow from macaron.slsa_analyzer.analyze_context import AnalyzeContext from 
macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.build_tool.docker import Docker @@ -30,11 +29,6 @@ from macaron.slsa_analyzer.build_tool.yarn import Yarn from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService from macaron.slsa_analyzer.ci_service.circleci import CircleCI -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubJobNode, - GitHubWorkflowNode, - GitHubWorkflowType, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.ci_service.gitlab_ci import GitLabCI from macaron.slsa_analyzer.ci_service.jenkins import Jenkins @@ -423,7 +417,7 @@ def __init__( super().__init__(component, *args, **kwargs) -def build_github_actions_call_graph_for_commands(commands: list[str]) -> CallGraph: +def build_github_actions_call_graph_for_commands(commands: list[str]) -> NodeForest: """ Create a dummy callgraph that calls a list of bash commands for testing. @@ -432,37 +426,10 @@ def build_github_actions_call_graph_for_commands(commands: list[str]) -> CallGra commands: list[str] The list of bash commands. """ - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") run_step: RunStep = {"run": ";".join(commands)} job_obj: NormalJob = {"runs-on": "", "steps": [run_step]} workflow_obj: Workflow = {"on": "release", "jobs": {"release": job_obj}} - workflow_node = GitHubWorkflowNode( - name="", - node_type=GitHubWorkflowType.INTERNAL, - source_path="", - parsed_obj=workflow_obj, - caller=root, - ) - root.add_callee(workflow_node) - job_obj_with_id: Identified[Job] = Identified("release", job_obj) - job_node = GitHubJobNode(name="", source_path="", parsed_obj=job_obj_with_id, caller=workflow_node) - workflow_node.add_callee(job_node) - - job_node.add_callee( - create_bash_node( - name="run", - node_id=None, - node_type=BashScriptType.INLINE, - source_path="", - ci_step_ast=run_step, - repo_path="", - caller=job_node, - recursion_depth=0, - macaron_path=macaron.MACARON_PATH, - ) - ) - + gh_cg = NodeForest([analyse_github_workflow(workflow_obj, "test.yaml", None)]) return gh_cg diff --git a/tests/parsers/bashparser/test_bashparser.py b/tests/parsers/bashparser/test_bashparser.py index 3f8ff5331..97c431034 100644 --- a/tests/parsers/bashparser/test_bashparser.py +++ b/tests/parsers/bashparser/test_bashparser.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """ @@ -12,9 +12,8 @@ import pytest from macaron import MACARON_PATH -from macaron.code_analyzer.call_graph import BaseNode -from macaron.errors import CallGraphError, ParseError -from macaron.parsers.bashparser import BashScriptType, create_bash_node, parse, parse_file +from macaron.errors import ParseError +from macaron.parsers.bashparser import parse, parse_file @pytest.mark.parametrize( @@ -47,36 +46,3 @@ def test_bashparser_parse_invalid() -> None: # Parse the bash script file. 
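An editor's note on the rewritten `build_github_actions_call_graph_for_commands` helper above: the workflow object it feeds to `analyse_github_workflow` is a plain nested dict annotated with the `RunStep`/`NormalJob`/`Workflow` types from `macaron.parsers.github_workflow_model`. A minimal sketch with arbitrary commands, which also makes explicit that `";".join` folds every command into a single inline run step:

from macaron.parsers.github_workflow_model import NormalJob, RunStep, Workflow

commands = ["mvn -B deploy", "ls dist"]
run_step: RunStep = {"run": ";".join(commands)}
job_obj: NormalJob = {"runs-on": "", "steps": [run_step]}
workflow_obj: Workflow = {"on": "release", "jobs": {"release": job_obj}}

assert run_step["run"] == "mvn -B deploy;ls dist"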
with pytest.raises(ParseError): parse_file(file_path=file_path, macaron_path=MACARON_PATH) - - -def test_create_bash_node_recursively() -> None: - """Test creating bash nodes from recursive script.""" - resources_dir = Path(__file__).parent.joinpath("resources", "bash_files") - with pytest.raises(CallGraphError, match="The analysis has reached maximum recursion depth .*"): - create_bash_node( - name="run", - node_id=None, - node_type=BashScriptType.FILE, - source_path=os.path.join(resources_dir, "recursive.sh"), - ci_step_ast=None, - repo_path=str(resources_dir), - caller=BaseNode(), - recursion_depth=0, - macaron_path=MACARON_PATH, - ) - - -def test_create_bash_node_path_traversal_attack() -> None: - """Test creating bash nodes from a script that is vulnerable to path traversal attacks.""" - resources_dir = Path(__file__).parent.joinpath("resources", "bash_files") - assert not create_bash_node( - name="run", - node_id=None, - node_type=BashScriptType.FILE, - source_path=os.path.join(resources_dir, "path_traversal.sh"), - ci_step_ast=None, - repo_path=str(resources_dir), - caller=BaseNode(), - recursion_depth=0, - macaron_path=MACARON_PATH, - ).callee diff --git a/tests/provenance/test_provenance_finder.py b/tests/provenance/test_provenance_finder.py index 5a1148364..774d2ff9e 100644 --- a/tests/provenance/test_provenance_finder.py +++ b/tests/provenance/test_provenance_finder.py @@ -13,7 +13,7 @@ from packageurl import PackageURL from pydriller import Git -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.provenance.provenance_finder import ( find_gav_provenance, find_npm_provenance, @@ -165,7 +165,7 @@ def test_provenance_on_unsupported_ci(macaron_path: Path, service: BaseCIService ci_info = CIInfo( service=service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -190,7 +190,7 @@ def test_provenance_on_supported_ci(macaron_path: Path, test_dir: Path) -> None: ci_info = CIInfo( service=github_actions, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/build_tool/test_docker.py b/tests/slsa_analyzer/build_tool/test_docker.py index 17e8e0114..4f256e5c9 100644 --- a/tests/slsa_analyzer/build_tool/test_docker.py +++ b/tests/slsa_analyzer/build_tool/test_docker.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module tests the Docker build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.docker import Docker from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -126,7 +125,7 @@ def test_is_docker_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -215,7 +214,7 @@ def test_is_docker_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_go.py b/tests/slsa_analyzer/build_tool/test_go.py index 7f0cb431f..3f2796326 100644 --- a/tests/slsa_analyzer/build_tool/test_go.py +++ b/tests/slsa_analyzer/build_tool/test_go.py @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.go import Go from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -126,7 +125,7 @@ def test_is_go_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -215,7 +214,7 @@ def test_is_go_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_gradle.py b/tests/slsa_analyzer/build_tool/test_gradle.py index 4298e7fb8..6896159df 100644 --- a/tests/slsa_analyzer/build_tool/test_gradle.py +++ b/tests/slsa_analyzer/build_tool/test_gradle.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the Gradle build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.gradle import Gradle from macaron.slsa_analyzer.build_tool.language import BuildLanguage @@ -177,7 +176,7 @@ def test_is_gradle_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -332,7 +331,7 @@ def test_is_gradle_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_maven.py b/tests/slsa_analyzer/build_tool/test_maven.py index 19cb9573f..c67f99298 100644 --- a/tests/slsa_analyzer/build_tool/test_maven.py +++ b/tests/slsa_analyzer/build_tool/test_maven.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the Maven build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.maven import Maven @@ -177,7 +176,7 @@ def test_is_maven_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -332,7 +331,7 @@ def test_is_maven_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_npm.py b/tests/slsa_analyzer/build_tool/test_npm.py index 423e02199..f27b623f0 100644 --- a/tests/slsa_analyzer/build_tool/test_npm.py +++ b/tests/slsa_analyzer/build_tool/test_npm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the NPM build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.npm import NPM @@ -124,7 +123,7 @@ def test_is_npm_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -213,7 +212,7 @@ def test_is_npm_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_pip.py b/tests/slsa_analyzer/build_tool/test_pip.py index 4a7fb447f..50b1db4a3 100644 --- a/tests/slsa_analyzer/build_tool/test_pip.py +++ b/tests/slsa_analyzer/build_tool/test_pip.py @@ -1,11 +1,10 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module tests the Pip build functions.""" import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.pip import Pip @@ -91,7 +90,7 @@ def test_is_pip_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -202,7 +201,7 @@ def test_is_pip_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_poetry.py b/tests/slsa_analyzer/build_tool/test_poetry.py index 4923d23ef..ae42669af 100644 --- a/tests/slsa_analyzer/build_tool/test_poetry.py +++ b/tests/slsa_analyzer/build_tool/test_poetry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the Poetry build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.poetry import Poetry @@ -122,7 +121,7 @@ def test_is_poetry_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -222,7 +221,7 @@ def test_is_poetry_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/build_tool/test_yarn.py b/tests/slsa_analyzer/build_tool/test_yarn.py index 06f645028..48f49977c 100644 --- a/tests/slsa_analyzer/build_tool/test_yarn.py +++ b/tests/slsa_analyzer/build_tool/test_yarn.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module tests the Yarn build functions.""" @@ -7,7 +7,6 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode from macaron.slsa_analyzer.build_tool.base_build_tool import BuildToolCommand from macaron.slsa_analyzer.build_tool.language import BuildLanguage from macaron.slsa_analyzer.build_tool.yarn import Yarn @@ -124,7 +123,7 @@ def test_is_yarn_deploy_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), @@ -213,7 +212,7 @@ def test_is_yarn_package_command( language_distributions=language_distributions, language_url=None, ci_path=ci_path, - step_node=BaseNode(), + step_node=None, reachable_secrets=reachable_secrets, events=events, ), diff --git a/tests/slsa_analyzer/checks/test_build_as_code_check.py b/tests/slsa_analyzer/checks/test_build_as_code_check.py index b1bd82b12..75962cc18 100644 --- a/tests/slsa_analyzer/checks/test_build_as_code_check.py +++ b/tests/slsa_analyzer/checks/test_build_as_code_check.py @@ -9,19 +9,14 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.parsers.actionparser import parse as parse_action +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.build_tool.gradle import Gradle from macaron.slsa_analyzer.build_tool.pip import Pip from macaron.slsa_analyzer.checks.build_as_code_check import BuildAsCodeCheck, BuildAsCodeFacts from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_node, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.ci_service.jenkins import Jenkins from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload @@ -54,7 +49,7 @@ def test_build_as_code_check_no_callgraph( """Test the Build As Code Check when no callgraph is built for the CI service.""" ci_info = CIInfo( service=ci_services[ci_name], - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -143,7 +138,7 @@ def test_gha_workflow_deployment( check = BuildAsCodeCheck() ci_info = CIInfo( service=github_actions_service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -157,20 +152,8 @@ def test_gha_workflow_deployment( gha_deploy.dynamic_data["build_spec"]["tools"] = [pip_tool] gha_deploy.dynamic_data["ci_services"] = [ci_info] - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") workflow_path = os.path.join(workflows_dir, workflow_name) - parsed_obj = parse_action(workflow_path) - callee = GitHubWorkflowNode( - name=os.path.basename(workflow_path), - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - root.add_callee(callee) - build_call_graph_from_node(callee, repo_path="") - ci_info["callgraph"] = gh_cg + ci_info["callgraph"] = NodeForest([analyse_github_workflow_file(workflow_path, None)]) assert check.run_check(gha_deploy).result_type == expected_result @@ -189,7 +172,7 
@@ def test_travis_ci_deploy( ci_info = CIInfo( service=travis_service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/checks/test_build_service_check.py b/tests/slsa_analyzer/checks/test_build_service_check.py index 4a5496c39..21ab9c1fe 100644 --- a/tests/slsa_analyzer/checks/test_build_service_check.py +++ b/tests/slsa_analyzer/checks/test_build_service_check.py @@ -8,7 +8,7 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.checks.build_service_check import BuildServiceCheck, BuildServiceFacts from macaron.slsa_analyzer.checks.check_result import CheckResultType @@ -44,7 +44,7 @@ def test_build_service_check_no_callgraph( """Test the Build Service Check when no callgraph is built for the CI service.""" ci_info = CIInfo( service=ci_services[ci_name], - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py b/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py index 883dfcc09..a58ceaf2b 100644 --- a/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py +++ b/tests/slsa_analyzer/checks/test_github_actions_vulnerability_check.py @@ -11,12 +11,12 @@ import pytest from pytest_httpserver import HTTPServer -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.config.defaults import load_defaults from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.github_actions_vulnerability_check import GitHubActionsVulnsCheck from macaron.slsa_analyzer.ci_service.base_ci_service import BaseCIService -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import build_call_graph_from_path from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload from macaron.slsa_analyzer.specs.ci_spec import CIInfo from macaron.slsa_analyzer.specs.inferred_provenance import InferredProvenance @@ -29,17 +29,14 @@ def get_ci_info(ci_services: dict[str, BaseCIService], ci_name: str, workflow_pa """Get CIInfo instance.""" ci_info = CIInfo( service=ci_services[ci_name], - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], build_info_results=InTotoV01Payload(statement=InferredProvenance().payload), ) if ci_name == "github_actions": - root_node: BaseNode = BaseNode() - workflow_node = build_call_graph_from_path(root_node, workflow_path=workflow_path, repo_path="") - root_node.add_callee(workflow_node) - ci_info["callgraph"] = CallGraph(root_node, "") + ci_info["callgraph"] = NodeForest([analyse_github_workflow_file(workflow_path, None)]) return ci_info diff --git a/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py b/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py index 8584e5f35..4abf8df64 100644 --- a/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py +++ b/tests/slsa_analyzer/checks/test_provenance_l3_content_check.py @@ -5,7 +5,7 @@ import os -from macaron.code_analyzer.call_graph import BaseNode, 
CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.asset import VirtualReleaseAsset from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.provenance_l3_content_check import ProvenanceL3ContentCheck @@ -82,7 +82,7 @@ def test_expectation_check(self) -> None: # Test GitHub Actions. ci_info = CIInfo( service=github_actions, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], diff --git a/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py b/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py index c36eba0d5..6f72ab739 100644 --- a/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py +++ b/tests/slsa_analyzer/checks/test_trusted_builder_l3_check.py @@ -8,15 +8,10 @@ import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.parsers.actionparser import parse as parse_action +from macaron.code_analyzer.dataflow_analysis.analysis import analyse_github_workflow_file +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.trusted_builder_l3_check import TrustedBuilderL3Check -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_node, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload from macaron.slsa_analyzer.specs.ci_spec import CIInfo @@ -47,7 +42,7 @@ def test_trusted_builder_l3_check( workflows_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "github", "workflow_files") ci_info = CIInfo( service=github_actions_service, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[], @@ -57,18 +52,6 @@ def test_trusted_builder_l3_check( ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="") ctx.dynamic_data["ci_services"] = [ci_info] - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") workflow_path = os.path.join(workflows_dir, workflow_name) - parsed_obj = parse_action(workflow_path) - callee = GitHubWorkflowNode( - name=workflow_name, - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - build_call_graph_from_node(callee, repo_path="") - root.add_callee(callee) - ci_info["callgraph"] = gh_cg + ci_info["callgraph"] = NodeForest([analyse_github_workflow_file(workflow_path, None)]) assert check.run_check(ctx).result_type == expected_result diff --git a/tests/slsa_analyzer/ci_service/test_github_actions.py b/tests/slsa_analyzer/ci_service/test_github_actions.py index 1995c3705..4da4f7d2a 100644 --- a/tests/slsa_analyzer/ci_service/test_github_actions.py +++ b/tests/slsa_analyzer/ci_service/test_github_actions.py @@ -1,21 +1,13 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module tests GitHub Actions CI service.""" -import os from datetime import datetime, timedelta from pathlib import Path import pytest -from macaron.code_analyzer.call_graph import BaseNode, CallGraph -from macaron.parsers.actionparser import parse as parse_action -from macaron.slsa_analyzer.ci_service.github_actions.analyzer import ( - GitHubWorkflowNode, - GitHubWorkflowType, - build_call_graph_from_node, -) from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions mock_repos = Path(__file__).parent.joinpath("mock_repos") @@ -30,59 +22,6 @@ def github_actions_() -> GitHubActions: return GitHubActions() -@pytest.mark.parametrize( - ( - "workflow_name", - "expect", - ), - [ - ( - "valid1.yaml", - [ - "GitHubWorkflowNode(valid1.yaml,GitHubWorkflowType.INTERNAL)", - "GitHubJobNode(build)", - "GitHubWorkflowNode(apache/maven-gh-actions-shared/.github/workflows/maven-verify.yml@v2,GitHubWorkflowType.REUSABLE)", - ], - ), - ( - "valid2.yaml", - [ - "GitHubWorkflowNode(valid2.yaml,GitHubWorkflowType.INTERNAL)", - "GitHubJobNode(build)", - "GitHubWorkflowNode(actions/checkout@v3,GitHubWorkflowType.EXTERNAL)", - "GitHubWorkflowNode(actions/cache@v3,GitHubWorkflowType.EXTERNAL)", - "GitHubWorkflowNode(actions/setup-java@v3,GitHubWorkflowType.EXTERNAL)", - "BashNode(Publish to Sonatype Snapshots,BashScriptType.INLINE)", - ], - ), - ], - ids=[ - "Internal and reusable workflows", - "Internal and external workflows", - ], -) -def test_build_call_graph(workflow_name: str, expect: list[str]) -> None: - """Test building call graphs for GitHub Actions workflows.""" - resources_dir = Path(__file__).parent.joinpath("resources", "github") - - # Parse GitHub Actions workflows. - root: BaseNode = BaseNode() - gh_cg = CallGraph(root, "") - workflow_path = os.path.join(resources_dir, workflow_name) - parsed_obj = parse_action(workflow_path) - - callee = GitHubWorkflowNode( - name=os.path.basename(workflow_path), - node_type=GitHubWorkflowType.INTERNAL, - source_path=workflow_path, - parsed_obj=parsed_obj, - caller=root, - ) - root.add_callee(callee) - build_call_graph_from_node(callee, repo_path="") - assert [str(node) for node in gh_cg.bfs()] == expect - - def test_is_detected(github_actions: GitHubActions) -> None: """Test detecting GitHub Action config files.""" assert github_actions.is_detected(str(ga_has_build_kws)) diff --git a/tests/slsa_analyzer/test_analyze_context.py b/tests/slsa_analyzer/test_analyze_context.py index 40a4ad881..4b1b1e776 100644 --- a/tests/slsa_analyzer/test_analyze_context.py +++ b/tests/slsa_analyzer/test_analyze_context.py @@ -6,7 +6,7 @@ from unittest import TestCase from unittest.mock import MagicMock -from macaron.code_analyzer.call_graph import BaseNode, CallGraph +from macaron.code_analyzer.dataflow_analysis.core import NodeForest from macaron.json_tools import JsonType from macaron.slsa_analyzer.asset import VirtualReleaseAsset from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions @@ -93,7 +93,7 @@ def test_provenances(self) -> None: gh_actions_ci_info = CIInfo( service=gh_actions, - callgraph=CallGraph(BaseNode(), ""), + callgraph=NodeForest([]), provenance_assets=[], release={}, provenances=[