insightsengineering · Copilot · Aug 7, 2025 · Aug 7, 2025 · Aug 8, 2025
diff --git a/R/utils-get_code_dependency.R b/R/utils-get_code_dependency.R
@@ -211,108 +211,254 @@
 #' @keywords internal
 #' @noRd
 extract_occurrence <- function(pd) {
-  is_in_function <- function(x) {
-    # If an object is a function parameter,
-    # then in calls_pd there is a `SYMBOL_FORMALS` entry for that object.
-    function_id <- x[x$token == "FUNCTION", "parent"]
-    if (length(function_id)) {
-      x$id %in% get_children(x, function_id[1])$id
-    } else {
-      rep(FALSE, nrow(x))
-    }
+  # Handle special function calls first
+  data_dependency <- handle_data_call(pd)
+  if (!is.null(data_dependency)) {
+    return(data_dependency)
+  }
+
+  assign_dependency <- handle_assign_call(pd)
+  if (!is.null(assign_dependency)) {
+    return(assign_dependency)
   }
-  in_parenthesis <- function(x) {
-    if (any(x$token %in% c("LBB", "'['"))) {
-      id_start <- min(x$id[x$token %in% c("LBB", "'['")])
-      id_end <- min(x$id[x$token == "']'"])
-      x$text[x$token == "SYMBOL" & x$id > id_start & x$id < id_end]
+
+  # Process general assignment expressions
+  extract_general_assignment(pd)
+}
+
+#' Handle data() function calls
+#'
+#' Extracts dependencies for data() function calls like data(object) or data("object").
+#'
+#' @param pd `data.frame` parse data for a single call
+#' @return Character vector with dependency info or NULL if not a data call
+#' @keywords internal
+#' @noRd
+handle_data_call <- function(pd) {
+  data_call_pos <- find_call(pd, "data")
+  if (data_call_pos == 0) {
+    return(NULL)
+  }
+
+  # Extract the object name from data(object)
+  object_name <- pd[data_call_pos + 1, "text"]
+  # Remove quotes if present: data("object") -> object
+  clean_name <- gsub("^['\"]|['\"]$", "", object_name)
+  c(clean_name, "<-")
+}
+
+#' Handle assign() function calls
+#'
+#' Extracts dependencies for assign() function calls with named or unnamed parameters.
+#'
+#' @param pd `data.frame` parse data for a single call
+#' @return Character vector with dependency info or NULL if not an assign call
+#' @keywords internal
+#' @noRd
+handle_assign_call <- function(pd) {
+  assign_call_pos <- find_call(pd, "assign")
+  if (assign_call_pos == 0) {
+    return(NULL)
+  }
+
+  param_position <- get_assign_param_position(pd)
+  if (param_position == 0) {
+    return(character(0L))
+  }
+
+  # Extract the assigned object name
+  object_name <- pd[assign_call_pos + param_position, "text"]
+  # Remove quotes if present: assign("object", value) -> object
+  clean_name <- gsub("^['\"]|['\"]$", "", object_name)
+  c(clean_name, "<-")
+}
+
+#' Get parameter position for assign() function
+#'
+#' Determines the position of the 'x' parameter in assign() calls,
+#' handling both named and unnamed parameter cases.
+#'
+#' @param pd `data.frame` parse data for a single call
+#' @return Integer position offset from assign call, or 0 if 'x' parameter not found
+#' @keywords internal
+#' @noRd
+get_assign_param_position <- function(pd) {
+  # Check if any parameters are named
+  has_named_params <- any(pd$token == "SYMBOL_SUB")
+
+  if (has_named_params) {
+    # Extract parameter tokens: named parameters (SYMBOL_SUB), commas, equals
+    param_tokens <- pd[pd$token %in% c("SYMBOL_SUB", "','", "EQ_SUB"), "text"]
+
+    # Clean up sequences of "=" followed by ","
+    cleaned_params <- remove_consecutive_equals_comma(param_tokens)
+
+    # Find position of 'x' parameter among named parameters
+    pos <- match("x", setdiff(cleaned_params, ","), nomatch = 0)
+    if (pos == 0) {
+      # If 'x' not found in named params, check for first unnamed parameter
+      pos <- match(",", cleaned_params, nomatch = 0)
     }
+    return(pos)
+  } else {
+    # All parameters are unnamed, 'x' is the first parameter
+    return(1)
   }
+}
 
-  # Handle data(object)/data("object")/data(object, envir = ) independently.
-  data_call <- find_call(pd, "data")
-  if (data_call) {
-    sym <- pd[data_call + 1, "text"]
-    return(c(gsub("^['\"]|['\"]$", "", sym), "<-"))
+#' Remove consecutive "=" and "," sequences from parameter tokens
+#'
+#' @param params Character vector of parameter tokens
+#' @return Character vector with "=", "," sequences removed
+#' @keywords internal
+#' @noRd
+remove_consecutive_equals_comma <- function(params) {
+  if (length(params) <= 1) {
+    return(params)
   }
-  # Handle assign(x = ).
-  assign_call <- find_call(pd, "assign")
-  if (assign_call) {
-    # Check if parameters were named.
-    # "','" is for unnamed parameters, where "SYMBOL_SUB" is for named.
-    # "EQ_SUB" is for `=` appearing after the name of the named parameter.
-    if (any(pd$token == "SYMBOL_SUB")) {
-      params <- pd[pd$token %in% c("SYMBOL_SUB", "','", "EQ_SUB"), "text"]
-      # Remove sequence of "=", ",".
-      if (length(params > 1)) {
-        remove <- integer(0)
-        for (i in 2:length(params)) {
-          if (params[i - 1] == "=" && params[i] == ",") {
-            remove <- c(remove, i - 1, i)
-          }
-        }
-        if (length(remove)) params <- params[-remove]
-      }
-      pos <- match("x", setdiff(params, ","), nomatch = match(",", params, nomatch = 0))
-      if (!pos) {
-        return(character(0L))
-      }
-      # pos is indicator of the place of 'x'
-      # 1. All parameters are named, but none is 'x' - return(character(0L))
-      # 2. Some parameters are named, 'x' is in named parameters: match("x", setdiff(params, ","))
-      # - check "x" in params being just a vector of named parameters.
-      # 3. Some parameters are named, 'x' is not in named parameters
-      # - check first appearance of "," (unnamed parameter) in vector parameters.
-    } else {
-      # Object is the first entry after 'assign'.
-      pos <- 1
+
+  indices_to_remove <- integer(0)
+  for (i in 2:length(params)) {
+    if (params[i - 1] == "=" && params[i] == ",") {
+      indices_to_remove <- c(indices_to_remove, i - 1, i)
     }
-    sym <- pd[assign_call + pos, "text"]
-    return(c(gsub("^['\"]|['\"]$", "", sym), "<-"))
   }
 
-  # What occurs in a function body is not tracked.
-  x <- pd[!is_in_function(pd), ]
-  sym_cond <- which(x$token %in% c("SPECIAL", "SYMBOL", "SYMBOL_FUNCTION_CALL"))
-  sym_fc_cond <- which(x$token == "SYMBOL_FUNCTION_CALL")
+  if (length(indices_to_remove) > 0) {
+    params[-indices_to_remove]
+  } else {
+    params
+  }
+}
 
-  if (length(sym_cond) == 0) {
+#' Extract dependencies from general assignment expressions
+#'
+#' Processes standard assignment operations (<-, =, ->) and extracts
+#' object dependencies while handling special cases.
+#'
+#' @param pd `data.frame` parse data for a single call
+#' @return Character vector with dependency information
+#' @keywords internal
+#' @noRd
+extract_general_assignment <- function(pd) {
+  # Filter out symbols that are function parameters
+  filtered_pd <- pd[!is_symbol_in_function_body(pd), ]
+
+  # Find all symbol positions
+  symbol_indices <- which(filtered_pd$token %in% c("SPECIAL", "SYMBOL", "SYMBOL_FUNCTION_CALL"))
+  function_call_indices <- which(filtered_pd$token == "SYMBOL_FUNCTION_CALL")
+
+  if (length(symbol_indices) == 0) {
     return(character(0L))
   }
-  # Watch out for SYMBOLS after $ and @. For x$a x@a: x is object, a is not.
-  # For x$a, a's ID is $'s ID-2 so we need to remove all IDs that have ID = $ID - 2.
-  dollar_ids <- x[x$token %in% c("'$'", "'@'"), "id"]
-  if (length(dollar_ids)) {
-    object_ids <- x[sym_cond, "id"]
-    after_dollar <- object_ids[(object_ids - 2) %in% dollar_ids]
-    sym_cond <- setdiff(sym_cond, which(x$id %in% after_dollar))
+
+  # Remove symbols that come after $ or @ operators (e.g., in x$a, remove 'a')
+  symbol_indices <- exclude_symbols_after_operators(filtered_pd, symbol_indices)
+
+  # Look for assignment operators
+  assignment_indices <- grep("ASSIGN", filtered_pd$token)
+  if (length(assignment_indices) == 0) {
+    # No assignment found, return all symbols as dependencies
+    return(c("<-", unique(filtered_pd[symbol_indices, "text"])))
   }
 
-  assign_cond <- grep("ASSIGN", x$token)
-  if (!length(assign_cond)) {
-    return(c("<-", unique(x[sym_cond, "text"])))
+  # Process assignment expression
+  process_assignment_expression(filtered_pd, symbol_indices, function_call_indices, assignment_indices, pd)
+}
+
+#' Check if symbols are in function body
+#'
+#' @param pd `data.frame` parse data
+#' @return Logical vector indicating which rows are function parameters
+#' @keywords internal
+#' @noRd
+is_symbol_in_function_body <- function(pd) {
+  function_ids <- pd[pd$token == "FUNCTION", "parent"]
+  if (length(function_ids) == 0) {
+    return(rep(FALSE, nrow(pd)))
   }
 
-  # For cases like 'eval(expression(c <- b + 2))' removes 'eval(expression('.
-  sym_cond <- sym_cond[!(sym_cond < min(assign_cond) & sym_cond %in% sym_fc_cond)]
+  # Get all children of function definitions
+  function_children <- get_children(pd, function_ids[1])$id
+  pd$id %in% function_children
+}
 
-  # If there was an assignment operation detect direction of it.
-  if (unique(x$text[assign_cond]) == "->") { # What if there are 2 assignments: e.g. a <- b -> c.
-    sym_cond <- rev(sym_cond)
+#' Exclude symbols that appear after $ or @ operators
+#'
+#' @param pd `data.frame` parse data
+#' @param symbol_indices Integer vector of symbol positions
+#' @return Integer vector of filtered symbol positions
+#' @keywords internal
+#' @noRd
+exclude_symbols_after_operators <- function(pd, symbol_indices) {
+  # Find $ and @ operator positions
+  operator_ids <- pd[pd$token %in% c("'$'", "'@'"), "id"]
+  if (length(operator_ids) == 0) {
+    return(symbol_indices)
   }
 
-  after <- match(min(x$id[assign_cond]), sort(x$id[c(min(assign_cond), sym_cond)])) - 1
-  ans <- append(x[sym_cond, "text"], "<-", after = max(1, after))
-  ans <- move_functions_after_arrow(ans, unique(x[sym_fc_cond, "text"]))
-  roll <- in_parenthesis(pd)
-  if (length(roll)) {
-    c(setdiff(ans, roll), roll)
+  # For x$a, a's ID is $'s ID-2, so we exclude symbols with ID = operator_ID - 2
+  symbol_ids <- pd[symbol_indices, "id"]
+  symbols_after_operators <- symbol_ids[(symbol_ids - 2) %in% operator_ids]
+
+  # Remove these symbols from our indices
+  setdiff(symbol_indices, which(pd$id %in% symbols_after_operators))
+}
+
+#' Process assignment expression and build dependency vector
+#'
+#' @param pd `data.frame` filtered parse data
+#' @param symbol_indices Integer vector of symbol positions
+#' @param function_call_indices Integer vector of function call positions
+#' @param assignment_indices Integer vector of assignment operator positions
+#' @param original_pd `data.frame` original parse data for bracket processing
+#' @return Character vector with dependency information
+#' @keywords internal
+#' @noRd
+process_assignment_expression <- function(pd, symbol_indices, function_call_indices, assignment_indices, original_pd) {
+  # Remove function calls that appear before assignment (e.g., in eval(expression(c <- b)))
+  symbol_indices <- symbol_indices[!(symbol_indices < min(assignment_indices) & symbol_indices %in% function_call_indices)]
+
+  # Handle right-to-left assignment (->) by reversing symbol order
+  if (unique(pd$text[assignment_indices]) == "->") {
+    symbol_indices <- rev(symbol_indices)
+  }
+
+  # Build dependency vector with assignment operator in correct position
+  assignment_pos <- match(min(pd$id[assignment_indices]), sort(pd$id[c(min(assignment_indices), symbol_indices)])) - 1
+  dependency_vector <- append(pd[symbol_indices, "text"], "<-", after = max(1, assignment_pos))
+
+  # Move function names to right side of dependency arrow
+  dependency_vector <- move_functions_after_arrow(dependency_vector, unique(pd[function_call_indices, "text"]))
+
+  # Handle symbols in brackets/parentheses
+  bracket_symbols <- extract_symbols_in_brackets(original_pd)
+  if (length(bracket_symbols) > 0) {
+    c(setdiff(dependency_vector, bracket_symbols), bracket_symbols)
   } else {
-    ans
+    dependency_vector
   }
 }
 
-#' Moves function names to the right side of dependency graph
+#' Extract symbols that appear within brackets or parentheses
+#'
+#' @param pd `data.frame` parse data
+#' @return Character vector of symbols in brackets, or NULL
+#' @keywords internal
+#' @noRd
+extract_symbols_in_brackets <- function(pd) {
+  # Look for bracket/parenthesis tokens
+  has_brackets <- any(pd$token %in% c("LBB", "'['"))
+  if (!has_brackets) {
+    return(NULL)
+  }
+
+  start_id <- min(pd$id[pd$token %in% c("LBB", "'['")])
+  end_id <- min(pd$id[pd$token == "']'"])
+
+  # Extract symbols between brackets
+  pd$text[pd$token == "SYMBOL" & pd$id > start_id & pd$id < end_id]
+}
 #'
 #' Changes status of the function call from dependent to dependency if occurs in the lhs.
 #' Technically, it means to move function names after the dependency operator.