Fix linters where strange comments produce garbled metadata (#2897)

MichaelChirico · web-flow · commit 38db1acd5460 · 2025-07-26T12:31:34.000-07:00
* Improve robustness to comments

* Use strip_comments_from_subtree() in ifelse_censor_linter

* Fix linter logic producing garbled metadata

* Also add the new NAMESPACE entry, importFrom(xml2,xml_parent)
diff --git a/NAMESPACE b/NAMESPACE
@@ -196,4 +196,5 @@ importFrom(xml2,xml_find_first)
 importFrom(xml2,xml_find_lgl)
 importFrom(xml2,xml_find_num)
 importFrom(xml2,xml_name)
+importFrom(xml2,xml_parent)
 importFrom(xml2,xml_text)
diff --git a/R/comparison_negation_linter.R b/R/comparison_negation_linter.R
@@ -65,13 +65,13 @@ comparison_negation_linter <- function() {
 
     bad_expr <- xml_find_all(xml, xpath)
 
-    comparator_node <- xml_find_first(bad_expr, "expr/expr/*[2]")
+    comparator_node <- xml_find_first(bad_expr, "expr/expr/*[not(self::COMMENT)][2]")
     comparator_name <- xml_name(comparator_node)
 
     # "typical" case is assumed to be !(x == y), so try that first, and back
     #   up to the less nested case. there may be a cleaner way to do this...
     unnested <- !comparator_name %in% names(comparator_inverses)
-    comparator_node[unnested] <- xml_find_first(bad_expr[unnested], "expr/*[2]")
+    comparator_node[unnested] <- xml_find_first(bad_expr[unnested], "expr/*[not(self::COMMENT)][2]")
     comparator_name[unnested] <- xml_name(comparator_node[unnested])
 
     comparator_text <- xml_text(comparator_node)
diff --git a/R/ifelse_censor_linter.R b/R/ifelse_censor_linter.R
@@ -36,20 +36,19 @@
 #' @export
 ifelse_censor_linter <- function() {
   xpath <- glue("
-  following-sibling::expr[
+  self::*[expr[
     (LT or GT or LE or GE)
     and expr[1] = following-sibling::expr
     and expr[2] = following-sibling::expr
-  ]
-    /parent::expr
-  ")
+  ]]")
 
   Linter(linter_level = "expression", function(source_expression) {
-    ifelse_calls <- source_expression$xml_find_function_calls(ifelse_funs)
+    ifelse_calls <- xml_parent(source_expression$xml_find_function_calls(ifelse_funs))
+    ifelse_calls <- strip_comments_from_subtree(ifelse_calls)
     bad_expr <- xml_find_all(ifelse_calls, xpath)
 
     matched_call <- xp_call_name(bad_expr)
-    operator <- xml_find_chr(bad_expr, "string(expr[2]/*[2])")
+    operator <- xml_find_chr(bad_expr, "string(expr[2]/*[not(self::COMMENT)][2])")
     match_first <- !is.na(xml_find_first(bad_expr, "expr[2][expr[1] = following-sibling::expr[1]]"))
     optimizer <- ifelse((operator %in% c("<", "<=")) == match_first, "pmin", "pmax")
     first_var <- rep_len("x", length(match_first))
diff --git a/R/lintr-package.R b/R/lintr-package.R
@@ -15,7 +15,8 @@
 #' @importFrom tools R_user_dir
 #' @importFrom utils capture.output getParseData  globalVariables head relist tail
 #' @importFrom xml2 as_list
-#'   xml_attr xml_children xml_find_all xml_find_chr xml_find_lgl xml_find_num xml_find_first xml_name xml_text
+#'   xml_attr xml_children xml_find_all xml_find_chr xml_find_lgl xml_find_num
+#'   xml_find_first xml_name xml_parent xml_text
 ## lintr namespace: end
 NULL
 
diff --git a/R/nzchar_linter.R b/R/nzchar_linter.R
@@ -112,8 +112,9 @@ nzchar_linter <- function() {
   #   its "opposite" (not inverse) if the bad usage is on the RHS,
   #   e.g. 0 < nchar(x) has to be treated as nchar(x) > 0.
   op_for_msg <- function(expr, const) {
-    op <- xml_name(xml_find_first(expr, "*[2]"))
-    maybe_needs_flip <- !is.na(xml_find_first(expr, sprintf("*[1][%s]", const)))
+    op <- xml_name(xml_find_first(expr, "*[not(self::COMMENT)][2]"))
+    maybe_needs_flip <-
+      !is.na(xml_find_first(expr, sprintf("*[not(self::COMMENT)][1][%s]", const)))
 
     ordered_ops <- c("GT", "GE", "LE", "LT")
     ordered_idx <- match(op, ordered_ops)
diff --git a/R/vector_logic_linter.R b/R/vector_logic_linter.R
@@ -77,7 +77,7 @@ vector_logic_linter <- function() {
       and preceding-sibling::*[
         self::IF
         or self::WHILE
-        or self::expr[SYMBOL_FUNCTION_CALL[text() = 'expect_true' or text() = 'expect_false']]
+        or self::expr/SYMBOL_FUNCTION_CALL[text() = 'expect_true' or text() = 'expect_false']
       ]
     ]
     and not(ancestor::expr[
@@ -100,7 +100,7 @@ vector_logic_linter <- function() {
       and not(preceding-sibling::OP-LEFT-BRACKET)
       and not(preceding-sibling::*[not(self::COMMENT)][2][self::SYMBOL_SUB and text() = 'circular'])
     ]
-    /*[2]
+    /*[not(self::COMMENT)][2]
   "
 
   Linter(linter_level = "expression", function(source_expression) {
diff --git a/tests/testthat/test-comparison_negation_linter.R b/tests/testthat/test-comparison_negation_linter.R
@@ -2,16 +2,16 @@ test_that("comparison_negation_linter skips allowed usages", {
   linter <- comparison_negation_linter()
 
   # doesn't apply to joint statements
-  expect_lint("!(x == y | y == z)", NULL, linter)
+  expect_no_lint("!(x == y | y == z)", linter)
   # don't force de Morgan's laws
-  expect_lint("!(x & y)", NULL, linter)
+  expect_no_lint("!(x & y)", linter)
 
   # naive xpath will include !foo(x) cases
-  expect_lint("!any(x > y)", NULL, linter)
+  expect_no_lint("!any(x > y)", linter)
   # ditto for tidyeval cases
-  expect_lint("!!target == 1 ~ 'target'", NULL, linter)
+  expect_no_lint("!!target == 1 ~ 'target'", linter)
   # ditto for !x[f == g]
-  expect_lint("!passes.test[stage == 1]", NULL, linter)
+  expect_no_lint("!passes.test[stage == 1]", linter)
 })
 
 local({
@@ -61,3 +61,14 @@ test_that("Lints vectorize", {
     comparison_negation_linter()
   )
 })
+
+test_that("logic survives adversarial comments", {
+  expect_lint(
+    trim_some("
+      !(x #
+      > y)
+    "),
+    rex::rex("Use x <= y, not !(x > y)"),
+    comparison_negation_linter()
+  )
+})
diff --git a/tests/testthat/test-ifelse_censor_linter.R b/tests/testthat/test-ifelse_censor_linter.R
@@ -1,8 +1,8 @@
 test_that("ifelse_censor_linter skips allowed usages", {
   linter <- ifelse_censor_linter()
 
-  expect_lint("ifelse(x == 2, x, y)", NULL, linter)
-  expect_lint("ifelse(x > 2, x, y)", NULL, linter)
+  expect_no_lint("ifelse(x == 2, x, y)", linter)
+  expect_no_lint("ifelse(x > 2, x, y)", linter)
 })
 
 test_that("ifelse_censor_linter blocks simple disallowed usages", {
@@ -56,13 +56,30 @@ test_that("ifelse_censor_linter blocks simple disallowed usages", {
   )
 
   # more complicated expression still matches
-  lines <- trim_some("
-    ifelse(2 + p + 104 + 1 > ncols,
-           ncols, 2 + p + 104 + 1
-           )
-  ")
   expect_lint(
-    lines,
+    trim_some("
+      ifelse(2 + p + 104 + 1 > ncols,
+            ncols, 2 + p + 104 + 1
+            )
+    "),
+    rex::rex("pmin(x, y) is preferable to ifelse(x > y, y, x)"),
+    linter
+  )
+
+  # including with comments
+  expect_lint(
+    trim_some("
+      ifelse(2 + p + 104 + 1 #comment
+      > ncols, ncols, 2 + p + 104 + 1)
+    "),
+    rex::rex("pmin(x, y) is preferable to ifelse(x > y, y, x)"),
+    linter
+  )
+  expect_lint(
+    trim_some("
+      ifelse(2 + p + 104 + # comment
+      1 > ncols, ncols, 2 + p + 104 + 1)
+    "),
     rex::rex("pmin(x, y) is preferable to ifelse(x > y, y, x)"),
     linter
   )
diff --git a/tests/testthat/test-nzchar_linter.R b/tests/testthat/test-nzchar_linter.R
@@ -33,13 +33,22 @@ test_that("nzchar_linter skips as appropriate for other nchar args", {
 
 test_that("nzchar_linter blocks simple disallowed usages", {
   linter <- nzchar_linter()
-  lint_msg_quote <- rex::rex('Use !nzchar(x) instead of x == ""')
-  lint_msg_nchar <- rex::rex("Use nzchar() instead of comparing nchar(x) to 0")
+  lint_msg <- rex::rex("Use !nzchar(x) instead of nchar(x) == 0")
 
-  expect_lint("which(x == '')", lint_msg_quote, linter)
+  expect_lint("which(x == '')", rex::rex('Use !nzchar(x) instead of x == ""'), linter)
   expect_lint("any(nchar(x) >= 0)", rex::rex("nchar(x) >= 0 is always true, maybe you want nzchar(x)?"), linter)
-  expect_lint("all(nchar(x) == 0L)", rex::rex("Use !nzchar(x) instead of nchar(x) == 0"), linter)
+  expect_lint("all(nchar(x) == 0L)", lint_msg, linter)
   expect_lint("sum(0.0 < nchar(x))", rex::rex("Use nzchar(x) instead of nchar(x) > 0"), linter)
+
+  # adversarial comment
+  expect_lint(
+    trim_some("
+      all(nchar(x) #comment
+      == 0L)
+    "),
+    lint_msg,
+    linter
+  )
 })
 
 test_that("nzchar_linter skips comparison to '' in if/while statements", {
diff --git a/tests/testthat/test-vector_logic_linter.R b/tests/testthat/test-vector_logic_linter.R
@@ -1,30 +1,31 @@
 test_that("vector_logic_linter skips allowed usages", {
   linter <- vector_logic_linter()
 
-  expect_lint("if (TRUE) 5 else if (TRUE) 2", NULL, linter)
-  expect_lint("if (TRUE || FALSE) 1; while (TRUE && FALSE) 2", NULL, linter)
+  expect_no_lint("if (TRUE) 5 else if (TRUE) 2", linter)
+  expect_no_lint("if (TRUE || FALSE) 1; while (TRUE && FALSE) 2", linter)
 
   # function calls and extractions may aggregate to scalars -- only catch
   #   usages at the highest logical level
-  expect_lint("if (agg_function(x & y)) 1", NULL, linter)
-  expect_lint("if (DT[x | y, cond]) 1", NULL, linter)
+  expect_no_lint("if (agg_function(x & y)) 1", linter)
+  expect_no_lint("if (DT[x | y, cond]) 1", linter)
 
   # don't match potentially OK usages nested within calls
-  expect_lint("if (TRUE && any(TRUE | FALSE)) 4", NULL, linter)
+  expect_no_lint("if (TRUE && any(TRUE | FALSE)) 4", linter)
   # even if the usage is nested in those calls (b/181915948)
-  expect_lint("if (TRUE && any(TRUE | FALSE | TRUE)) 4", NULL, linter)
+  expect_no_lint("if (TRUE && any(TRUE | FALSE | TRUE)) 4", linter)
 
   # don't match potentially OK usages in the branch itself
-  lines <- trim_some("
-    if (TRUE) {
-      x | y
-    }
-  ")
-  expect_lint(lines, NULL, linter)
-
+  expect_no_lint(
+    trim_some("
+      if (TRUE) {
+        x | y
+      }
+    "),
+    linter
+  )
 
   # valid nested usage within aggregator
-  expect_lint("testthat::expect_false(any(TRUE | TRUE))", NULL, linter)
+  expect_no_lint("testthat::expect_false(any(TRUE | TRUE))", linter)
 })
 
 test_that("vector_logic_linter blocks simple disallowed usages", {
@@ -63,41 +64,40 @@ test_that("vector_logic_linter catches usages in expect_true()/expect_false()",
 })
 
 test_that("vector_logic_linter doesn't get mixed up from complex usage", {
-  expect_lint(
+  expect_no_lint(
     trim_some("
       if (a) {
         expect_true(ok)
         x <- 2
         a | b
       }
     "),
-    NULL,
     vector_logic_linter()
   )
 })
 
 test_that("vector_logic_linter recognizes some false positves around bitwise &/|", {
   linter <- vector_logic_linter()
 
-  expect_lint("if (info & as.raw(12)) { }", NULL, linter)
-  expect_lint("if (as.raw(12) & info) { }", NULL, linter)
-  expect_lint("if (info | as.raw(12)) { }", NULL, linter)
-  expect_lint("if (info & as.octmode('100')) { }", NULL, linter)
-  expect_lint("if (info | as.octmode('011')) { }", NULL, linter)
-  expect_lint("if (info & as.hexmode('100')) { }", NULL, linter)
-  expect_lint("if (info | as.hexmode('011')) { }", NULL, linter)
+  expect_no_lint("if (info & as.raw(12)) { }", linter)
+  expect_no_lint("if (as.raw(12) & info) { }", linter)
+  expect_no_lint("if (info | as.raw(12)) { }", linter)
+  expect_no_lint("if (info & as.octmode('100')) { }", linter)
+  expect_no_lint("if (info | as.octmode('011')) { }", linter)
+  expect_no_lint("if (info & as.hexmode('100')) { }", linter)
+  expect_no_lint("if (info | as.hexmode('011')) { }", linter)
   # implicit as.octmode() coercion
-  expect_lint("if (info & '100') { }", NULL, linter)
-  expect_lint("if (info | '011') { }", NULL, linter)
-  expect_lint("if ('011' | info) { }", NULL, linter)
+  expect_no_lint("if (info & '100') { }", linter)
+  expect_no_lint("if (info | '011') { }", linter)
+  expect_no_lint("if ('011' | info) { }", linter)
 
   # further nesting
-  expect_lint("if ((info & as.raw(12)) == as.raw(12)) { }", NULL, linter)
-  expect_lint("if ((info | as.raw(12)) == as.raw(12)) { }", NULL, linter)
-  expect_lint('if ((mode & "111") != as.octmode("111")) { }', NULL, linter)
-  expect_lint('if ((mode | "111") != as.octmode("111")) { }', NULL, linter)
-  expect_lint('if ((mode & "111") != as.hexmode("111")) { }', NULL, linter)
-  expect_lint('if ((mode | "111") != as.hexmode("111")) { }', NULL, linter)
+  expect_no_lint("if ((info & as.raw(12)) == as.raw(12)) { }", linter)
+  expect_no_lint("if ((info | as.raw(12)) == as.raw(12)) { }", linter)
+  expect_no_lint('if ((mode & "111") != as.octmode("111")) { }', linter)
+  expect_no_lint('if ((mode | "111") != as.octmode("111")) { }', linter)
+  expect_no_lint('if ((mode & "111") != as.hexmode("111")) { }', linter)
+  expect_no_lint('if ((mode | "111") != as.hexmode("111")) { }', linter)
 })
 
 test_that("incorrect subset/filter usage is caught", {
@@ -128,46 +128,62 @@ test_that("subsetting logic handles nesting", {
   expect_lint("filter(x, a & b || c)", or_msg, linter)
   expect_lint("filter(x, a && b | c)", and_msg, linter)
 
+  # adversarial commenting
+  expect_lint(
+    trim_some("
+      filter(x, a #comment
+      && b | c)
+    "),
+    and_msg,
+    linter
+  )
+
+  expect_lint(
+    trim_some("
+      filter(x, a && #comment
+      b | c)
+    "),
+    and_msg,
+    linter
+  )
+
   # but not valid usage
-  expect_lint("filter(x, y < mean(y, na.rm = AA && BB))", NULL, linter)
-  expect_lint("subset(x, y < mean(y, na.rm = AA && BB) & y > 0)", NULL, linter)
-  expect_lint("subset(x, y < x[y > 0, drop = AA && BB, y])", NULL, linter)
+  expect_no_lint("filter(x, y < mean(y, na.rm = AA && BB))", linter)
+  expect_no_lint("subset(x, y < mean(y, na.rm = AA && BB) & y > 0)", linter)
+  expect_no_lint("subset(x, y < x[y > 0, drop = AA && BB, y])", linter)
 })
 
 test_that("filter() handling is conservative about stats::filter()", {
   linter <- vector_logic_linter()
   and_msg <- rex::rex("Use `&` in subsetting expressions")
 
   # NB: this should be invalid, filter= is a vector argument
-  expect_lint("stats::filter(x, y && z)", NULL, linter)
+  expect_no_lint("stats::filter(x, y && z)", linter)
   # The only logical argument to stats::filter(), exclude by keyword
-  expect_lint("filter(x, circular = y && z)", NULL, linter)
+  expect_no_lint("filter(x, circular = y && z)", linter)
   # But presence of circular= doesn't invalidate lint
   expect_lint("filter(x, circular = TRUE, y && z)", and_msg, linter)
   expect_lint("filter(x, y && z, circular = TRUE)", and_msg, linter)
-  expect_lint(
+  expect_no_lint(
     trim_some("
       filter(x, circular # comment
       = y && z)
     "),
-    NULL,
     linter
   )
-  expect_lint(
+  expect_no_lint(
     trim_some("
       filter(x, circular = # comment
         y && z)
     "),
-    NULL,
     linter
   )
-  expect_lint(
+  expect_no_lint(
     trim_some("
       filter(x, circular # comment
       = # comment
       y && z)
     "),
-    NULL,
     linter
   )
 })

Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,7 @@ vector_logic_linter <- function() {`
`77`	`77`	`and preceding-sibling::*[`
`78`	`78`	`self::IF`
`79`	`79`	`or self::WHILE`
`80`		`- or self::expr[SYMBOL_FUNCTION_CALL[text() = 'expect_true' or text() = 'expect_false']]`
	`80`	`+ or self::expr/SYMBOL_FUNCTION_CALL[text() = 'expect_true' or text() = 'expect_false']`
`81`	`81`	`]`
`82`	`82`	`]`
`83`	`83`	`and not(ancestor::expr[`
`@@ -100,7 +100,7 @@ vector_logic_linter <- function() {`
`100`	`100`	`and not(preceding-sibling::OP-LEFT-BRACKET)`
`101`	`101`	`and not(preceding-sibling::*[not(self::COMMENT)][2][self::SYMBOL_SUB and text() = 'circular'])`
`102`	`102`	`]`
`103`		`- /*[2]`
	`103`	`+ /*[not(self::COMMENT)][2]`
`104`	`104`	`"`
`105`	`105`
`106`	`106`	`Linter(linter_level = "expression", function(source_expression) {`