Avoid non-UTF8-encoded test files. (#5965)

zygoloid · web-flow · commit ad84e71acdd6 · 2025-08-20T21:07:58.000Z
Add a content keyword to file_test, `[[@0xab]]`, that expands to the code unit 0xAB, and use that instead of putting raw malformed code units in test files. Instead of printing the raw input bytes in snippets in diagnostics, replace each non-printable character with its byte value in uppercase hex inside angle brackets (for example, <AB>) in the output, being careful to still compute the location of the caret and underline properly.
diff --git a/.gitattributes b/.gitattributes
@@ -5,7 +5,3 @@
 # This tells Github to detect files having the extension `.def` as `C++` files, which
 # ensures that these files get syntax highlighted properly.
 *.def linguist-language=C++
-
-# This tells Git to treat lexer tests as text when producing diffs, even if
-# they contain non-printable characters.
-toolchain/lex/testdata/*.carbon diff
diff --git a/testing/file_test/test_file.cpp b/testing/file_test/test_file.cpp
@@ -145,7 +145,8 @@ static auto AutoFillDidOpenParams(llvm::json::Object& params,
   return Success();
 }
 
-// Reformats `[[@LSP:` and similar keyword as an LSP call with headers.
+// Reformats `[[@LSP:` and similar keywords as an LSP call with headers. Returns
+// the position to start a find for the next keyword.
 static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
                                 int& lsp_call_id,
                                 llvm::ArrayRef<TestFile::Split> splits)
@@ -154,7 +155,7 @@ static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
       llvm::StringRef(content).substr(keyword_pos);
 
   auto [keyword, body_start] = content_at_keyword.split(":");
-  if (body_start.empty()) {
+  if (keyword.size() == content_at_keyword.size()) {
     return ErrorBuilder() << "Missing `:` for `"
                           << content_at_keyword.take_front(10) << "`";
   }
@@ -179,12 +180,11 @@ static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
   }
 
   static constexpr llvm::StringLiteral LspEnd = "]]";
-  auto body_end = body_start.find(LspEnd);
-  if (body_end == std::string::npos) {
+  auto [body, rest] = body_start.split("]]");
+  if (body.size() == body_start.size()) {
     return ErrorBuilder() << "Missing `" << LspEnd << "` after `" << keyword
                           << "`";
   }
-  llvm::StringRef body = body_start.take_front(body_end);
   auto [method_or_id, extra_content] = body.split(":");
 
   llvm::json::Value parsed_extra_content = nullptr;
@@ -231,12 +231,33 @@ static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
   auto json_with_header = llvm::formatv("Content-Length: {0}\n\n{1}\n",
                                         content_length, buffer.TakeStr())
                               .str();
-  int keyword_len =
-      (body_start.data() + body_end + LspEnd.size()) - keyword.data();
+  size_t keyword_len = rest.data() - keyword.data();
   content.replace(keyword_pos, keyword_len, json_with_header);
   return keyword_pos + json_with_header.size();
 }
 
+// Replaces `[[@0xAB]]` with the raw byte with value 0xAB. Returns the position
+// to start a find for the next keyword, which is just past the inserted byte.
+//
+// `keyword_pos` is expected to be the offset in `content` at which the `[[@0x`
+// keyword begins. Returns an error if the closing `]]` is missing, or if the
+// text between `[[@0x` and `]]` cannot be parsed as a hexadecimal value into
+// an `unsigned char`.
+static auto ReplaceRawByteKeywordAt(std::string& content, size_t keyword_pos)
+    -> ErrorOr<size_t> {
+  static constexpr llvm::StringLiteral RawBytePrefix = "[[@0x";
+  static constexpr llvm::StringLiteral RawByteEnd = "]]";
+
+  llvm::StringRef content_at_keyword =
+      llvm::StringRef(content).substr(keyword_pos);
+  auto [keyword, rest] = content_at_keyword.split(RawByteEnd);
+  // When the separator is absent, `split` yields the whole input as the first
+  // element, so an unchanged size means there was no closing `]]`.
+  if (keyword.size() == content_at_keyword.size()) {
+    return ErrorBuilder() << "Missing `" << RawByteEnd << "` after `"
+                          << keyword.take_front(10) << "`";
+  }
+
+  // `getAsInteger` returns true on failure, so a true result here means the
+  // text after the prefix was not a usable hexadecimal byte value.
+  unsigned char byte_value;
+  if (keyword.substr(RawBytePrefix.size()).getAsInteger(16, byte_value)) {
+    return ErrorBuilder() << "Invalid raw byte specifier `"
+                          << keyword.take_front(10) << "`";
+  }
+
+  // `keyword` excludes the closing `]]`, so add its length to replace the
+  // whole `[[@0x..]]` span with a single byte.
+  content.replace(keyword_pos, keyword.size() + RawByteEnd.size(), 1,
+                  static_cast<char>(byte_value));
+  return keyword_pos + 1;
+}
+
 // Replaces the keyword at the given position. Returns the position to start a
 // find for the next keyword.
 static auto ReplaceContentKeywordAt(std::string& content, size_t keyword_pos,
@@ -263,14 +284,18 @@ static auto ReplaceContentKeywordAt(std::string& content, size_t keyword_pos,
     return ReplaceLspKeywordAt(content, keyword_pos, lsp_call_id, splits);
   }
 
+  if (keyword.starts_with("[[@0x")) {
+    return ReplaceRawByteKeywordAt(content, keyword_pos);
+  }
+
   return ErrorBuilder() << "Unexpected use of `[[@` at `"
                         << keyword.substr(0, 5) << "`";
 }
 
 // Replaces the content keywords.
 //
-// TEST_NAME is the only content keyword at present, but we do validate that
-// other names are reserved.
+// This handles content keywords such as [[@TEST_NAME]] and [[@LSP*]]. Unknown
+// content keywords are diagnosed.
 static auto ReplaceContentKeywords(llvm::StringRef filename,
                                    std::string& content,
                                    llvm::ArrayRef<TestFile::Split> splits)
diff --git a/testing/file_test/testdata/replace_content.carbon b/testing/file_test/testdata/replace_content.carbon
@@ -11,3 +11,6 @@
 
 library "[[@TEST_NAME]]";
 // CHECK:STDOUT: replace_content.carbon:[[@LINE-1]]: library "replace_content";
+
+var x: str = "[[@0x48]][[@0x65]][[@0x6C]][[@0x6C]][[@0x6F]]";
+// CHECK:STDOUT: replace_content.carbon:[[@LINE-1]]: var x: str = "Hello";
diff --git a/toolchain/diagnostics/diagnostic.cpp b/toolchain/diagnostics/diagnostic.cpp
@@ -7,6 +7,8 @@
 #include <algorithm>
 #include <cstdint>
 
+#include "llvm/ADT/Sequence.h"
+
 namespace Carbon::Diagnostics {
 
 auto Loc::FormatLocation(llvm::raw_ostream& out) const -> void {
@@ -38,22 +40,51 @@ auto Loc::FormatSnippet(llvm::raw_ostream& out, int indent) const -> void {
   if (column_number == -1) {
     return;
   }
-
   // column_number is 1-based.
-  int32_t column = column_number - 1;
+  const int caret_byte_offset = column_number - 1;
 
   out.indent(indent);
-  out << line << "\n";
 
-  out.indent(indent + column);
+  int column = 0;
+  int caret_column = 0;
+  int underline_end_column = 0;
+
+  int byte_offset = 0;
+  for (char c : line) {
+    // TODO: Handle tab characters.
+    // TODO: Print Unicode characters directly, and use
+    // llvm::sys::unicode::getColumnWidth to determine their width.
+    if (std::isprint(static_cast<unsigned char>(c))) {
+      out << c;
+      ++column;
+    } else {
+      // TODO: Consider using ANSI colors to distinguish this from the program
+      // text.
+      int pos = out.tell();
+      out << '<';
+      llvm::write_hex(out, static_cast<unsigned char>(c),
+                      llvm::HexPrintStyle::Upper, 2);
+      out << '>';
+      column += out.tell() - pos;
+    }
+
+    ++byte_offset;
+    if (byte_offset <= caret_byte_offset) {
+      caret_column = column;
+    }
+    if (byte_offset <= caret_byte_offset + length) {
+      underline_end_column = column;
+    }
+  }
+
+  out << "\n";
+
+  out.indent(indent + caret_column);
   out << "^";
-  // We want to ensure that we don't underline past the end of the line in
-  // case of a multiline token.
-  // TODO: Revisit this once we can reference multiple ranges on multiple
-  // lines in a single diagnostic message.
-  int underline_length =
-      std::min(length, static_cast<int32_t>(line.size()) - column);
-  for (int i = 1; i < underline_length; ++i) {
+  // TODO: Revisit this once we can reference multiple ranges in a single
+  // diagnostic message.
+  for (auto _ :
+       llvm::seq(std::max(underline_end_column - caret_column - 1, 0))) {
     out << '~';
   }
   out << '\n';
diff --git a/toolchain/lex/testdata/char_literals.carbon b/toolchain/lex/testdata/char_literals.carbon
@@ -105,8 +105,8 @@
 
 // This literal contains a raw tab character.
 // CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:2: error: whitespace other than plain space must be expressed with an escape sequence in a string literal [InvalidHorizontalWhitespaceInString]
-// CHECK:STDERR: '{{\t}}'
-// CHECK:STDERR:  ^
+// CHECK:STDERR: '<09>'
+// CHECK:STDERR:  ^~~~
 // CHECK:STDERR:
 '	'
 // CHECK:STDOUT:   - { index: 11, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\t'", has_leading_space: true }
diff --git a/toolchain/lex/testdata/fail_bad_raw_identifier.carbon b/toolchain/lex/testdata/fail_bad_raw_identifier.carbon
@@ -34,7 +34,7 @@ r#3
 
 // Non ascii start to identifier.
 // CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: error: encountered unrecognized characters while parsing [UnrecognizedCharacters]
-// CHECK:STDERR: r#á
+// CHECK:STDERR: r#<C3><A1>
 // CHECK:STDERR:  ^
 // CHECK:STDERR:
 r#á
diff --git a/toolchain/lex/testdata/fail_char_literals_bad_encoding.carbon b/toolchain/lex/testdata/fail_char_literals_bad_encoding.carbon
diff --git a/toolchain/lex/testdata/string_literals.carbon b/toolchain/lex/testdata/string_literals.carbon
@@ -69,8 +69,8 @@
 // CHECK:STDOUT:   tokens:
 
 // CHECK:STDERR: fail_literal_tab_in_string.carbon:[[@LINE+4]]:2: error: whitespace other than plain space must be expressed with an escape sequence in a string literal [InvalidHorizontalWhitespaceInString]
-// CHECK:STDERR: "{{\t}}"
-// CHECK:STDERR:  ^
+// CHECK:STDERR: "<09>"
+// CHECK:STDERR:  ^~~~
 // CHECK:STDERR:
 "	"
 // CHECK:STDOUT:   - { index: 1, kind: "StringLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "\"\t\"", value: "\t", has_leading_space: true }