Skip to content

Commit ad84e71

Browse files
authored
Avoid non-UTF8-encoded test files. (#5965)
Add a content keyword to file_test, `[[@0xab]]`, that expands to the code unit 0xAB, and use that instead of putting raw malformed code units in test files. Instead of printing the raw input bytes in snippets in diagnostics, replace non-printable characters with <AB> in the output, being careful to still compute the location of the caret and underscore properly.
1 parent 2352e93 commit ad84e71

File tree

8 files changed

+84
-29
lines changed

8 files changed

+84
-29
lines changed

.gitattributes

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,3 @@
55
# This tells Github to detect files having the extension `.def` as `C++` files, which
66
# ensures that these files get syntax highlighted properly.
77
*.def linguist-language=C++
8-
9-
# This tells Git to treat lexer tests as text when producing diffs, even if
10-
# they contain non-printable characters.
11-
toolchain/lex/testdata/*.carbon diff

testing/file_test/test_file.cpp

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,8 @@ static auto AutoFillDidOpenParams(llvm::json::Object& params,
145145
return Success();
146146
}
147147

148-
// Reformats `[[@LSP:` and similar keyword as an LSP call with headers.
148+
// Reformats `[[@LSP:` and similar keyword as an LSP call with headers. Returns
149+
// the position to start a find for the next keyword.
149150
static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
150151
int& lsp_call_id,
151152
llvm::ArrayRef<TestFile::Split> splits)
@@ -154,7 +155,7 @@ static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
154155
llvm::StringRef(content).substr(keyword_pos);
155156

156157
auto [keyword, body_start] = content_at_keyword.split(":");
157-
if (body_start.empty()) {
158+
if (keyword.size() == content_at_keyword.size()) {
158159
return ErrorBuilder() << "Missing `:` for `"
159160
<< content_at_keyword.take_front(10) << "`";
160161
}
@@ -179,12 +180,11 @@ static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
179180
}
180181

181182
static constexpr llvm::StringLiteral LspEnd = "]]";
182-
auto body_end = body_start.find(LspEnd);
183-
if (body_end == std::string::npos) {
183+
auto [body, rest] = body_start.split("]]");
184+
if (body.size() == body_start.size()) {
184185
return ErrorBuilder() << "Missing `" << LspEnd << "` after `" << keyword
185186
<< "`";
186187
}
187-
llvm::StringRef body = body_start.take_front(body_end);
188188
auto [method_or_id, extra_content] = body.split(":");
189189

190190
llvm::json::Value parsed_extra_content = nullptr;
@@ -231,12 +231,33 @@ static auto ReplaceLspKeywordAt(std::string& content, size_t keyword_pos,
231231
auto json_with_header = llvm::formatv("Content-Length: {0}\n\n{1}\n",
232232
content_length, buffer.TakeStr())
233233
.str();
234-
int keyword_len =
235-
(body_start.data() + body_end + LspEnd.size()) - keyword.data();
234+
size_t keyword_len = rest.data() - keyword.data();
236235
content.replace(keyword_pos, keyword_len, json_with_header);
237236
return keyword_pos + json_with_header.size();
238237
}
239238

239+
// Replaces `[[@0xAB]]` with the raw byte with value 0xAB. Returns the position
240+
// to start a find for the next keyword.
241+
static auto ReplaceRawByteKeywordAt(std::string& content, size_t keyword_pos)
242+
-> ErrorOr<size_t> {
243+
llvm::StringRef content_at_keyword =
244+
llvm::StringRef(content).substr(keyword_pos);
245+
auto [keyword, rest] = content_at_keyword.split("]]");
246+
if (keyword.size() == content_at_keyword.size()) {
247+
return ErrorBuilder() << "Missing `]]` after " << keyword.take_front(10)
248+
<< "`";
249+
}
250+
251+
unsigned char byte_value;
252+
if (keyword.substr(std::size("[[@0x") - 1).getAsInteger(16, byte_value)) {
253+
return ErrorBuilder() << "Invalid raw byte specifier `"
254+
<< keyword.take_front(10) << "`";
255+
}
256+
257+
content.replace(keyword_pos, keyword.size() + 2, 1, byte_value);
258+
return keyword_pos + 1;
259+
}
260+
240261
// Replaces the keyword at the given position. Returns the position to start a
241262
// find for the next keyword.
242263
static auto ReplaceContentKeywordAt(std::string& content, size_t keyword_pos,
@@ -263,14 +284,18 @@ static auto ReplaceContentKeywordAt(std::string& content, size_t keyword_pos,
263284
return ReplaceLspKeywordAt(content, keyword_pos, lsp_call_id, splits);
264285
}
265286

287+
if (keyword.starts_with("[[@0x")) {
288+
return ReplaceRawByteKeywordAt(content, keyword_pos);
289+
}
290+
266291
return ErrorBuilder() << "Unexpected use of `[[@` at `"
267292
<< keyword.substr(0, 5) << "`";
268293
}
269294

270295
// Replaces the content keywords.
271296
//
272-
// TEST_NAME is the only content keyword at present, but we do validate that
273-
// other names are reserved.
297+
// This handles content keywords such as [[@TEST_NAME]] and [[@LSP*]]. Unknown
298+
// content keywords are diagnosed.
274299
static auto ReplaceContentKeywords(llvm::StringRef filename,
275300
std::string& content,
276301
llvm::ArrayRef<TestFile::Split> splits)

testing/file_test/testdata/replace_content.carbon

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@
1111

1212
library "[[@TEST_NAME]]";
1313
// CHECK:STDOUT: replace_content.carbon:[[@LINE-1]]: library "replace_content";
14+
15+
var x: str = "[[@0x48]][[@0x65]][[@0x6C]][[@0x6C]][[@0x6F]]";
16+
// CHECK:STDOUT: replace_content.carbon:[[@LINE-1]]: var x: str = "Hello";

toolchain/diagnostics/diagnostic.cpp

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include <algorithm>
88
#include <cstdint>
99

10+
#include "llvm/ADT/Sequence.h"
11+
1012
namespace Carbon::Diagnostics {
1113

1214
auto Loc::FormatLocation(llvm::raw_ostream& out) const -> void {
@@ -38,22 +40,51 @@ auto Loc::FormatSnippet(llvm::raw_ostream& out, int indent) const -> void {
3840
if (column_number == -1) {
3941
return;
4042
}
41-
4243
// column_number is 1-based.
43-
int32_t column = column_number - 1;
44+
const int caret_byte_offset = column_number - 1;
4445

4546
out.indent(indent);
46-
out << line << "\n";
4747

48-
out.indent(indent + column);
48+
int column = 0;
49+
int caret_column = 0;
50+
int underline_end_column = 0;
51+
52+
int byte_offset = 0;
53+
for (char c : line) {
54+
// TODO: Handle tab characters.
55+
// TODO: Print Unicode characters directly, and use
56+
// llvm::sys::unicode::getColumnWidth to determine their width.
57+
if (std::isprint(static_cast<unsigned char>(c))) {
58+
out << c;
59+
++column;
60+
} else {
61+
// TODO: Consider using ANSI colors to distinguish this from the program
62+
// text.
63+
int pos = out.tell();
64+
out << '<';
65+
llvm::write_hex(out, static_cast<unsigned char>(c),
66+
llvm::HexPrintStyle::Upper, 2);
67+
out << '>';
68+
column += out.tell() - pos;
69+
}
70+
71+
++byte_offset;
72+
if (byte_offset <= caret_byte_offset) {
73+
caret_column = column;
74+
}
75+
if (byte_offset <= caret_byte_offset + length) {
76+
underline_end_column = column;
77+
}
78+
}
79+
80+
out << "\n";
81+
82+
out.indent(indent + caret_column);
4983
out << "^";
50-
// We want to ensure that we don't underline past the end of the line in
51-
// case of a multiline token.
52-
// TODO: Revisit this once we can reference multiple ranges on multiple
53-
// lines in a single diagnostic message.
54-
int underline_length =
55-
std::min(length, static_cast<int32_t>(line.size()) - column);
56-
for (int i = 1; i < underline_length; ++i) {
84+
// TODO: Revisit this once we can reference multiple ranges in a single
85+
// diagnostic message.
86+
for (auto _ :
87+
llvm::seq(std::max(underline_end_column - caret_column - 1, 0))) {
5788
out << '~';
5889
}
5990
out << '\n';

toolchain/lex/testdata/char_literals.carbon

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,8 @@
105105
106106
// This literal contains a raw tab character.
107107
// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:2: error: whitespace other than plain space must be expressed with an escape sequence in a string literal [InvalidHorizontalWhitespaceInString]
108-
// CHECK:STDERR: '{{\t}}'
109-
// CHECK:STDERR: ^
108+
// CHECK:STDERR: '<09>'
109+
// CHECK:STDERR: ^~~~
110110
// CHECK:STDERR:
111111
' '
112112
// CHECK:STDOUT: - { index: 11, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\t'", has_leading_space: true }

toolchain/lex/testdata/fail_bad_raw_identifier.carbon

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ r#3
3434

3535
// Non ascii start to identifier.
3636
// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: error: encountered unrecognized characters while parsing [UnrecognizedCharacters]
37-
// CHECK:STDERR: r#á
37+
// CHECK:STDERR: r#<C3><A1>
3838
// CHECK:STDERR: ^
3939
// CHECK:STDERR:
4040
r#á
-103 Bytes
Binary file not shown.

toolchain/lex/testdata/string_literals.carbon

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@
6969
// CHECK:STDOUT: tokens:
7070

7171
// CHECK:STDERR: fail_literal_tab_in_string.carbon:[[@LINE+4]]:2: error: whitespace other than plain space must be expressed with an escape sequence in a string literal [InvalidHorizontalWhitespaceInString]
72-
// CHECK:STDERR: "{{\t}}"
73-
// CHECK:STDERR: ^
72+
// CHECK:STDERR: "<09>"
73+
// CHECK:STDERR: ^~~~
7474
// CHECK:STDERR:
7575
" "
7676
// CHECK:STDOUT: - { index: 1, kind: "StringLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "\"\t\"", value: "\t", has_leading_space: true }

0 commit comments

Comments
 (0)