Skip to content

Commit 601102d

Browse files
cor3ntin authored and AaronBallman committed
Cleanup identifier parsing; NFC
Rename methods to clearly signal when they only deal with ASCII, simplify the parsing of identifiers, and use start/continue instead of head/body for consistency with Unicode terminology.
1 parent 9bbc0c1 commit 601102d

File tree

24 files changed

+298
-309
lines changed

24 files changed

+298
-309
lines changed

clang-tools-extra/clang-include-fixer/IncludeFixer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ clang::TypoCorrection IncludeFixerSemaSource::CorrectTypo(
245245
// parent_path.
246246
// FIXME: Don't rely on source text.
247247
const char *End = Source.end();
248-
while (isIdentifierBody(*End) || *End == ':')
248+
while (isAsciiIdentifierContinue(*End) || *End == ':')
249249
++End;
250250

251251
return std::string(Source.begin(), End);

clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) {
129129
const StringRef Port = "unsigned short port";
130130
const char *Data = Result.SourceManager->getCharacterData(Loc);
131131
if (!std::strncmp(Data, Port.data(), Port.size()) &&
132-
!isIdentifierBody(Data[Port.size()]))
132+
!isAsciiIdentifierContinue(Data[Port.size()]))
133133
return;
134134

135135
std::string Replacement =

clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ void RenamerClangTidyCheck::check(const MatchFinder::MatchResult &Result) {
464464
Failure.FixStatus = ShouldFixStatus::ConflictsWithKeyword;
465465
else if (Ident->hasMacroDefinition())
466466
Failure.FixStatus = ShouldFixStatus::ConflictsWithMacroDefinition;
467-
} else if (!isValidIdentifier(Info.Fixup)) {
467+
} else if (!isValidAsciiIdentifier(Info.Fixup)) {
468468
Failure.FixStatus = ShouldFixStatus::FixInvalidIdentifier;
469469
}
470470

clang-tools-extra/clangd/CodeComplete.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,14 +1842,14 @@ CompletionPrefix guessCompletionPrefix(llvm::StringRef Content,
18421842
CompletionPrefix Result;
18431843

18441844
// Consume the unqualified name. We only handle ASCII characters.
1845-
// isIdentifierBody will let us match "0invalid", but we don't mind.
1846-
while (!Rest.empty() && isIdentifierBody(Rest.back()))
1845+
// isAsciiIdentifierContinue will let us match "0invalid", but we don't mind.
1846+
while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back()))
18471847
Rest = Rest.drop_back();
18481848
Result.Name = Content.slice(Rest.size(), Offset);
18491849

18501850
// Consume qualifiers.
18511851
while (Rest.consume_back("::") && !Rest.endswith(":")) // reject ::::
1852-
while (!Rest.empty() && isIdentifierBody(Rest.back()))
1852+
while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back()))
18531853
Rest = Rest.drop_back();
18541854
Result.Qualifier =
18551855
Content.slice(Rest.size(), Result.Name.begin() - Content.begin());
@@ -2057,8 +2057,8 @@ bool allowImplicitCompletion(llvm::StringRef Content, unsigned Offset) {
20572057
return true;
20582058

20592059
// Complete words. Give non-ascii characters the benefit of the doubt.
2060-
return !Content.empty() &&
2061-
(isIdentifierBody(Content.back()) || !llvm::isASCII(Content.back()));
2060+
return !Content.empty() && (isAsciiIdentifierContinue(Content.back()) ||
2061+
!llvm::isASCII(Content.back()));
20622062
}
20632063

20642064
} // namespace clangd

clang-tools-extra/clangd/SourceCode.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -945,9 +945,9 @@ llvm::Optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc,
945945
if (Invalid)
946946
return llvm::None;
947947
unsigned B = Offset, E = Offset;
948-
while (B > 0 && isIdentifierBody(Code[B - 1]))
948+
while (B > 0 && isAsciiIdentifierContinue(Code[B - 1]))
949949
--B;
950-
while (E < Code.size() && isIdentifierBody(Code[E]))
950+
while (E < Code.size() && isAsciiIdentifierContinue(Code[E]))
951951
++E;
952952
if (B == E)
953953
return llvm::None;

clang-tools-extra/clangd/refactor/Rename.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -478,10 +478,10 @@ static bool mayBeValidIdentifier(llvm::StringRef Ident) {
478478
// We don't check all the rules for non-ascii characters (most are allowed).
479479
bool AllowDollar = true; // lenient
480480
if (llvm::isASCII(Ident.front()) &&
481-
!isIdentifierHead(Ident.front(), AllowDollar))
481+
!isAsciiIdentifierStart(Ident.front(), AllowDollar))
482482
return false;
483483
for (char C : Ident) {
484-
if (llvm::isASCII(C) && !isIdentifierBody(C, AllowDollar))
484+
if (llvm::isASCII(C) && !isAsciiIdentifierContinue(C, AllowDollar))
485485
return false;
486486
}
487487
return true;

clang/include/clang/Basic/CharInfo.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ LLVM_READNONE inline bool isASCII(uint32_t c) { return c <= 127; }
5050

5151
/// Returns true if this is a valid first character of a C identifier,
5252
/// which is [a-zA-Z_].
53-
LLVM_READONLY inline bool isIdentifierHead(unsigned char c,
54-
bool AllowDollar = false) {
53+
LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c,
54+
bool AllowDollar = false) {
5555
using namespace charinfo;
5656
if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER))
5757
return true;
@@ -60,8 +60,8 @@ LLVM_READONLY inline bool isIdentifierHead(unsigned char c,
6060

6161
/// Returns true if this is a body character of a C identifier,
6262
/// which is [a-zA-Z0-9_].
63-
LLVM_READONLY inline bool isIdentifierBody(unsigned char c,
64-
bool AllowDollar = false) {
63+
LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c,
64+
bool AllowDollar = false) {
6565
using namespace charinfo;
6666
if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER))
6767
return true;
@@ -186,13 +186,13 @@ LLVM_READONLY inline char toUppercase(char c) {
186186
///
187187
/// Note that this is a very simple check; it does not accept UCNs as valid
188188
/// identifier characters.
189-
LLVM_READONLY inline bool isValidIdentifier(StringRef S,
190-
bool AllowDollar = false) {
191-
if (S.empty() || !isIdentifierHead(S[0], AllowDollar))
189+
LLVM_READONLY inline bool isValidAsciiIdentifier(StringRef S,
190+
bool AllowDollar = false) {
191+
if (S.empty() || !isAsciiIdentifierStart(S[0], AllowDollar))
192192
return false;
193193

194194
for (StringRef::iterator I = S.begin(), E = S.end(); I != E; ++I)
195-
if (!isIdentifierBody(*I, AllowDollar))
195+
if (!isAsciiIdentifierContinue(*I, AllowDollar))
196196
return false;
197197

198198
return true;

clang/include/clang/Lex/Lexer.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,8 @@ class Lexer : public PreprocessorLexer {
536536
bool SkipTrailingWhitespaceAndNewLine);
537537

538538
/// Returns true if the given character could appear in an identifier.
539-
static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
539+
static bool isAsciiIdentifierContinueChar(char c,
540+
const LangOptions &LangOpts);
540541

541542
/// Checks whether new line pointed by Str is preceded by escape
542543
/// sequence.
@@ -573,10 +574,7 @@ class Lexer : public PreprocessorLexer {
573574

574575
bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
575576

576-
/// Given that a token begins with the Unicode character \p C, figure out
577-
/// what kind of token it is and dispatch to the appropriate lexing helper
578-
/// function.
579-
bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
577+
bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
580578

581579
/// FormTokenWithChars - When we lex a token, we have identified a span
582580
/// starting at BufferPtr, going to TokEnd that forms the token. This method
@@ -701,7 +699,11 @@ class Lexer : public PreprocessorLexer {
701699
bool IsStringLiteral);
702700

703701
// Helper functions to lex the remainder of a token of the specific type.
704-
bool LexIdentifier (Token &Result, const char *CurPtr);
702+
703+
// This function handles both ASCII and Unicode identifiers after
704+
// the first codepoint of the identifier has been parsed.
705+
bool LexIdentifierContinue(Token &Result, const char *CurPtr);
706+
705707
bool LexNumericConstant (Token &Result, const char *CurPtr);
706708
bool LexStringLiteral (Token &Result, const char *CurPtr,
707709
tok::TokenKind Kind);

clang/lib/ARCMigrate/ObjCMT.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1144,7 +1144,7 @@ static bool AttributesMatch(const Decl *Decl1, const Decl *Decl2,
11441144

11451145
static bool IsValidIdentifier(ASTContext &Ctx,
11461146
const char *Name) {
1147-
if (!isIdentifierHead(Name[0]))
1147+
if (!isAsciiIdentifierStart(Name[0]))
11481148
return false;
11491149
std::string NameString = Name;
11501150
NameString[0] = toLowercase(NameString[0]);

clang/lib/ARCMigrate/TransUnbridgedCasts.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,8 @@ class UnbridgedCastRewriter : public RecursiveASTVisitor<UnbridgedCastRewriter>{
253253

254254
SourceManager &SM = Pass.Ctx.getSourceManager();
255255
char PrevChar = *SM.getCharacterData(InsertLoc.getLocWithOffset(-1));
256-
if (Lexer::isIdentifierBodyChar(PrevChar, Pass.Ctx.getLangOpts()))
256+
if (Lexer::isAsciiIdentifierContinueChar(PrevChar,
257+
Pass.Ctx.getLangOpts()))
257258
BridgeCall += ' ';
258259

259260
if (Kind == OBC_BridgeTransfer)

0 commit comments

Comments
 (0)