Skip to content

Commit 71d3190

Browse files
authored
Merge pull request swiftlang#15043 from omochi/lex-init
[Parse] Refactor Lexer initialization
2 parents e39a33a + 967a48c commit 71d3190

File tree

3 files changed

+141
-67
lines changed

3 files changed

+141
-67
lines changed

include/swift/Parse/Lexer.h

Lines changed: 17 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ enum class ConflictMarkerKind {
6464
class Lexer {
6565
const LangOptions &LangOpts;
6666
const SourceManager &SourceMgr;
67-
DiagnosticEngine *Diags;
6867
const unsigned BufferID;
68+
DiagnosticEngine *Diags;
6969

7070
using State = LexerState;
7171

@@ -137,20 +137,18 @@ class Lexer {
137137
Lexer(const Lexer&) = delete;
138138
void operator=(const Lexer&) = delete;
139139

140+
struct PrincipalTag {};
141+
140142
/// The principal constructor used by public constructors below.
141143
/// Don't use this constructor for other purposes; it does not initialize
142144
/// everything.
143-
Lexer(const LangOptions &Options,
144-
const SourceManager &SourceMgr, DiagnosticEngine *Diags,
145-
unsigned BufferID, bool InSILMode,
145+
Lexer(const PrincipalTag &, const LangOptions &LangOpts,
146+
const SourceManager &SourceMgr, unsigned BufferID,
147+
DiagnosticEngine *Diags, bool InSILMode,
146148
CommentRetentionMode RetainComments,
147149
TriviaRetentionMode TriviaRetention);
148150

149-
/// @{
150-
/// Helper routines used in \c Lexer constructors.
151-
void primeLexer();
152-
void initSubLexer(Lexer &Parent, State BeginState, State EndState);
153-
/// @}
151+
void initialize(unsigned Offset, unsigned EndOffset);
154152

155153
public:
156154
/// \brief Create a normal lexer that scans the whole source buffer.
@@ -166,44 +164,26 @@ class Lexer {
166164
/// means that APIs like getLocForEndOfToken really ought to take
167165
/// this flag; it's just that we don't care that much about fidelity
168166
/// when parsing SIL files.
169-
Lexer(const LangOptions &Options,
170-
const SourceManager &SourceMgr, unsigned BufferID,
171-
DiagnosticEngine *Diags, bool InSILMode,
172-
CommentRetentionMode RetainComments = CommentRetentionMode::None,
173-
TriviaRetentionMode TriviaRetention = TriviaRetentionMode::WithoutTrivia)
174-
: Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments,
175-
TriviaRetention) {
176-
primeLexer();
177-
}
167+
Lexer(
168+
const LangOptions &Options, const SourceManager &SourceMgr,
169+
unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
170+
CommentRetentionMode RetainComments = CommentRetentionMode::None,
171+
TriviaRetentionMode TriviaRetention = TriviaRetentionMode::WithoutTrivia);
178172

179173
/// \brief Create a lexer that scans a subrange of the source buffer.
180-
Lexer(const LangOptions &Options,
181-
const SourceManager &SourceMgr, unsigned BufferID,
182-
DiagnosticEngine *Diags, bool InSILMode,
174+
Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
175+
unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
183176
CommentRetentionMode RetainComments,
184-
TriviaRetentionMode TriviaRetention,
185-
unsigned Offset, unsigned EndOffset)
186-
: Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments,
187-
TriviaRetention) {
188-
assert(Offset <= EndOffset && "invalid range");
189-
initSubLexer(
190-
*this,
191-
State(getLocForStartOfBuffer().getAdvancedLoc(Offset)),
192-
State(getLocForStartOfBuffer().getAdvancedLoc(EndOffset)));
193-
}
177+
TriviaRetentionMode TriviaRetention, unsigned Offset,
178+
unsigned EndOffset);
194179

195180
/// \brief Create a sub-lexer that lexes from the same buffer, but scans
196181
/// a subrange of the buffer.
197182
///
198183
/// \param Parent the parent lexer that scans the whole buffer
199184
/// \param BeginState start of the subrange
200185
/// \param EndState end of the subrange
201-
Lexer(Lexer &Parent, State BeginState, State EndState)
202-
: Lexer(Parent.LangOpts, Parent.SourceMgr, Parent.Diags, Parent.BufferID,
203-
Parent.InSILMode, Parent.RetainComments,
204-
Parent.TriviaRetention) {
205-
initSubLexer(Parent, BeginState, EndState);
206-
}
186+
Lexer(Lexer &Parent, State BeginState, State EndState);
207187

208188
/// \brief Returns true if this lexer will produce a code completion token.
209189
bool isCodeCompletion() const {

lib/Parse/Lexer.cpp

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -168,64 +168,83 @@ uint32_t swift::validateUTF8CharacterAndAdvance(const char *&Ptr,
168168
// Setup and Helper Methods
169169
//===----------------------------------------------------------------------===//
170170

171-
Lexer::Lexer(const LangOptions &Options,
172-
const SourceManager &SM, DiagnosticEngine *Diags,
173-
unsigned BufferID, bool InSILMode,
171+
Lexer::Lexer(const PrincipalTag &, const LangOptions &LangOpts,
172+
const SourceManager &SourceMgr, unsigned BufferID,
173+
DiagnosticEngine *Diags, bool InSILMode,
174174
CommentRetentionMode RetainComments,
175175
TriviaRetentionMode TriviaRetention)
176-
: LangOpts(Options), SourceMgr(SM), Diags(Diags), BufferID(BufferID),
177-
InSILMode(InSILMode), RetainComments(RetainComments),
178-
TriviaRetention(TriviaRetention) {
176+
: LangOpts(LangOpts), SourceMgr(SourceMgr), BufferID(BufferID),
177+
Diags(Diags), InSILMode(InSILMode), RetainComments(RetainComments),
178+
TriviaRetention(TriviaRetention) {}
179+
180+
void Lexer::initialize(unsigned Offset, unsigned EndOffset) {
181+
assert(Offset <= EndOffset);
182+
179183
// Initialize buffer pointers.
180-
StringRef contents = SM.extractText(SM.getRangeForBuffer(BufferID));
184+
StringRef contents =
185+
SourceMgr.extractText(SourceMgr.getRangeForBuffer(BufferID));
181186
BufferStart = contents.data();
182187
BufferEnd = contents.data() + contents.size();
188+
assert(*BufferEnd == 0);
189+
assert(BufferStart + Offset <= BufferEnd);
190+
assert(BufferStart + EndOffset <= BufferEnd);
183191

184192
// Check for Unicode BOM at start of file (Only UTF-8 BOM supported now).
185-
size_t BOMLength = llvm::StringSwitch<size_t>(contents)
186-
.StartsWith("\xEF\xBB\xBF", 3)
187-
.Default(0);
193+
size_t BOMLength = contents.startswith("\xEF\xBB\xBF") ? 3 : 0;
188194

189195
// Keep information about existence of UTF-8 BOM for transparent source code
190196
// editing with libSyntax.
191-
CurPtr = BufferStart;
192197
ContentStart = BufferStart + BOMLength;
193198

194199
// Initialize code completion.
195-
if (BufferID == SM.getCodeCompletionBufferID()) {
196-
const char *Ptr = BufferStart + SM.getCodeCompletionOffset();
200+
if (BufferID == SourceMgr.getCodeCompletionBufferID()) {
201+
const char *Ptr = BufferStart + SourceMgr.getCodeCompletionOffset();
197202
if (Ptr >= BufferStart && Ptr <= BufferEnd)
198203
CodeCompletionPtr = Ptr;
199204
}
200-
}
201205

202-
void Lexer::primeLexer() {
206+
ArtificialEOF = BufferStart + EndOffset;
207+
CurPtr = BufferStart + Offset;
208+
203209
assert(NextToken.is(tok::NUM_TOKENS));
204210
lexImpl();
205211
assert((NextToken.isAtStartOfLine() || CurPtr != BufferStart) &&
206212
"The token should be at the beginning of the line, "
207213
"or we should be lexing from the middle of the buffer");
208214
}
209215

210-
void Lexer::initSubLexer(Lexer &Parent, State BeginState, State EndState) {
216+
Lexer::Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
217+
unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
218+
CommentRetentionMode RetainComments,
219+
TriviaRetentionMode TriviaRetention)
220+
: Lexer(PrincipalTag(), Options, SourceMgr, BufferID, Diags, InSILMode,
221+
RetainComments, TriviaRetention) {
222+
unsigned EndOffset = SourceMgr.getRangeForBuffer(BufferID).getByteLength();
223+
initialize(/*Offset=*/0, EndOffset);
224+
}
225+
226+
Lexer::Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
227+
unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
228+
CommentRetentionMode RetainComments,
229+
TriviaRetentionMode TriviaRetention, unsigned Offset,
230+
unsigned EndOffset)
231+
: Lexer(PrincipalTag(), Options, SourceMgr, BufferID, Diags, InSILMode,
232+
RetainComments, TriviaRetention) {
233+
initialize(Offset, EndOffset);
234+
}
235+
236+
Lexer::Lexer(Lexer &Parent, State BeginState, State EndState)
237+
: Lexer(PrincipalTag(), Parent.LangOpts, Parent.SourceMgr, Parent.BufferID,
238+
Parent.Diags, Parent.InSILMode, Parent.RetainComments,
239+
Parent.TriviaRetention) {
211240
assert(BufferID == SourceMgr.findBufferContainingLoc(BeginState.Loc) &&
212241
"state for the wrong buffer");
213242
assert(BufferID == SourceMgr.findBufferContainingLoc(EndState.Loc) &&
214243
"state for the wrong buffer");
215244

216-
// If the parent lexer should stop prematurely, and the ArtificialEOF
217-
// position is in this subrange, then we should stop at that point, too.
218-
const char *BeginStatePtr = getBufferPtrForSourceLoc(BeginState.Loc);
219-
const char *EndStatePtr = getBufferPtrForSourceLoc(EndState.Loc);
220-
if (Parent.ArtificialEOF &&
221-
Parent.ArtificialEOF >= BeginStatePtr &&
222-
Parent.ArtificialEOF <= EndStatePtr) {
223-
ArtificialEOF = Parent.ArtificialEOF;
224-
} else
225-
ArtificialEOF = EndStatePtr;
226-
227-
primeLexer();
228-
restoreState(BeginState);
245+
unsigned Offset = SourceMgr.getLocOffsetInBuffer(BeginState.Loc, BufferID);
246+
unsigned EndOffset = SourceMgr.getLocOffsetInBuffer(EndState.Loc, BufferID);
247+
initialize(Offset, EndOffset);
229248
}
230249

231250
InFlightDiagnostic Lexer::diagnose(const char *Loc, Diagnostic Diag) {
@@ -255,7 +274,7 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
255274
// When we are lexing a subrange from the middle of a file buffer, we will
256275
// run past the end of the range, but will stay within the file. Check if
257276
// we are past the imaginary EOF, and synthesize a tok::eof in this case.
258-
if (Kind != tok::eof && ArtificialEOF && TokStart >= ArtificialEOF) {
277+
if (Kind != tok::eof && TokStart >= ArtificialEOF) {
259278
Kind = tok::eof;
260279
}
261280
unsigned CommentLength = 0;

unittests/Parse/LexerTests.cpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include "swift/AST/DiagnosticConsumer.h"
2+
#include "swift/AST/DiagnosticEngine.h"
13
#include "swift/Basic/LangOptions.h"
24
#include "swift/Basic/SourceManager.h"
35
#include "swift/Parse/Lexer.h"
@@ -703,3 +705,76 @@ TEST_F(LexerTest, NestedPlaceholder) {
703705
std::vector<Token> Toks = checkLex(Source, ExpectedTokens);
704706
EXPECT_EQ("<#aa#>", Toks[2].getText());
705707
}
708+
709+
class StringCaptureDiagnosticConsumer : public DiagnosticConsumer {
710+
public:
711+
virtual void handleDiagnostic(SourceManager &SM, SourceLoc Loc,
712+
DiagnosticKind Kind, StringRef FormatString,
713+
ArrayRef<DiagnosticArgument> FormatArgs,
714+
const swift::DiagnosticInfo &Info) override {
715+
std::string DiagMsg;
716+
llvm::raw_string_ostream DiagOS(DiagMsg);
717+
DiagnosticEngine::formatDiagnosticText(DiagOS, FormatString, FormatArgs);
718+
auto LC = SM.getLineAndColumn(Loc);
719+
std::ostringstream StrOS;
720+
StrOS << LC.first << ", " << LC.second << ": " << DiagOS.str();
721+
messages.push_back(StrOS.str());
722+
}
723+
724+
std::vector<std::string> messages;
725+
};
726+
727+
bool containsPrefix(const std::vector<std::string> &strs,
728+
const std::string &prefix) {
729+
for (auto &str : strs) {
730+
if (StringRef(str).startswith(StringRef(prefix))) {
731+
return true;
732+
}
733+
}
734+
return false;
735+
}
736+
737+
TEST_F(LexerTest, DiagnoseEmbeddedNul) {
738+
const char Source[] = " \0 \0 aaa \0 \0 bbb";
739+
size_t SourceLen = sizeof(Source) - 1;
740+
741+
LangOptions LangOpts;
742+
SourceManager SourceMgr;
743+
unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, SourceLen));
744+
745+
StringCaptureDiagnosticConsumer DiagConsumer;
746+
DiagnosticEngine Diags(SourceMgr);
747+
Diags.addConsumer(DiagConsumer);
748+
749+
Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
750+
/*InSILMode=*/false, CommentRetentionMode::None,
751+
TriviaRetentionMode::WithTrivia);
752+
753+
ASSERT_TRUE(containsPrefix(DiagConsumer.messages,
754+
"1, 2: nul character embedded in middle of file"));
755+
ASSERT_TRUE(containsPrefix(DiagConsumer.messages,
756+
"1, 4: nul character embedded in middle of file"));
757+
}
758+
759+
TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
760+
const char Source[] = " \0 \0 aaa \0 \0 bbb";
761+
size_t SourceLen = sizeof(Source) - 1;
762+
763+
LangOptions LangOpts;
764+
SourceManager SourceMgr;
765+
unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, SourceLen));
766+
767+
StringCaptureDiagnosticConsumer DiagConsumer;
768+
DiagnosticEngine Diags(SourceMgr);
769+
Diags.addConsumer(DiagConsumer);
770+
771+
Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
772+
/*InSILMode=*/false, CommentRetentionMode::None,
773+
TriviaRetentionMode::WithTrivia,
774+
/*Offset=*/5, /*EndOffset=*/SourceLen);
775+
776+
ASSERT_FALSE(containsPrefix(
777+
DiagConsumer.messages, "1, 2: nul character embedded in middle of file"));
778+
ASSERT_FALSE(containsPrefix(
779+
DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
780+
}

0 commit comments

Comments
 (0)