Merge pull request swiftlang#15043 from omochi/lex-init

rintaro · web-flow · commit 71d31907e88f · 2018-03-08T14:04:02.000+09:00
[Parse] Refactor Lexer initialization
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -64,8 +64,8 @@ enum class ConflictMarkerKind {
 class Lexer {
   const LangOptions &LangOpts;
   const SourceManager &SourceMgr;
-  DiagnosticEngine *Diags;
   const unsigned BufferID;
+  DiagnosticEngine *Diags;
 
   using State = LexerState;
 
@@ -137,20 +137,18 @@ class Lexer {
   Lexer(const Lexer&) = delete;
   void operator=(const Lexer&) = delete;
 
+  struct PrincipalTag {};
+
   /// The principal constructor used by public constructors below.
   /// Don't use this constructor for other purposes, it does not initialize
   /// everything.
-  Lexer(const LangOptions &Options,
-        const SourceManager &SourceMgr, DiagnosticEngine *Diags,
-        unsigned BufferID, bool InSILMode,
+  Lexer(const PrincipalTag &, const LangOptions &LangOpts,
+        const SourceManager &SourceMgr, unsigned BufferID,
+        DiagnosticEngine *Diags, bool InSILMode,
         CommentRetentionMode RetainComments,
         TriviaRetentionMode TriviaRetention);
 
-  /// @{
-  /// Helper routines used in \c Lexer constructors.
-  void primeLexer();
-  void initSubLexer(Lexer &Parent, State BeginState, State EndState);
-  /// @}
+  void initialize(unsigned Offset, unsigned EndOffset);
 
 public:
   /// \brief Create a normal lexer that scans the whole source buffer.
@@ -166,44 +164,26 @@ class Lexer {
   ///   means that APIs like getLocForEndOfToken really ought to take
   ///   this flag; it's just that we don't care that much about fidelity
   ///   when parsing SIL files.
-  Lexer(const LangOptions &Options,
-        const SourceManager &SourceMgr, unsigned BufferID,
-        DiagnosticEngine *Diags, bool InSILMode,
-        CommentRetentionMode RetainComments = CommentRetentionMode::None,
-        TriviaRetentionMode TriviaRetention = TriviaRetentionMode::WithoutTrivia)
-      : Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments,
-              TriviaRetention) {
-    primeLexer();
-  }
+  Lexer(
+      const LangOptions &Options, const SourceManager &SourceMgr,
+      unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
+      CommentRetentionMode RetainComments = CommentRetentionMode::None,
+      TriviaRetentionMode TriviaRetention = TriviaRetentionMode::WithoutTrivia);
 
   /// \brief Create a lexer that scans a subrange of the source buffer.
-  Lexer(const LangOptions &Options,
-        const SourceManager &SourceMgr, unsigned BufferID,
-        DiagnosticEngine *Diags, bool InSILMode,
+  Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
+        unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
         CommentRetentionMode RetainComments,
-        TriviaRetentionMode TriviaRetention,
-        unsigned Offset, unsigned EndOffset)
-      : Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments,
-              TriviaRetention) {
-    assert(Offset <= EndOffset && "invalid range");
-    initSubLexer(
-        *this,
-        State(getLocForStartOfBuffer().getAdvancedLoc(Offset)),
-        State(getLocForStartOfBuffer().getAdvancedLoc(EndOffset)));
-  }
+        TriviaRetentionMode TriviaRetention, unsigned Offset,
+        unsigned EndOffset);
 
   /// \brief Create a sub-lexer that lexes from the same buffer, but scans
   /// a subrange of the buffer.
   ///
   /// \param Parent the parent lexer that scans the whole buffer
   /// \param BeginState start of the subrange
   /// \param EndState end of the subrange
-  Lexer(Lexer &Parent, State BeginState, State EndState)
-      : Lexer(Parent.LangOpts, Parent.SourceMgr, Parent.Diags, Parent.BufferID,
-              Parent.InSILMode, Parent.RetainComments,
-              Parent.TriviaRetention) {
-    initSubLexer(Parent, BeginState, EndState);
-  }
+  Lexer(Lexer &Parent, State BeginState, State EndState);
 
   /// \brief Returns true if this lexer will produce a code completion token.
   bool isCodeCompletion() const {
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -168,64 +168,83 @@ uint32_t swift::validateUTF8CharacterAndAdvance(const char *&Ptr,
 // Setup and Helper Methods
 //===----------------------------------------------------------------------===//
 
-Lexer::Lexer(const LangOptions &Options,
-             const SourceManager &SM, DiagnosticEngine *Diags,
-             unsigned BufferID, bool InSILMode,
+Lexer::Lexer(const PrincipalTag &, const LangOptions &LangOpts,
+             const SourceManager &SourceMgr, unsigned BufferID,
+             DiagnosticEngine *Diags, bool InSILMode,
              CommentRetentionMode RetainComments,
              TriviaRetentionMode TriviaRetention)
-    : LangOpts(Options), SourceMgr(SM), Diags(Diags), BufferID(BufferID),
-      InSILMode(InSILMode), RetainComments(RetainComments),
-      TriviaRetention(TriviaRetention) {
+    : LangOpts(LangOpts), SourceMgr(SourceMgr), BufferID(BufferID),
+      Diags(Diags), InSILMode(InSILMode), RetainComments(RetainComments),
+      TriviaRetention(TriviaRetention) {}
+
+void Lexer::initialize(unsigned Offset, unsigned EndOffset) {
+  assert(Offset <= EndOffset);
+
   // Initialize buffer pointers.
-  StringRef contents = SM.extractText(SM.getRangeForBuffer(BufferID));
+  StringRef contents =
+      SourceMgr.extractText(SourceMgr.getRangeForBuffer(BufferID));
   BufferStart = contents.data();
   BufferEnd = contents.data() + contents.size();
+  assert(*BufferEnd == 0);
+  assert(BufferStart + Offset <= BufferEnd);
+  assert(BufferStart + EndOffset <= BufferEnd);
 
   // Check for Unicode BOM at start of file (Only UTF-8 BOM supported now).
-  size_t BOMLength = llvm::StringSwitch<size_t>(contents)
-    .StartsWith("\xEF\xBB\xBF", 3)
-    .Default(0);
+  size_t BOMLength = contents.startswith("\xEF\xBB\xBF") ? 3 : 0;
 
   // Keep information about existance of UTF-8 BOM for transparency source code
   // editing with libSyntax.
-  CurPtr = BufferStart;
   ContentStart = BufferStart + BOMLength;
 
   // Initialize code completion.
-  if (BufferID == SM.getCodeCompletionBufferID()) {
-    const char *Ptr = BufferStart + SM.getCodeCompletionOffset();
+  if (BufferID == SourceMgr.getCodeCompletionBufferID()) {
+    const char *Ptr = BufferStart + SourceMgr.getCodeCompletionOffset();
     if (Ptr >= BufferStart && Ptr <= BufferEnd)
       CodeCompletionPtr = Ptr;
   }
-}
 
-void Lexer::primeLexer() {
+  ArtificialEOF = BufferStart + EndOffset;
+  CurPtr = BufferStart + Offset;
+
   assert(NextToken.is(tok::NUM_TOKENS));
   lexImpl();
   assert((NextToken.isAtStartOfLine() || CurPtr != BufferStart) &&
          "The token should be at the beginning of the line, "
          "or we should be lexing from the middle of the buffer");
 }
 
-void Lexer::initSubLexer(Lexer &Parent, State BeginState, State EndState) {
+Lexer::Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
+             unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
+             CommentRetentionMode RetainComments,
+             TriviaRetentionMode TriviaRetention)
+    : Lexer(PrincipalTag(), Options, SourceMgr, BufferID, Diags, InSILMode,
+            RetainComments, TriviaRetention) {
+  // Whole-buffer lexer: offsets run from 0 to the buffer's byte length.
+  initialize(0, SourceMgr.getRangeForBuffer(BufferID).getByteLength());
+}
+
+Lexer::Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
+             unsigned BufferID, DiagnosticEngine *Diags, bool InSILMode,
+             CommentRetentionMode RetainComments,
+             TriviaRetentionMode TriviaRetention, unsigned Offset,
+             unsigned EndOffset)
+    : Lexer(PrincipalTag(), Options, SourceMgr, BufferID, Diags, InSILMode,
+            RetainComments, TriviaRetention) {
+  initialize(Offset, EndOffset); // Both are byte offsets from buffer start.
+}
+
+Lexer::Lexer(Lexer &Parent, State BeginState, State EndState)
+    : Lexer(PrincipalTag(), Parent.LangOpts, Parent.SourceMgr, Parent.BufferID,
+            Parent.Diags, Parent.InSILMode, Parent.RetainComments,
+            Parent.TriviaRetention) {
   assert(BufferID == SourceMgr.findBufferContainingLoc(BeginState.Loc) &&
          "state for the wrong buffer");
   assert(BufferID == SourceMgr.findBufferContainingLoc(EndState.Loc) &&
          "state for the wrong buffer");
 
-  // If the parent lexer should stop prematurely, and the ArtificialEOF
-  // position is in this subrange, then we should stop at that point, too.
-  const char *BeginStatePtr = getBufferPtrForSourceLoc(BeginState.Loc);
-  const char *EndStatePtr = getBufferPtrForSourceLoc(EndState.Loc);
-  if (Parent.ArtificialEOF &&
-      Parent.ArtificialEOF >= BeginStatePtr &&
-      Parent.ArtificialEOF <= EndStatePtr) {
-    ArtificialEOF = Parent.ArtificialEOF;
-  } else
-    ArtificialEOF = EndStatePtr;
-
-  primeLexer();
-  restoreState(BeginState);
+  unsigned Offset = SourceMgr.getLocOffsetInBuffer(BeginState.Loc, BufferID);
+  unsigned EndOffset = SourceMgr.getLocOffsetInBuffer(EndState.Loc, BufferID);
+  initialize(Offset, EndOffset);
 }
 
 InFlightDiagnostic Lexer::diagnose(const char *Loc, Diagnostic Diag) {
@@ -255,7 +274,7 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
   // When we are lexing a subrange from the middle of a file buffer, we will
   // run past the end of the range, but will stay within the file.  Check if
   // we are past the imaginary EOF, and synthesize a tok::eof in this case.
-  if (Kind != tok::eof && ArtificialEOF && TokStart >= ArtificialEOF) {
+  if (Kind != tok::eof && TokStart >= ArtificialEOF) {
     Kind = tok::eof;
   }
   unsigned CommentLength = 0;
diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
@@ -1,3 +1,5 @@
+#include "swift/AST/DiagnosticConsumer.h"
+#include "swift/AST/DiagnosticEngine.h"
 #include "swift/Basic/LangOptions.h"
 #include "swift/Basic/SourceManager.h"
 #include "swift/Parse/Lexer.h"
@@ -703,3 +705,76 @@ TEST_F(LexerTest, NestedPlaceholder) {
   std::vector<Token> Toks = checkLex(Source, ExpectedTokens);
   EXPECT_EQ("<#aa#>", Toks[2].getText());
 }
+
+/// Diagnostic consumer that stores each diagnostic as "line, column: text".
+class StringCaptureDiagnosticConsumer : public DiagnosticConsumer {
+public:
+  void handleDiagnostic(SourceManager &SM, SourceLoc Loc, DiagnosticKind Kind,
+                        StringRef FormatString,
+                        ArrayRef<DiagnosticArgument> FormatArgs,
+                        const swift::DiagnosticInfo &Info) override {
+    std::string Entry;
+    llvm::raw_string_ostream EntryOS(Entry);
+    auto LineCol = SM.getLineAndColumn(Loc);
+    EntryOS << LineCol.first << ", " << LineCol.second << ": ";
+    DiagnosticEngine::formatDiagnosticText(EntryOS, FormatString, FormatArgs);
+    messages.push_back(EntryOS.str());
+  }
+
+  std::vector<std::string> messages;
+};
+
+/// Returns true if any string in \p strs begins with \p prefix.
+bool containsPrefix(const std::vector<std::string> &strs,
+                    const std::string &prefix) {
+  const StringRef Wanted(prefix);
+  for (const std::string &Candidate : strs)
+    if (StringRef(Candidate).startswith(Wanted))
+      return true;
+  return false;
+}
+
+TEST_F(LexerTest, DiagnoseEmbeddedNul) {
+  const char Source[] = " \0 \0 aaa \0 \0 bbb";
+  size_t SourceLen = sizeof(Source) - 1;
+  // Copy the bytes with an explicit length so the embedded NULs are kept.
+  LangOptions LangOpts;
+  SourceManager SourceMgr;
+  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, SourceLen));
+  // Route diagnostics into a consumer that records them as strings.
+  StringCaptureDiagnosticConsumer DiagConsumer;
+  DiagnosticEngine Diags(SourceMgr);
+  Diags.addConsumer(DiagConsumer);
+  // Whole-buffer lexer; no tokens are requested beyond its construction.
+  Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
+          /*InSILMode=*/false, CommentRetentionMode::None,
+          TriviaRetentionMode::WithTrivia);
+  // The NULs at columns 2 and 4 of line 1 must each have been reported.
+  ASSERT_TRUE(containsPrefix(DiagConsumer.messages,
+                             "1, 2: nul character embedded in middle of file"));
+  ASSERT_TRUE(containsPrefix(DiagConsumer.messages,
+                             "1, 4: nul character embedded in middle of file"));
+}
+
+TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
+  const char Source[] = " \0 \0 aaa \0 \0 bbb";
+  size_t SourceLen = sizeof(Source) - 1;
+  // Same buffer as DiagnoseEmbeddedNul; explicit length keeps the NULs.
+  LangOptions LangOpts;
+  SourceManager SourceMgr;
+  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, SourceLen));
+  // Route diagnostics into a consumer that records them as strings.
+  StringCaptureDiagnosticConsumer DiagConsumer;
+  DiagnosticEngine Diags(SourceMgr);
+  Diags.addConsumer(DiagConsumer);
+  // Lex only [5, SourceLen), i.e. from "aaa" on, skipping the first two NULs.
+  Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
+          /*InSILMode=*/false, CommentRetentionMode::None,
+          TriviaRetentionMode::WithTrivia,
+          /*Offset=*/5, /*EndOffset=*/SourceLen);
+  // Nothing before Offset may be diagnosed (line 1, columns 2 and 4).
+  ASSERT_FALSE(containsPrefix(
+      DiagConsumer.messages, "1, 2: nul character embedded in middle of file"));
+  ASSERT_FALSE(containsPrefix(
+      DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
+}