[Parse] Avoid skipping bodies with /.../ regex literals

hamishknight · hamishknight · commit 515945fc6d63 · 2022-06-22T20:05:21.000+01:00
While skipping, if we encounter a token that looks
like it could be the start of a `/.../` regex
literal, fall back to parsing the function or type
body normally, as such a token could become a
regex literal. As such, it could treat `{` and
`}` as literal, or otherwise have contents that
would be lexically invalid Swift.

To avoid falling back in too many cases, we apply
the existing regex literal heuristics. Cases that
pass the heuristic fall back to regular parsing.
Cases that fail the heuristic are further checked
to make sure they wouldn't contain an unbalanced
`{` or `}`, but otherwise are allowed to be
skipped. This allows us to continue skipping for
most occurrences of infix and prefix `/`.

This is meant as a lower risk workaround to fix
the issue; we ought to go back to handling regex
literals in the lexer.

Resolves rdar://95354010
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -580,6 +580,13 @@ class Lexer {
                             : LexerForwardSlashRegexMode::Tentative) {}
   };
 
+  /// Checks whether a given token could potentially contain the start of an
+  /// unskippable `/.../` regex literal. Such tokens need to go through the
+  /// parser, as they may become regex literal tokens. This includes operator
+  /// tokens such as `!/` which could be split into prefix `!` on a regex
+  /// literal.
+  bool isPotentialUnskippableBareSlashRegexLiteral(const Token &Tok) const;
+
 private:
   /// Nul character meaning kind.
   enum class NulCharacterKind {
diff --git a/include/swift/Parse/Parser.h b/include/swift/Parse/Parser.h
@@ -718,13 +718,6 @@ class Parser {
   /// plain Tok.is(T1) check).
   bool skipUntilTokenOrEndOfLine(tok T1, tok T2 = tok::NUM_TOKENS);
 
-  /// Skip a braced block (e.g. function body). The current token must be '{'.
-  /// Returns \c true if the parser hit the eof before finding matched '}'.
-  ///
-  /// Set \c HasNestedTypeDeclarations to true if a token for a type
-  /// declaration is detected in the skipped block.
-  bool skipBracedBlock(bool &HasNestedTypeDeclarations);
-
   /// Skip over SIL decls until we encounter the start of a Swift decl or eof.
   void skipSILUntilSwiftDecl();
 
@@ -1001,6 +994,8 @@ class Parser {
   bool canDelayMemberDeclParsing(bool &HasOperatorDeclarations,
                                  bool &HasNestedClassDeclarations);
 
+  bool canDelayFunctionBodyParsing(bool &HasNestedTypeDeclarations);
+
   bool delayParsingDeclList(SourceLoc LBLoc, SourceLoc &RBLoc,
                             IterableDeclContext *IDC);
 
@@ -1211,9 +1206,7 @@ class Parser {
                                        bool &hasEffectfulGet,
                                        AccessorKind currentKind,
                                        SourceLoc const& currentLoc);
-  
-  void consumeAbstractFunctionBody(AbstractFunctionDecl *AFD,
-                                   const DeclAttributes &Attrs);
+
   ParserResult<FuncDecl> parseDeclFunc(SourceLoc StaticLoc,
                                        StaticSpellingKind StaticSpelling,
                                        ParseDeclOptions Flags,
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -1980,6 +1980,64 @@ const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body,
   }
 }
 
+bool Lexer::isPotentialUnskippableBareSlashRegexLiteral(const Token &Tok) const {
+  if (!LangOpts.hasFeature(Feature::BareSlashRegexLiterals))
+    return false;
+
+  // A `/.../` regex literal may only start on a binary or prefix operator.
+  if (Tok.isNot(tok::oper_prefix, tok::oper_binary_spaced,
+                tok::oper_binary_unspaced)) {
+    return false;
+  }
+  auto SlashIdx = Tok.getText().find("/");
+  if (SlashIdx == StringRef::npos)
+    return false;
+
+  auto Offset = getBufferPtrForSourceLoc(Tok.getLoc()) + SlashIdx;
+  bool CompletelyErroneous;
+  if (tryScanRegexLiteral(Offset, /*MustBeRegex*/ false, /*Diags*/ nullptr,
+                          CompletelyErroneous)) {
+    // Definitely a regex literal.
+    return true;
+  }
+
+  // A prefix '/' can never be a regex literal if it failed a heuristic.
+  if (Tok.is(tok::oper_prefix))
+    return false;
+
+  // We either don't have a regex literal, or we failed a heuristic. We now need
+  // to make sure we don't have an unbalanced `{` or `}`, as that would have the
+  // potential to change the range of a skipped body if we try to more
+  // aggressively lex a regex literal during normal parsing. If we have balanced
+  // `{` + `}`, we can proceed with skipping. Worst case scenario is we emit a
+  // worse diagnostic.
+  // FIXME: We ought to silence lexer diagnostics when skipping, this would
+  // avoid emitting a worse diagnostic.
+  auto *EndPtr = tryScanRegexLiteral(Offset, /*MustBeRegex*/ true,
+                                     /*Diags*/ nullptr, CompletelyErroneous);
+  if (!EndPtr)
+    return false;
+
+  Lexer L(*this, State(Tok.getLoc().getAdvancedLoc(Tok.getLength())),
+          State(getSourceLoc(EndPtr)), /*EnableDiagnostics*/ false);
+
+  unsigned OpenBraces = 0;
+  while (L.peekNextToken().isNot(tok::eof)) {
+    Token Tok;
+    L.lex(Tok);
+    if (Tok.is(tok::l_brace))
+      OpenBraces += 1;
+    if (Tok.is(tok::r_brace)) {
+      if (OpenBraces == 0)
+        return true;
+      OpenBraces -= 1;
+    }
+  }
+
+  // If we have an unbalanced `{`, this is unskippable.
+  return OpenBraces != 0;
+}
+
 const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
                                        DiagnosticEngine *Diags,
                                        bool &CompletelyErroneous) const {
diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp
@@ -4121,11 +4121,13 @@ static unsigned skipUntilMatchingRBrace(Parser &P,
                                         bool &HasPoundDirective,
                                         bool &HasOperatorDeclarations,
                                         bool &HasNestedClassDeclarations,
-                                        bool &HasNestedTypeDeclarations) {
+                                        bool &HasNestedTypeDeclarations,
+                                        bool &HasPotentialRegexLiteral) {
   HasPoundDirective = false;
   HasOperatorDeclarations = false;
   HasNestedClassDeclarations = false;
   HasNestedTypeDeclarations = false;
+  HasPotentialRegexLiteral = false;
 
   unsigned OpenBraces = 1;
 
@@ -4148,6 +4150,18 @@ static unsigned skipUntilMatchingRBrace(Parser &P,
     HasNestedTypeDeclarations |= P.Tok.isAny(tok::kw_class, tok::kw_struct,
                                              tok::kw_enum);
 
+    // HACK: Bail if we encounter what could potentially be a regex literal.
+    // This is necessary as:
+    // - We might encounter an invalid Swift token that might be valid in a
+    // regex.
+    // - Such a literal could contain a literal `}`, which should not be treated
+    // as an end brace.
+    // FIXME: We should be able to handle `/.../` regex literals in the lexer.
+    if (P.L->isPotentialUnskippableBareSlashRegexLiteral(P.Tok)) {
+      HasPotentialRegexLiteral = true;
+      return OpenBraces;
+    }
+
     if (P.consumeIf(tok::l_brace)) {
       ++OpenBraces;
       continue;
@@ -5456,12 +5470,14 @@ bool Parser::canDelayMemberDeclParsing(bool &HasOperatorDeclarations,
   CancellableBacktrackingScope BackTrack(*this);
   bool HasPoundDirective;
   bool HasNestedTypeDeclarations;
+  bool HasPotentialRegexLiteral;
   skipUntilMatchingRBrace(*this,
                           HasPoundDirective,
                           HasOperatorDeclarations,
                           HasNestedClassDeclarations,
-                          HasNestedTypeDeclarations);
-  if (!HasPoundDirective)
+                          HasNestedTypeDeclarations,
+                          HasPotentialRegexLiteral);
+  if (!HasPoundDirective && !HasPotentialRegexLiteral)
     BackTrack.cancelBacktrack();
   return !BackTrack.willBacktrack();
 }
@@ -6137,25 +6153,31 @@ static ParameterList *parseOptionalAccessorArgument(SourceLoc SpecifierLoc,
   return ParameterList::create(P.Context, StartLoc, param, EndLoc);
 }
 
-bool Parser::skipBracedBlock(bool &HasNestedTypeDeclarations) {
+bool Parser::canDelayFunctionBodyParsing(bool &HasNestedTypeDeclarations) {
+  // If explicitly disabled, respect the flag.
+  if (!isDelayedParsingEnabled() && !isCodeCompletionFirstPass())
+    return false;
+
   SyntaxParsingContext disabled(SyntaxContext);
   SyntaxContext->disable();
-  consumeToken(tok::l_brace);
 
-  // We don't care if a skipped function body contained any of these, so
-  // just ignore them.
+  // Skip until the matching right curly bracket. If the body has a potential
+  // regex literal, we can't skip. We don't care about the others; ignore them.
+  CancellableBacktrackingScope BackTrack(*this);
+  consumeToken(tok::l_brace);
   bool HasPoundDirectives;
   bool HasOperatorDeclarations;
   bool HasNestedClassDeclarations;
+  bool HasPotentialRegexLiteral;
+  skipUntilMatchingRBrace(*this, HasPoundDirectives, HasOperatorDeclarations,
+                          HasNestedClassDeclarations, HasNestedTypeDeclarations,
+                          HasPotentialRegexLiteral);
+  if (HasPotentialRegexLiteral)
+    return false;
 
-  unsigned OpenBraces = skipUntilMatchingRBrace(*this,
-                                                HasPoundDirectives,
-                                                HasOperatorDeclarations,
-                                                HasNestedClassDeclarations,
-                                                HasNestedTypeDeclarations);
-  if (consumeIf(tok::r_brace))
-    --OpenBraces;
-  return OpenBraces != 0;
+  BackTrack.cancelBacktrack();
+  consumeIf(tok::r_brace);
+  return true;
 }
 
 void Parser::skipSILUntilSwiftDecl() {
@@ -7145,30 +7167,6 @@ Parser::parseDeclVar(ParseDeclOptions Flags,
   return makeResult(Status);
 }
 
-void Parser::consumeAbstractFunctionBody(AbstractFunctionDecl *AFD,
-                                         const DeclAttributes &Attrs) {
-  auto BeginParserPosition = getParserPosition();
-  SourceRange BodyRange;
-  BodyRange.Start = Tok.getLoc();
-
-  // Advance the parser to the end of the block; '{' ... '}'.
-  bool HasNestedTypeDeclarations;
-  skipBracedBlock(HasNestedTypeDeclarations);
-
-  BodyRange.End = PreviousLoc;
-
-  AFD->setBodyDelayed(BodyRange);
-  AFD->setHasNestedTypeDeclarations(HasNestedTypeDeclarations);
-
-  if (isCodeCompletionFirstPass() &&
-      SourceMgr.rangeContainsCodeCompletionLoc(BodyRange)) {
-    State->setCodeCompletionDelayedDeclState(
-        SourceMgr, L->getBufferID(),
-        CodeCompletionDelayedDeclKind::FunctionBody,
-        PD_Default, AFD, BodyRange, BeginParserPosition.PreviousLoc);
-  }
-}
-
 /// Parse a 'func' declaration, returning null on error.  The caller
 /// handles this case and does recovery as appropriate.
 ///
@@ -7481,12 +7479,41 @@ void Parser::parseAbstractFunctionBody(AbstractFunctionDecl *AFD) {
   // If we can delay parsing this body, or this is the first pass of code
   // completion, skip until the end. If we encounter a code completion token
   // while skipping, we'll make a note of it.
-  if (isDelayedParsingEnabled() || isCodeCompletionFirstPass()) {
-    consumeAbstractFunctionBody(AFD, AFD->getAttrs());
+  auto BodyPreviousLoc = PreviousLoc;
+  SourceRange BodyRange(Tok.getLoc());
+  auto setCodeCompletionDelayedDeclStateIfNeeded = [&] {
+    if (!isCodeCompletionFirstPass() ||
+        !SourceMgr.rangeContainsCodeCompletionLoc(BodyRange)) {
+      return;
+    }
+    if (State->hasCodeCompletionDelayedDeclState())
+      State->takeCodeCompletionDelayedDeclState();
+    State->setCodeCompletionDelayedDeclState(
+        SourceMgr, L->getBufferID(),
+        CodeCompletionDelayedDeclKind::FunctionBody,
+        PD_Default, AFD, BodyRange, BodyPreviousLoc);
+  };
+
+  bool HasNestedTypeDeclarations;
+  if (canDelayFunctionBodyParsing(HasNestedTypeDeclarations)) {
+    BodyRange.End = PreviousLoc;
+
+    assert(SourceMgr.isBeforeInBuffer(BodyRange.Start, BodyRange.End) ||
+           BodyRange.Start == BodyRange.End &&
+           "At least '{' should be consumed");
+
+    AFD->setBodyDelayed(BodyRange);
+    AFD->setHasNestedTypeDeclarations(HasNestedTypeDeclarations);
+
+    setCodeCompletionDelayedDeclStateIfNeeded();
     return;
   }
 
   (void)parseAbstractFunctionBodyImpl(AFD);
+  assert(BodyRange.Start == AFD->getBodySourceRange().Start &&
+         "The start of the body should be the 'l_brace' token above");
+  BodyRange = AFD->getBodySourceRange();
+  setCodeCompletionDelayedDeclStateIfNeeded();
 }
 
 BodyAndFingerprint
diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp
@@ -887,6 +887,8 @@ void Parser::tryLexRegexLiteral(bool forUnappliedOperator) {
 
   // Check to see if we have a regex literal `/.../`, optionally with a prefix
   // operator e.g `!/.../`.
+  // NOTE: If you change this logic you must also change the logic in
+  // isPotentialUnskippableBareSlashRegexLiteral.
   bool mustBeRegex = false;
   switch (Tok.getKind()) {
   case tok::oper_prefix:
diff --git a/test/StringProcessing/Parse/forward-slash-regex-skipping-allowed.swift b/test/StringProcessing/Parse/forward-slash-regex-skipping-allowed.swift
@@ -0,0 +1,62 @@
+// RUN: %empty-directory(%t)
+
+// RUN: %target-swift-frontend -parse -enable-bare-slash-regex -disable-availability-checking -experimental-skip-all-function-bodies -stats-output-dir %t %s
+// RUN: %{python} %utils/process-stats-dir.py --set-csv-baseline %t/stats.csv %t
+// RUN: %FileCheck -input-file %t/stats.csv %s
+
+// REQUIRES: swift_in_compiler
+
+// Make sure we can skip in all of the below cases.
+
+// We don't appear to output a stats entry when it is 0.
+// CHECK-NOT: {{"Parse.NumFunctionsParsed"}}
+
+// Balanced `{}`, so okay.
+func a() { / {}/ }
+func b() { / \{}/ }
+func c() { / {"{"}/ }
+
+// Some cases of infix '/' that we should continue to skip.
+func d() {
+  _ = 1 / 2 + 3 * 4
+  _ = 1 / 2 / 3 / 4
+}
+func e() {
+  let arr = [1, 2, 3]
+  _ = arr.reduce(0, /) / 2
+
+  func foo(_ i: Int, _ fn: () -> Void) {}
+  foo(1 / 2 / 3, { print("}}}{{{") })
+}
+
+// Some cases of prefix '/' that we should continue to skip.
+prefix operator /
+prefix func / <T> (_ x: T) -> T { x }
+
+enum E {
+  case e
+  func foo<T>(_ x: T) {}
+}
+
+func f() {
+  _ = /E.e
+  (/E.e).foo(/0)
+
+  func foo<T, U>(_ x: T, _ y: U) {}
+  foo((/E.e), /E.e)
+  foo((/)(E.e), /E.e)
+
+  func bar<T>(_ x: T) -> Int { 0 }
+  _ = bar(/E.e) / 2
+}
+
+postfix operator /
+postfix func / <T> (_ x: T) -> T { x }
+
+// Some cases of postfix '/' that we should continue to skip.
+func g() {
+    _ = 0/
+    _ = 0/ / 1/
+    _ = 1/ + 1/
+    _ = 1 + 2/
+}
diff --git a/test/StringProcessing/Parse/forward-slash-regex-skipping-invalid.swift b/test/StringProcessing/Parse/forward-slash-regex-skipping-invalid.swift
diff --git a/test/StringProcessing/Parse/forward-slash-regex-skipping.swift b/test/StringProcessing/Parse/forward-slash-regex-skipping.swift