cklin
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java
Lines changed: 167 additions & 24 deletions b/‎javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java
Lines changed: 167 additions & 24 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java
Lines changed: 5 additions & 6 deletions b/‎javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java
Lines changed: 5 additions & 6 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/HTMLExtractor.java
Lines changed: 5 additions & 4 deletions b/‎javascript/extractor/src/com/semmle/js/extractor/HTMLExtractor.java
Lines changed: 5 additions & 4 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/IExtractor.java
Lines changed: 2 additions & 1 deletion b/‎javascript/extractor/src/com/semmle/js/extractor/IExtractor.java
Lines changed: 2 additions & 1 deletion
@@ -1,22 +1,26 @@
 package com.semmle.js.extractor;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.Reader;
 import java.lang.ProcessBuilder.Redirect;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.DirectoryNotEmptyException;
 import java.nio.file.FileVisitResult;
 import java.nio.file.FileVisitor;
 import java.nio.file.Files;
 import java.nio.file.InvalidPathException;
+import java.nio.file.NoSuchFileException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.SimpleFileVisitor;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
@@ -27,6 +31,7 @@
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
@@ -41,11 +46,16 @@
 import com.semmle.js.extractor.trapcache.DefaultTrapCache;
 import com.semmle.js.extractor.trapcache.DummyTrapCache;
 import com.semmle.js.extractor.trapcache.ITrapCache;
+import com.semmle.js.parser.ParseError;
 import com.semmle.js.parser.ParsedProject;
 import com.semmle.ts.extractor.TypeExtractor;
 import com.semmle.ts.extractor.TypeScriptParser;
+import com.semmle.ts.extractor.TypeScriptWrapperOOMError;
 import com.semmle.ts.extractor.TypeTable;
 import com.semmle.util.data.StringUtil;
+import com.semmle.util.diagnostics.DiagnosticLevel;
+import com.semmle.util.diagnostics.DiagnosticWriter;
+import com.semmle.util.diagnostics.DiagnosticLocation;
 import com.semmle.util.exception.CatastrophicError;
 import com.semmle.util.exception.Exceptions;
 import com.semmle.util.exception.ResourceError;
@@ -444,33 +454,141 @@ protected boolean hasSeenCode() {
 
   /** Perform extraction. */
   public int run() throws IOException {
-    startThreadPool();
-    try {
-      CompletableFuture<?> sourceFuture = extractSource();
-      sourceFuture.join(); // wait for source extraction to complete
-      if (hasSeenCode()) { // don't bother with the externs if no code was seen
-        extractExterns();
+      startThreadPool();
+      try {
+        CompletableFuture<?> sourceFuture = extractSource();
+        sourceFuture.join(); // wait for source extraction to complete
+        if (hasSeenCode()) { // don't bother with the externs if no code was seen
+          extractExterns();
+        }
+        extractXml();
+      } catch (OutOfMemoryError oom) {
+        System.err.println("Out of memory while extracting the project.");
+        return 137; // the CodeQL CLI will interpret this as an out-of-memory error
+        // purposely not doing anything else (printing stack, etc.), as the JVM
+        // basically guarantees nothing after an OOM
+      } catch (TypeScriptWrapperOOMError oom) {
+        System.err.println("Out of memory while extracting the project.");
+        System.err.println(oom.getMessage());
+        oom.printStackTrace(System.err);
+        return 137;
+      } catch (RuntimeException | IOException e) {
+        writeDiagnostics("Internal error: " + e, JSDiagnosticKind.INTERNAL_ERROR);
+        e.printStackTrace(System.err);
+        return 1;
+      } finally {
+        shutdownThreadPool();
+        diagnosticsToClose.forEach(DiagnosticWriter::close);
       }
-      extractXml();
-    } finally {
-      shutdownThreadPool();
+
+      if (!hasSeenCode()) {
+        if (seenFiles) {
+          warn("Only found JavaScript or TypeScript files that were empty or contained syntax errors.");
+        } else {
+          warn("No JavaScript or TypeScript code found.");
+        }
+        // Ensure that the finalize step detects that no code was seen.
+        Path srcFolder = Paths.get(EnvironmentVariables.getWipDatabase(), "src");
+        try {
+          // Non-recursive delete because "src/" should be empty.
+          FileUtil8.delete(srcFolder);
+        } catch (NoSuchFileException e) {
+          Exceptions.ignore(e, "the directory did not exist");
+        } catch (DirectoryNotEmptyException e) {
+          Exceptions.ignore(e, "just leave the directory if it is not empty");
+        }
+        return 0;
+      }
+    return 0;
+  }
+
+  /**
+   * A kind of error that can happen during extraction of JavaScript or TypeScript
+   * code.
+   * For use with the {@link #writeDiagnostics(String, JSDiagnosticKind)} method.
+   */
+  public static enum JSDiagnosticKind {
+    PARSE_ERROR("parse-error", "Parse error", DiagnosticLevel.Warning),
+    INTERNAL_ERROR("internal-error", "Internal error", DiagnosticLevel.Debug);
+
+    private final String id;
+    private final String name;
+    private final DiagnosticLevel level;
+
+    private JSDiagnosticKind(String id, String name, DiagnosticLevel level) {
+      this.id = id;
+      this.name = name;
+      this.level = level;
+    }
+
+    public String getId() {
+      return id;
+    }
+
+    public String getName() {
+      return name;
+    }
+
+    public DiagnosticLevel getLevel() {
+      return level;
     }
-    if (!hasSeenCode()) {
-      if (seenFiles) {
-        warn("Only found JavaScript or TypeScript files that were empty or contained syntax errors.");
+  }
+
+  private AtomicInteger diagnosticCount = new AtomicInteger(0);
+  private List<DiagnosticWriter> diagnosticsToClose = Collections.synchronizedList(new ArrayList<>());
+  private ThreadLocal<DiagnosticWriter> diagnostics = new ThreadLocal<DiagnosticWriter>(){ // one writer (and file) per thread
+        @Override protected DiagnosticWriter initialValue() {
+            DiagnosticWriter result = initDiagnosticsWriter(diagnosticCount.incrementAndGet());
+            if (result != null) diagnosticsToClose.add(result); // null = no diagnostics file; registering it would NPE when closing
+            return result;
+        }
+  };
+
+  /**
+   * Persist a diagnostic message to a file in the diagnostics directory.
+   * See {@link JSDiagnosticKind} for the kinds of errors that can be reported,
+   * and see
+   * {@link DiagnosticWriter} for more details.
+   */
+  public void writeDiagnostics(String message, JSDiagnosticKind error) throws IOException {
+    writeDiagnostics(message, error, null);
+  }
+
+
+  /**
+   * Persist a diagnostic message with a location to a file in the diagnostics directory.
+   * See {@link JSDiagnosticKind} for the kinds of errors that can be reported,
+   * and see
+   * {@link DiagnosticWriter} for more details.
+   */
+  public void writeDiagnostics(String message, JSDiagnosticKind error, DiagnosticLocation location) throws IOException {
+    if (diagnostics.get() == null) {
+      warn("No diagnostics directory, so not writing diagnostic: " + message);
+      return;
+    }
+
+    // DiagnosticLevel level, String extractorName, String sourceId, String sourceName, String markdown
+    diagnostics.get().writeMarkdown(error.getLevel(), "javascript", "javascript/" + error.getId(), error.getName(),
+        message, location);
+  }
+
+  private DiagnosticWriter initDiagnosticsWriter(int count) {
+    String diagnosticsDir = System.getenv("CODEQL_EXTRACTOR_JAVASCRIPT_DIAGNOSTIC_DIR");
+
+    if (diagnosticsDir != null) {
+      File diagnosticsDirFile = new File(diagnosticsDir);
+      if (!diagnosticsDirFile.isDirectory()) {
+        warn("Diagnostics directory " + diagnosticsDir + " does not exist");
       } else {
-        warn("No JavaScript or TypeScript code found.");
-      }
-      // ensuring that the finalize steps detects that no code was seen. 
-      Path srcFolder = Paths.get(EnvironmentVariables.getWipDatabase(), "src");
-      // check that the srcFolder is empty
-      if (Files.list(srcFolder).count() == 0) {
-        // Non-recursive delete because "src/" should be empty.
-        FileUtil8.delete(srcFolder);
+        File diagnosticsFile = new File(diagnosticsDirFile, "autobuilder-" + count + ".jsonl");
+        try {
+          return new DiagnosticWriter(diagnosticsFile);
+        } catch (FileNotFoundException e) {
+          warn("Failed to open diagnostics file " + diagnosticsFile);
+        }
       }
-      return 0;
     }
-    return 0;
+    return null;
   }
 
   private void startThreadPool() {
@@ -1113,13 +1231,42 @@ private void doExtract(FileExtractor extractor, Path file, ExtractorState state)
 
     try {
       long start = logBeginProcess("Extracting " + file);
-      Integer loc = extractor.extract(f, state);
-      if (!extractor.getConfig().isExterns() && (loc == null || loc != 0)) seenCode = true;
+      ParseResultInfo loc = extractor.extract(f, state);
+      if (!extractor.getConfig().isExterns() && (loc == null || loc.getLinesOfCode() != 0)) seenCode = true;
       if (!extractor.getConfig().isExterns()) seenFiles = true;
+      // extract() may return null (the seenCode check above already allows for that);
+      // in that case there are no parse errors to report, so skip the loop instead of throwing an NPE.
+      if (loc != null) {
+        for (ParseError err : loc.getParseErrors()) {
+          String msg = "A parse error occurred: " + StringUtil.escapeMarkdown(err.getMessage())
+              + ". Check the syntax of the file. If the file is invalid, correct the error or [exclude](https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/customizing-code-scanning) the file from analysis.";
+          // file, relative to the source root
+          String relativeFilePath = null;
+          if (file.startsWith(LGTM_SRC)) {
+            relativeFilePath = file.subpath(LGTM_SRC.getNameCount(), file.getNameCount()).toString();
+          }
+          DiagnosticLocation diagLoc = DiagnosticLocation.builder()
+              .setFile(relativeFilePath)
+              .setStartLine(err.getPosition().getLine())
+              .setStartColumn(err.getPosition().getColumn())
+              .setEndLine(err.getPosition().getLine())
+              .setEndColumn(err.getPosition().getColumn())
+              .build();
+          writeDiagnostics(msg, JSDiagnosticKind.PARSE_ERROR, diagLoc);
+        }
+      }
       logEndProcess(start, "Done extracting " + file);
+    } catch (OutOfMemoryError oom) {
+      System.err.println("Out of memory while extracting a file.");
+      System.exit(137); // caught by the CodeQL CLI
     } catch (Throwable t) {
       System.err.println("Exception while extracting " + file + ".");
       t.printStackTrace(System.err);
+      try {
+        writeDiagnostics("Internal error: " + t, JSDiagnosticKind.INTERNAL_ERROR);
+      } catch (IOException ignored) {
+        Exceptions.ignore(ignored, "we are already crashing");
+      }
       System.exit(1);
     }
   }
 
@@ -5,7 +5,6 @@
 import java.io.FileInputStream;
 import java.io.FileReader;
 import java.io.IOException;
-import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.util.LinkedHashSet;
@@ -434,7 +433,7 @@ public boolean supports(File f) {
   }
 
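-  /** @return the number of lines of code extracted, or {@code null} if the file was cached */
+  /** @return the parse result (lines of code, lines of comments, parse errors), or {@code null} if the file was cached */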
-  public Integer extract(File f, ExtractorState state) throws IOException {
+  public ParseResultInfo extract(File f, ExtractorState state) throws IOException {
     FileSnippet snippet = state.getSnippets().get(f.toPath());
     if (snippet != null) {
       return this.extractSnippet(f.toPath(), snippet, state);
@@ -461,7 +460,7 @@ public Integer extract(File f, ExtractorState state) throws IOException {
    * <p>A trap file will be derived from the snippet file, but its file label, source locations, and
    * source archive entry are based on the original file.
    */
-  private Integer extractSnippet(Path file, FileSnippet origin, ExtractorState state) throws IOException {
+  private ParseResultInfo extractSnippet(Path file, FileSnippet origin, ExtractorState state) throws IOException {
     TrapWriter trapwriter = outputConfig.getTrapWriterFactory().mkTrapWriter(file.toFile());
 
     File originalFile = origin.getOriginalFile().toFile();
@@ -495,7 +494,7 @@ private Integer extractSnippet(Path file, FileSnippet origin, ExtractorState sta
    * <p>Also note that we support extraction with TRAP writer factories that are not file-backed;
    * obviously, no caching is done in that scenario.
    */
-  private Integer extractContents(
+  private ParseResultInfo extractContents(
       File extractedFile, Label fileLabel, String source, LocationManager locationManager, ExtractorState state)
       throws IOException {
     ExtractionMetrics metrics = new ExtractionMetrics();
@@ -545,15 +544,15 @@ private Integer extractContents(
       TextualExtractor textualExtractor =
           new TextualExtractor(
               trapwriter, locationManager, source, config.getExtractLines(), metrics, extractedFile);
-      LoCInfo loc = extractor.extract(textualExtractor);
+      ParseResultInfo loc = extractor.extract(textualExtractor);
       int numLines = textualExtractor.isSnippet() ? 0 : textualExtractor.getNumLines();
       int linesOfCode = loc.getLinesOfCode(), linesOfComments = loc.getLinesOfComments();
       trapwriter.addTuple("numlines", fileLabel, numLines, linesOfCode, linesOfComments);
       trapwriter.addTuple("filetype", fileLabel, fileType.toString());
       metrics.stopPhase(ExtractionPhase.FileExtractor_extractContents);
       metrics.writeTimingsToTrap(trapwriter);
       successful = true;
-      return linesOfCode;
+      return loc;
     } finally {
       if (!successful && trapwriter instanceof CachingTrapWriter)
         ((CachingTrapWriter) trapwriter).discard();
 
@@ -3,6 +3,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.Collections;
 import java.util.List;
 import java.util.function.Supplier;
 import java.util.regex.Matcher;
@@ -29,7 +30,7 @@
 
 /** Extractor for handling HTML and XHTML files. */
 public class HTMLExtractor implements IExtractor {
-  private LoCInfo locInfo = new LoCInfo(0, 0);
+  private ParseResultInfo locInfo = new ParseResultInfo(0, 0, Collections.emptyList());
 
   private class JavaScriptHTMLElementHandler implements HtmlPopulator.ElementHandler {
     private final ScopeManager scopeManager;
@@ -212,11 +213,11 @@ public static HTMLExtractor forEmbeddedHtml(ExtractorConfig config) {
   }
 
   @Override
-  public LoCInfo extract(TextualExtractor textualExtractor) throws IOException {
+  public ParseResultInfo extract(TextualExtractor textualExtractor) throws IOException {
     return extractEx(textualExtractor).snd();
   }
 
-  public Pair<List<Label>, LoCInfo> extractEx(TextualExtractor textualExtractor) {
+  public Pair<List<Label>, ParseResultInfo> extractEx(TextualExtractor textualExtractor) {
     // Angular templates contain attribute names that are not valid HTML/XML, such
     // as [foo], (foo), [(foo)], and *foo.
     // Allow a large number of errors in attribute names, so the Jericho parser does
@@ -369,7 +370,7 @@ private void extractSnippet(
               config.getExtractLines(),
               textualExtractor.getMetrics(),
               textualExtractor.getExtractedFile());
-      Pair<Label, LoCInfo> result = extractor.extract(tx, source, toplevelKind, scopeManager);
+      Pair<Label, ParseResultInfo> result = extractor.extract(tx, source, toplevelKind, scopeManager);
       Label toplevelLabel = result.fst();
       if (toplevelLabel != null) { // can be null when script ends up being parsed as JSON
         emitTopLevelXmlNodeBinding(parentLabel, toplevelLabel, trapWriter);
 
@@ -1,6 +1,7 @@
 package com.semmle.js.extractor;
 
 import java.io.IOException;
+import com.semmle.js.parser.ParseError;
 
 /** Generic extractor interface. */
 public interface IExtractor {
@@ -9,5 +10,5 @@ public interface IExtractor {
    * TextualExtractor}, and return information about the number of lines of code and the number of
    * lines of comments extracted.
    */
-  public LoCInfo extract(TextualExtractor textualExtractor) throws IOException;
+  public ParseResultInfo extract(TextualExtractor textualExtractor) throws IOException;
 }