Skip to content

Commit d146514

Browse files
authored
Merge pull request github#13928 from asgerf/js/ignore-huge-files
JS: Ignore files larger than 10 MB during extraction
2 parents d2fca1b + b93e404 commit d146514

File tree

4 files changed

+49
-24
lines changed

4 files changed

+49
-24
lines changed

javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ public class AutoBuild {
222222
private boolean installDependencies = false;
223223
private final VirtualSourceRoot virtualSourceRoot;
224224
private ExtractorState state;
225+
private final long maximumFileSizeInMegabytes;
225226

226227
/** The default timeout when installing dependencies, in milliseconds. */
227228
public static final int INSTALL_DEPENDENCIES_DEFAULT_TIMEOUT = 10 * 60 * 1000; // 10 minutes
@@ -236,6 +237,7 @@ public AutoBuild() {
236237
this.defaultEncoding = getEnvVar("LGTM_INDEX_DEFAULT_ENCODING");
237238
this.installDependencies = Boolean.valueOf(getEnvVar("LGTM_INDEX_TYPESCRIPT_INSTALL_DEPS"));
238239
this.virtualSourceRoot = makeVirtualSourceRoot();
240+
this.maximumFileSizeInMegabytes = EnvironmentVariables.getMegabyteCountFromPrefixedEnv("MAX_FILE_SIZE", 10);
239241
setupFileTypes();
240242
setupXmlMode();
241243
setupMatchers();
@@ -446,8 +448,8 @@ private boolean addPathPattern(Set<Path> patterns, Path base, String pattern) {
446448
}
447449

448450
/**
449-
* Returns whether the autobuilder has seen code.
450-
* This is overridden in tests.
451+
* Returns whether the autobuilder has seen code.
452+
* This is overridden in tests.
451453
*/
452454
protected boolean hasSeenCode() {
453455
return seenCode;
@@ -741,12 +743,12 @@ private CompletableFuture<?> extractSource() throws IOException {
741743
dependencyInstallationResult = this.preparePackagesAndDependencies(filesToExtract);
742744
}
743745
Set<Path> extractedFiles = new LinkedHashSet<>();
744-
746+
745747
// Extract HTML files as they may contain TypeScript
746748
CompletableFuture<?> htmlFuture = extractFiles(
747749
filesToExtract, extractedFiles, extractors,
748750
f -> extractors.fileType(f) == FileType.HTML);
749-
751+
750752
htmlFuture.join(); // Wait for HTML extraction to be finished.
751753

752754
// extract TypeScript projects and files
@@ -1229,6 +1231,11 @@ private void doExtract(FileExtractor extractor, Path file, ExtractorState state)
12291231
warn("Skipping " + file + ", which does not exist.");
12301232
return;
12311233
}
1234+
long fileSize = f.length();
1235+
if (fileSize > 1_000_000L * this.maximumFileSizeInMegabytes) {
1236+
warn("Skipping " + file + " because it is too large (" + StringUtil.printFloat(fileSize / 1_000_000.0) + " MB). The limit is " + this.maximumFileSizeInMegabytes + " MB.");
1237+
return;
1238+
}
12321239

12331240
try {
12341241
long start = logBeginProcess("Extracting " + file);

javascript/extractor/src/com/semmle/js/extractor/EnvironmentVariables.java

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
package com.semmle.js.extractor;
22

3+
import com.semmle.util.data.UnitParser;
34
import com.semmle.util.exception.UserError;
45
import com.semmle.util.process.Env;
56
import com.semmle.util.process.Env.Var;
67

78
public class EnvironmentVariables {
89
public static final String CODEQL_EXTRACTOR_JAVASCRIPT_ROOT_ENV_VAR =
910
"CODEQL_EXTRACTOR_JAVASCRIPT_ROOT";
10-
11+
1112
public static final String CODEQL_EXTRACTOR_JAVASCRIPT_SCRATCH_DIR_ENV_VAR =
1213
"CODEQL_EXTRACTOR_JAVASCRIPT_SCRATCH_DIR";
1314

@@ -19,6 +20,36 @@ public class EnvironmentVariables {
1920

2021
public static final String CODEQL_DIST_ENV_VAR = "CODEQL_DIST";
2122

23+
/**
24+
* Returns a number of megabytes by reading an environment variable with the given suffix,
25+
* or the default value if not set.
26+
* <p>
27+
* The following prefixes are tried in order, and the first variable with a non-empty value is used:
28+
* <code>CODEQL_EXTRACTOR_JAVASCRIPT_</code>,
29+
* <code>LGTM_</code>,
30+
* <code>SEMMLE_</code>.
* <p>
* A variable that is set to the empty string is treated the same as an unset variable.
* The value is parsed with <code>UnitParser.parseOpt(value, UnitParser.MEGABYTES)</code>;
* presumably a value without an explicit unit is read as megabytes (confirm against
* <code>UnitParser</code>).
*
* @param suffix the environment variable name without its prefix, e.g. <code>MAX_FILE_SIZE</code>
* @param defaultValue the value returned when none of the prefixed variables has a non-empty value
* @return the parsed amount, or <code>defaultValue</code> if no variable is set
* @throws UserError if a variable has a non-empty value that <code>UnitParser.parseOpt</code>
*     cannot parse (it returns <code>null</code>)
31+
*/
32+
public static int getMegabyteCountFromPrefixedEnv(String suffix, int defaultValue) {
// envVar always holds the name of the variable last looked up, so the error below
// names the variable whose value was actually rejected.
33+
String envVar = "CODEQL_EXTRACTOR_JAVASCRIPT_" + suffix;
34+
String value = Env.systemEnv().get(envVar);
35+
if (value == null || value.length() == 0) {
36+
envVar = "LGTM_" + suffix;
37+
value = Env.systemEnv().get(envVar);
38+
}
39+
if (value == null || value.length() == 0) {
40+
envVar = "SEMMLE_" + suffix;
41+
value = Env.systemEnv().get(envVar);
42+
}
43+
if (value == null || value.length() == 0) {
44+
return defaultValue;
45+
}
46+
Integer amount = UnitParser.parseOpt(value, UnitParser.MEGABYTES);
47+
if (amount == null) {
// A value that is present but unparseable is reported as a user error, not replaced by the default.
48+
throw new UserError("Invalid value for " + envVar + ": '" + value + "'");
49+
}
50+
return amount;
51+
}
52+
2253
/**
2354
* Gets the extractor root based on the <code>CODEQL_EXTRACTOR_JAVASCRIPT_ROOT</code> or <code>
2455
* SEMMLE_DIST</code> environment variable, or <code>null</code> if neither is set.

javascript/extractor/src/com/semmle/ts/extractor/TypeScriptParser.java

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -273,32 +273,15 @@ private List<String> getNodeJsRuntimeInvocation(String... args) {
273273
return result;
274274
}
275275

276-
private static int getMegabyteCountFromPrefixedEnv(String suffix, int defaultValue) {
277-
String envVar = "SEMMLE_" + suffix;
278-
String value = Env.systemEnv().get(envVar);
279-
if (value == null || value.length() == 0) {
280-
envVar = "LGTM_" + suffix;
281-
value = Env.systemEnv().get(envVar);
282-
}
283-
if (value == null || value.length() == 0) {
284-
return defaultValue;
285-
}
286-
Integer amount = UnitParser.parseOpt(value, UnitParser.MEGABYTES);
287-
if (amount == null) {
288-
throw new UserError("Invalid value for " + envVar + ": '" + value + "'");
289-
}
290-
return amount;
291-
}
292-
293276
/** Start the Node.js parser wrapper process. */
294277
private void setupParserWrapper() {
295278
verifyNodeInstallation();
296279

297280
int mainMemoryMb =
298281
typescriptRam != 0
299282
? typescriptRam
300-
: getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_SUFFIX, 2000);
301-
int reserveMemoryMb = getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_RESERVE_SUFFIX, 400);
283+
: EnvironmentVariables.getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_SUFFIX, 2000);
284+
int reserveMemoryMb = EnvironmentVariables.getMegabyteCountFromPrefixedEnv(TYPESCRIPT_RAM_RESERVE_SUFFIX, 400);
302285

303286
System.out.println("Memory for TypeScript process: " + mainMemoryMb + " MB, and " + reserveMemoryMb + " MB reserve");
304287

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Files larger than 10 MB are no longer extracted or analyzed.

0 commit comments

Comments
 (0)