Skip to content

Commit 1621190

Browse files
authored
Parallelize block list scanning. Add mime type check for secret (#1575)
detection.
1 parent 64ff5bf commit 1621190

File tree

9 files changed

+303
-116
lines changed

9 files changed

+303
-116
lines changed

server/src/main/java/org/eclipse/openvsx/scanning/BlocklistCheckService.java

Lines changed: 32 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,17 @@
1919
import org.eclipse.openvsx.util.TempFile;
2020
import org.slf4j.Logger;
2121
import org.slf4j.LoggerFactory;
22+
import org.springframework.beans.factory.annotation.Qualifier;
2223
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
2324
import org.springframework.core.annotation.Order;
25+
import org.springframework.core.task.AsyncTaskExecutor;
2426
import org.springframework.stereotype.Service;
2527

2628
import java.io.IOException;
2729
import java.io.InputStream;
2830
import java.util.*;
31+
import java.util.concurrent.CompletableFuture;
32+
import java.util.concurrent.ConcurrentHashMap;
2933
import java.util.zip.ZipEntry;
3034
import java.util.zip.ZipException;
3135
import java.util.zip.ZipFile;
@@ -52,15 +56,18 @@ public class BlocklistCheckService implements PublishCheck {
5256
private final BlocklistCheckConfig config;
5357
private final ExtensionScanConfig scanConfig;
5458
private final FileDecisionRepository fileDecisionRepository;
59+
private final AsyncTaskExecutor taskExecutor;
5560

5661
public BlocklistCheckService(
5762
BlocklistCheckConfig config,
5863
ExtensionScanConfig scanConfig,
59-
FileDecisionRepository fileDecisionRepository
64+
FileDecisionRepository fileDecisionRepository,
65+
@Qualifier("applicationTaskExecutor") AsyncTaskExecutor taskExecutor
6066
) {
6167
this.config = config;
6268
this.scanConfig = scanConfig;
6369
this.fileDecisionRepository = fileDecisionRepository;
70+
this.taskExecutor = taskExecutor;
6471
}
6572

6673
@Override
@@ -121,7 +128,8 @@ public Result check(Context context) {
121128
* Check extension files against the blocklist.
122129
*/
123130
private List<BlockedFileInfo> checkForBlockedFiles(TempFile extensionFile) {
124-
Map<String, String> fileHashes = new LinkedHashMap<>();
131+
// Thread-safe map for parallel hashing: hash -> filePath
132+
Map<String, String> fileHashes = new ConcurrentHashMap<>();
125133

126134
try (ZipFile zipFile = new ZipFile(extensionFile.getPath().toFile())) {
127135
List<? extends ZipEntry> entries = Collections.list(zipFile.entries());
@@ -132,33 +140,30 @@ private List<BlockedFileInfo> checkForBlockedFiles(TempFile extensionFile) {
132140
scanConfig.getMaxArchiveSizeBytes()
133141
);
134142

135-
int filesHashed = 0;
136-
int filesSkipped = 0;
137-
138-
for (ZipEntry entry : entries) {
139-
if (entry.isDirectory()) {
140-
continue;
141-
}
142-
143-
String filePath = entry.getName();
144-
145-
if (!ArchiveUtil.isSafePath(filePath)) {
146-
logger.debug("Skipping unsafe path: {}", filePath);
147-
filesSkipped++;
148-
continue;
149-
}
150-
151-
try {
152-
String hash = computeHash(zipFile, entry);
153-
fileHashes.put(hash, filePath);
154-
filesHashed++;
155-
} catch (IOException e) {
156-
logger.warn("Failed to hash file {}: {}", filePath, e.getMessage());
157-
filesSkipped++;
158-
}
143+
var hashableEntries = entries.stream()
144+
.filter(entry -> !entry.isDirectory())
145+
.filter(entry -> ArchiveUtil.isSafePath(entry.getName()))
146+
.toList();
147+
148+
List<CompletableFuture<Void>> futures = new ArrayList<>(hashableEntries.size());
149+
150+
for (ZipEntry entry : hashableEntries) {
151+
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
152+
try {
153+
String hash = computeHash(zipFile, entry);
154+
fileHashes.put(hash, entry.getName());
155+
} catch (IOException e) {
156+
logger.warn("Failed to hash file {}: {}", entry.getName(), e.getMessage());
157+
}
158+
}, taskExecutor);
159+
futures.add(future);
159160
}
160161

161-
logger.debug("Blocklist check: hashed {} files, skipped {}", filesHashed, filesSkipped);
162+
// Wait for all hashing to complete (non-blocking wait)
163+
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
164+
165+
logger.debug("Blocklist check: hashed {} files, skipped {}",
166+
fileHashes.size(), entries.size() - hashableEntries.size());
162167

163168
} catch (ZipException e) {
164169
logger.error("Failed to open extension file as zip: {}", e.getMessage());

server/src/main/java/org/eclipse/openvsx/scanning/SecretCheckService.java

Lines changed: 57 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@
2727
import java.util.ArrayList;
2828
import java.util.Collections;
2929
import java.util.List;
30-
import java.util.concurrent.ExecutionException;
31-
import java.util.concurrent.Future;
30+
import java.util.concurrent.CompletableFuture;
3231
import java.util.concurrent.atomic.AtomicInteger;
3332
import java.util.zip.ZipEntry;
3433
import java.util.zip.ZipException;
@@ -122,81 +121,77 @@ public PublishCheck.Result check(PublishCheck.Context context) {
122121
* Callers should check {@link #isEnabled()} before invoking this method.
123122
*/
124123
private SecretDetector.Result scanForSecrets(@NotNull TempFile extensionFile) {
125-
// Use thread-safe collection for parallel processing
124+
// Thread-safe collection for parallel processing
126125
List<SecretDetector.Finding> findings = Collections.synchronizedList(new ArrayList<>());
127-
AtomicInteger findingsCount = new AtomicInteger(0); // Cap findings to protect memory
126+
AtomicInteger findingsCount = new AtomicInteger(0);
128127

129128
try (ZipFile zipFile = new ZipFile(extensionFile.getPath().toFile())) {
130129
List<? extends ZipEntry> entries = Collections.list(zipFile.entries());
131130
ArchiveUtil.enforceArchiveLimits(entries, scanConfig.getMaxEntryCount(), scanConfig.getMaxArchiveSizeBytes());
132131

132+
// Filter to only scannable entries BEFORE creating tasks (optimization)
133+
List<? extends ZipEntry> scannableEntries = entries.stream()
134+
.filter(entry -> !entry.isDirectory())
135+
.toList();
136+
133137
AtomicInteger filesScanned = new AtomicInteger(0);
134138
AtomicInteger filesSkipped = new AtomicInteger(0);
135139

136140
long startTime = System.currentTimeMillis();
137141
long timeoutMillis = config.getTimeoutSeconds() * 1000L;
138-
List<Future<?>> tasks = new ArrayList<>(entries.size());
139142

140-
try {
141-
for (ZipEntry entry : entries) {
142-
tasks.add(taskExecutor.submit(() -> {
143-
if (System.currentTimeMillis() - startTime > timeoutMillis) {
144-
throw new SecretScanningTimeoutException("Secret detection timed out");
145-
}
146-
147-
if (Thread.currentThread().isInterrupted()) {
148-
return null;
149-
}
150-
151-
if (entry.isDirectory()) {
152-
return null;
153-
}
154-
155-
String filePath = entry.getName();
156-
try {
157-
boolean scanned = fileContentScanner.scanFile(
158-
zipFile,
159-
entry,
160-
findings,
161-
startTime,
162-
timeoutMillis,
163-
findingsCount,
164-
this::recordFinding
165-
);
166-
if (scanned) {
167-
filesScanned.incrementAndGet();
168-
} else {
169-
filesSkipped.incrementAndGet();
170-
}
171-
} catch (SecretScanningTimeoutException | ScanCancelledException e) {
172-
throw e;
173-
} catch (Exception e) {
174-
logger.error("Failed to scan file {}: {}", filePath, e.getMessage());
175-
}
176-
return null;
177-
}));
178-
}
179-
180-
for (Future<?> task : tasks) {
143+
// Submit all tasks and collect CompletableFutures
144+
List<CompletableFuture<Void>> futures = new ArrayList<>(scannableEntries.size());
145+
146+
for (ZipEntry entry : scannableEntries) {
147+
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
148+
// Check timeout at start of each task
149+
if (System.currentTimeMillis() - startTime > timeoutMillis) {
150+
throw new SecretScanningTimeoutException("Secret detection timed out");
151+
}
152+
153+
if (Thread.currentThread().isInterrupted()) {
154+
return;
155+
}
156+
157+
String filePath = entry.getName();
181158
try {
182-
task.get();
183-
} catch (InterruptedException ie) {
184-
Thread.currentThread().interrupt();
185-
break;
186-
} catch (ExecutionException ee) {
187-
Throwable cause = ee.getCause();
188-
if (cause instanceof SecretScanningTimeoutException) {
189-
throw (SecretScanningTimeoutException) ee.getCause();
190-
} else if (cause instanceof ScanCancelledException) {
191-
break;
159+
boolean scanned = fileContentScanner.scanFile(
160+
zipFile,
161+
entry,
162+
findings,
163+
startTime,
164+
timeoutMillis,
165+
findingsCount,
166+
this::recordFinding
167+
);
168+
if (scanned) {
169+
filesScanned.incrementAndGet();
170+
} else {
171+
filesSkipped.incrementAndGet();
192172
}
173+
} catch (SecretScanningTimeoutException | ScanCancelledException e) {
174+
throw e;
175+
} catch (Exception e) {
176+
logger.error("Failed to scan file {}: {}", filePath, e.getMessage());
177+
filesSkipped.incrementAndGet();
193178
}
179+
}, taskExecutor);
180+
futures.add(future);
181+
}
182+
183+
// Wait for all tasks to complete (or fail)
184+
try {
185+
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
186+
} catch (java.util.concurrent.CompletionException ce) {
187+
Throwable cause = ce.getCause();
188+
if (cause instanceof SecretScanningTimeoutException) {
189+
throw (SecretScanningTimeoutException) cause;
190+
} else if (cause instanceof ScanCancelledException) {
191+
// Max findings reached - continue with what we have
192+
logger.debug("Scan cancelled early: {}", cause.getMessage());
194193
}
195-
} finally {
196-
// Ensure we cancel any remaining tasks if the loop exits early
197-
for (Future<?> f : tasks) {
198-
f.cancel(true);
199-
}
194+
// Other exceptions: log and continue
200195
}
201196

202197
logger.debug("Secret scan complete: {} files scanned, {} files skipped, {} findings",
@@ -214,7 +209,7 @@ private SecretDetector.Result scanForSecrets(@NotNull TempFile extensionFile) {
214209
logger.error("Failed to scan extension file: {}", e.getMessage());
215210
throw new SecretScanningException("Failed to scan extension file: " + e.getMessage(), e);
216211
} catch (SecretScanningException e) {
217-
throw e; // Re-throw already wrapped exceptions
212+
throw e;
218213
} catch (Exception e) {
219214
logger.error("Failed to scan extension file: {}", e.getMessage());
220215
throw new SecretScanningException("Failed to scan extension file: " + e.getMessage(), e);

server/src/main/java/org/eclipse/openvsx/scanning/SecretDetector.java

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,18 @@
1515
import org.slf4j.Logger;
1616
import org.slf4j.LoggerFactory;
1717
import org.apache.commons.lang3.StringUtils;
18+
import org.apache.tika.Tika;
1819
import org.eclipse.openvsx.util.ArchiveUtil;
1920
import org.eclipse.openvsx.util.SizeLimitInputStream;
2021
import jakarta.validation.constraints.NotNull;
2122
import javax.annotation.Nullable;
2223

2324
import java.io.BufferedReader;
25+
import java.io.BufferedInputStream;
2426
import java.io.IOException;
2527
import java.io.InputStream;
2628
import java.io.InputStreamReader;
2729
import java.nio.charset.StandardCharsets;
28-
import java.util.ArrayList;
29-
import java.util.Collections;
3030
import java.util.HashSet;
3131
import java.util.List;
3232
import java.util.Map;
@@ -47,6 +47,10 @@ interface FindingRecorder {
4747
}
4848

4949
private static final Logger logger = LoggerFactory.getLogger(SecretDetector.class);
50+
51+
// Tika for content-based file type detection (thread-safe, reusable)
52+
private static final Tika tika = new Tika();
53+
5054
private final AhoCorasick keywordMatcher;
5155
private final Map<String, List<SecretRule>> keywordToRules;
5256
private final List<SecretRule> rules;
@@ -55,6 +59,7 @@ interface FindingRecorder {
5559
private final AhoCorasick globalStopwordMatcher;
5660
private final AhoCorasick globalExcludedExtensionMatcher;
5761
private final AhoCorasick inlineSuppressionMatcher;
62+
private final List<Pattern> skipMimeTypePatterns;
5863
private final EntropyCalculator entropyCalculator;
5964
private final long maxFileSizeBytes;
6065
private final int maxLineLength;
@@ -71,6 +76,7 @@ interface FindingRecorder {
7176
@Nullable AhoCorasick stopwordMatcher,
7277
@Nullable AhoCorasick excludedExtensionMatcher,
7378
@Nullable AhoCorasick inlineSuppressionMatcher,
79+
@Nullable List<Pattern> skipMimeTypePatterns,
7480
@NotNull EntropyCalculator entropyCalculator,
7581
long maxFileSizeBytes,
7682
int maxLineLength,
@@ -86,6 +92,7 @@ interface FindingRecorder {
8692
this.globalStopwordMatcher = stopwordMatcher;
8793
this.globalExcludedExtensionMatcher = excludedExtensionMatcher;
8894
this.inlineSuppressionMatcher = inlineSuppressionMatcher;
95+
this.skipMimeTypePatterns = skipMimeTypePatterns != null ? skipMimeTypePatterns : List.of();
8996
this.entropyCalculator = entropyCalculator;
9097
this.maxFileSizeBytes = maxFileSizeBytes;
9198
this.maxLineLength = maxLineLength;
@@ -130,6 +137,11 @@ boolean scanFile(@NotNull ZipFile zipFile,
130137
return false;
131138
}
132139

140+
// Check if file should be skipped based on MIME type
141+
if (shouldExcludeByMimeType(zipFile, entry)) {
142+
return false;
143+
}
144+
133145
// Read the file line by line with a hard byte limit
134146
try (InputStream zipStream = zipFile.getInputStream(entry);
135147
// Use a limited stream since the entry header may not correctly reflect the file size
@@ -362,10 +374,52 @@ private boolean isAllowlistedContent(@NotNull String secretValue) {
362374

363375
return false;
364376
}
365-
366-
// ========================================================================
367-
// RESULT TYPES
368-
// ========================================================================
377+
378+
/**
379+
* Check if file should be skipped based on MIME type detection using Apache Tika.
380+
*
381+
* Uses Tika to detect MIME type from file content (magic bytes, structure, heuristics).
382+
* This is more reliable than extension-based detection for files without extensions
383+
* or with misleading extensions.
384+
*
385+
* Skip patterns are configured via {@code allowlist.skip-mime-types} in the YAML config.
386+
* Each pattern is a regex matched against the detected MIME type.
387+
*
388+
* @return true if file should be skipped, false if it should be scanned
389+
*/
390+
private boolean shouldExcludeByMimeType(@NotNull ZipFile zipFile, @NotNull ZipEntry entry) {
391+
// If no skip patterns configured, don't skip any files based on MIME type
392+
if (skipMimeTypePatterns.isEmpty()) {
393+
return false;
394+
}
395+
396+
try (InputStream is = zipFile.getInputStream(entry);
397+
BufferedInputStream bis = new BufferedInputStream(is)) {
398+
399+
// Tika.detect() reads only the bytes needed for detection
400+
String mimeType = tika.detect(bis, entry.getName());
401+
402+
if (mimeType == null) {
403+
return false; // Unknown type, scan it to be safe
404+
}
405+
406+
// Check against configured skip patterns (regex)
407+
for (Pattern pattern : skipMimeTypePatterns) {
408+
if (pattern.matcher(mimeType).find()) {
409+
logger.debug("Skipping file (MIME {} matches pattern {}): {}",
410+
mimeType, pattern.pattern(), entry.getName());
411+
return true;
412+
}
413+
}
414+
415+
return false;
416+
417+
} catch (IOException e) {
418+
// If we can't detect type, don't skip - let the main scan handle errors
419+
logger.debug("Could not detect file type for {}: {}", entry.getName(), e.getMessage());
420+
return false;
421+
}
422+
}
369423

370424
/**
371425
* Result of a secret scan. Immutable.

0 commit comments

Comments
 (0)