
Commit 0912970

JavaZeroo and claude authored
Optimize memory usage for large file processing (#61)
* perf: optimize large file memory handling - Avoid repeated split() in ValueExtractor by accepting pre-split lines array - Add LTTB (Largest Triangle Three Buckets) downsampling algorithm for chart rendering - Optimize LocalStorage: large files (>500KB) store only metricsData, not raw content - Add parsing status indicator and large file warning in FileList - Add i18n translations for new UI states This should significantly reduce memory usage when handling files with 100k+ lines. * fix: range-aware downsampling for zoom support When zoomed in, filter data to visible range first, then apply LTTB downsampling. This ensures full detail is visible when zoomed in to a specific range (e.g., 100-200 out of 100000 points). * revert: remove downsampling - display all data points User prefers to see all data points without any downsampling. Removed LTTB algorithm and related code. * perf: optimize chart sync with O(1) index lookup Use WeakMap cache for x-value to index mapping, replacing O(n) findIndex with O(1) Map.get() for faster hover synchronization on large datasets. * test: add stress test scripts for large file handling - stress-test.js: Tests ValueExtractor performance with 10K/50K/100K lines - generate-test-file.js: Generates test log files for browser testing * test: add stress test scripts for large file handling - stress-test.js: Tests ValueExtractor performance with 10K/50K/100K/500K/1M lines - generate-test-file.js: Generates test log files for browser testing - Add npm run test:stress script - Update CI to run stress tests on every PR Test results: - 10K lines: 13ms parse time, 14MB memory - 50K lines: 50ms parse time, 30MB memory - 100K lines: 110ms parse time, 66MB memory - 500K lines: 378ms parse time, 284MB memory - 1M lines: 681ms parse time, 606MB memory * chore: increase LARGE_FILE_THRESHOLD from 500KB to 5MB Raise the threshold for triggering large file handling from 500KB to 5MB. This allows more files to benefit from full content persistence in LocalStorage while still protecting against quota issues with very large files. * chore: sync LARGE_FILE_THRESHOLD in stress-test.js to 5MB Keep stress test consistent with App.jsx threshold change. --------- Co-authored-by: Claude <[email protected]>
1 parent 04e722b commit 0912970
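The two perf techniques described in the message above touch ValueExtractor and the chart hover-sync code, whose diffs are not shown in this excerpt. The sketches below illustrate the general idea only; class, function, and field names here (the exact extractByKeyword signature, data[i].x, indexForX) are assumptions rather than the project's actual code, though the { value, line } result shape matches what scripts/stress-test.js reads.

// Sketch 1 (hypothetical): extractByKeyword accepts either a raw string or a
// pre-split array of lines, so one split('\n') can be shared across metrics.
export class ValueExtractor {
  static extractByKeyword(input, keyword) {
    // Reuse a caller-provided lines array instead of re-splitting a multi-MB string per metric
    const lines = Array.isArray(input) ? input : input.split('\n');
    const results = [];
    for (let i = 0; i < lines.length; i++) {
      const idx = lines[i].indexOf(keyword);
      if (idx === -1) continue;
      // parseFloat skips leading whitespace and stops at the next non-numeric character
      const value = parseFloat(lines[i].slice(idx + keyword.length));
      if (!Number.isNaN(value)) results.push({ value, line: i + 1 });
    }
    return results;
  }
}

// Sketch 2 (hypothetical): memoize an x-value -> index Map per dataset in a WeakMap,
// so hover synchronization does an O(1) Map.get() instead of an O(n) findIndex().
const indexCache = new WeakMap();

function indexForX(data, x) {
  let map = indexCache.get(data);
  if (!map) {
    map = new Map();
    for (let i = 0; i < data.length; i++) map.set(data[i].x, i);
    indexCache.set(data, map);
  }
  return map.get(x) ?? -1;
}

Keying the cache on the dataset array itself means that when a dataset is replaced and the old array becomes unreachable, its index Map can be garbage-collected along with it, so no explicit invalidation is needed.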

12 files changed, +330 -56 lines


.github/workflows/ci.yml

Lines changed: 16 additions & 1 deletion
@@ -3,7 +3,7 @@ name: CI
 on:
   pull_request:
   push:
-    branches: [ main ]
+    branches: [ main, master ]

 jobs:
   test:
@@ -15,6 +15,21 @@ jobs:
           node-version: '20'
           cache: 'npm'
       - run: npm install
+      - run: npm run lint
       - run: npm test
         env:
           CI: true
+
+  stress-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+          cache: 'npm'
+      - run: npm install
+      - name: Run stress test (10K - 1M lines)
+        run: npm run test:stress
+        env:
+          NODE_OPTIONS: '--max-old-space-size=4096'
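The 4 GB heap cap here (--max-old-space-size=4096) leaves generous headroom over the roughly 606 MB peak that the commit message reports for the 1M-line case, so the stress job should tolerate moderate memory regressions without flaking on CI runners.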

package.json

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,8 @@
     "build": "vite build",
     "lint": "eslint .",
     "preview": "vite preview",
-    "test": "vitest run --coverage"
+    "test": "vitest run --coverage",
+    "test:stress": "node scripts/stress-test.js"
   },
   "dependencies": {
     "@tailwindcss/forms": "^0.5.10",

public/locales/en/translation.json

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@
   "fileList.disabled": "Disabled",
   "fileList.config": "Configure file {{name}}",
   "fileList.delete": "Remove file {{name}}",
+  "fileList.parsing": "Parsing",
+  "fileList.needsReupload": "Large file - re-upload required to re-parse",
+  "fileList.needsReuploadTip": "File data is cached, but re-upload is required to modify parsing config",
   "comparison.title": "Compare Mode",
   "comparison.select": "Select comparison mode",
   "comparison.multiFileMode": "Multi-file comparison mode",

public/locales/zh/translation.json

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@
   "fileList.disabled": "已禁用",
   "fileList.config": "配置文件 {{name}}",
   "fileList.delete": "删除文件 {{name}}",
+  "fileList.parsing": "解析中",
+  "fileList.needsReupload": "大文件 - 需要重新上传才能重新解析",
+  "fileList.needsReuploadTip": "此文件数据已缓存,但需要重新上传才能修改解析配置",
   "comparison.title": "对比模式",
   "comparison.select": "选择数据对比模式",
   "comparison.multiFileMode": "多文件对比模式",

scripts/generate-test-file.js

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+/**
+ * Generate test log file for browser testing
+ * Run with: node scripts/generate-test-file.js [lines]
+ * Example: node scripts/generate-test-file.js 100000
+ */
+
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const numLines = parseInt(process.argv[2]) || 100000;
+
+console.log(`\n📝 Generating test file with ${numLines.toLocaleString()} lines...`);
+
+const lines = [];
+for (let i = 0; i < numLines; i++) {
+  const step = i;
+  const loss = Math.random() * 2 + Math.exp(-i / 10000);
+  const gradNorm = Math.random() * 0.5 + 0.1;
+  lines.push(`step: ${step} | loss: ${loss.toFixed(6)} | grad_norm: ${gradNorm.toFixed(6)}`);
+}
+
+const content = lines.join('\n');
+const outputPath = path.join(__dirname, `test-${numLines}.log`);
+
+fs.writeFileSync(outputPath, content);
+
+const stats = fs.statSync(outputPath);
+const sizeMB = (stats.size / 1024 / 1024).toFixed(2);
+
+console.log(`✓ Created: ${outputPath}`);
+console.log(`✓ Size: ${sizeMB} MB`);
+console.log(`\n📌 Drag this file into the Log Analyzer to test!`);
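The line count is the script's only argument. For example, node scripts/generate-test-file.js 150000 writes scripts/test-150000.log; at roughly 50 bytes per generated line that is about 7 to 8 MB, just over the new 5 MB threshold, which makes it a convenient fixture for exercising the metricsData-only persistence path in the browser.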

scripts/stress-test.js

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
+/**
+ * Stress test for large file handling
+ * Run with: node scripts/stress-test.js
+ */
+
+import { ValueExtractor } from '../src/utils/ValueExtractor.js';
+
+// Generate a large test log file
+function generateTestLog(numLines) {
+  console.log(`\n📝 Generating ${numLines.toLocaleString()} lines of test data...`);
+  const startTime = Date.now();
+
+  const lines = [];
+  for (let i = 0; i < numLines; i++) {
+    const step = i;
+    const loss = Math.random() * 2 + Math.exp(-i / 10000); // Decreasing loss with noise
+    const gradNorm = Math.random() * 0.5 + 0.1;
+    lines.push(`step: ${step} | loss: ${loss.toFixed(6)} | grad_norm: ${gradNorm.toFixed(6)}`);
+  }
+
+  const content = lines.join('\n');
+  const elapsed = Date.now() - startTime;
+  const sizeBytes = Buffer.byteLength(content, 'utf8');
+  const sizeMB = (sizeBytes / 1024 / 1024).toFixed(2);
+
+  console.log(` ✓ Generated in ${elapsed}ms`);
+  console.log(` ✓ Size: ${sizeMB} MB (${sizeBytes.toLocaleString()} bytes)`);
+
+  return { content, sizeBytes };
+}
+
+// Test ValueExtractor performance
+function testValueExtractor(content) {
+  console.log('\n🔍 Testing ValueExtractor...');
+
+  // Test 1: String input (old way - splits every time)
+  console.log('\n Test 1: extractByKeyword with string input');
+  let start = Date.now();
+  const result1 = ValueExtractor.extractByKeyword(content, 'loss:');
+  let elapsed = Date.now() - start;
+  console.log(` ✓ Found ${result1.length.toLocaleString()} matches in ${elapsed}ms`);
+
+  // Test 2: Pre-split lines (optimized way)
+  console.log('\n Test 2: extractByKeyword with pre-split lines');
+  start = Date.now();
+  const lines = content.split('\n');
+  const splitTime = Date.now() - start;
+  console.log(` ✓ Split into ${lines.length.toLocaleString()} lines in ${splitTime}ms`);
+
+  start = Date.now();
+  const result2 = ValueExtractor.extractByKeyword(lines, 'loss:');
+  elapsed = Date.now() - start;
+  console.log(` ✓ Found ${result2.length.toLocaleString()} matches in ${elapsed}ms`);
+
+  // Test 3: Multiple metrics with pre-split lines
+  console.log('\n Test 3: Multiple metrics with pre-split lines (simulates worker)');
+  start = Date.now();
+  const lossResults = ValueExtractor.extractByKeyword(lines, 'loss:');
+  const gradResults = ValueExtractor.extractByKeyword(lines, 'grad_norm:');
+  elapsed = Date.now() - start;
+  console.log(` ✓ Loss: ${lossResults.length.toLocaleString()} matches`);
+  console.log(` ✓ Grad Norm: ${gradResults.length.toLocaleString()} matches`);
+  console.log(` ✓ Total time: ${elapsed}ms`);
+
+  // Verify data integrity
+  console.log('\n Verifying data integrity...');
+  if (result1.length === result2.length) {
+    console.log(` ✓ Match counts are equal: ${result1.length}`);
+  } else {
+    console.log(` ✗ ERROR: Match counts differ! ${result1.length} vs ${result2.length}`);
+  }
+
+  return { lossResults, gradResults, lines };
+}
+
+// Test memory usage
+function testMemoryUsage(label) {
+  const used = process.memoryUsage();
+  console.log(`\n📊 Memory Usage (${label}):`);
+  console.log(` Heap Used: ${(used.heapUsed / 1024 / 1024).toFixed(2)} MB`);
+  console.log(` Heap Total: ${(used.heapTotal / 1024 / 1024).toFixed(2)} MB`);
+  console.log(` RSS: ${(used.rss / 1024 / 1024).toFixed(2)} MB`);
+  return used;
+}
+
+// Main test runner
+async function runStressTest() {
+  console.log('═'.repeat(60));
+  console.log('🚀 STRESS TEST FOR LARGE FILE HANDLING');
+  console.log('═'.repeat(60));
+
+  const testCases = [
+    { lines: 10000, name: '10K lines' },
+    { lines: 50000, name: '50K lines' },
+    { lines: 100000, name: '100K lines' },
+    { lines: 500000, name: '500K lines' },
+    { lines: 1000000, name: '1M lines' },
+  ];
+
+  testMemoryUsage('Initial');
+
+  for (const testCase of testCases) {
+    console.log('\n' + '─'.repeat(60));
+    console.log(`📋 TEST CASE: ${testCase.name}`);
+    console.log('─'.repeat(60));
+
+    // Generate test data
+    const { content, sizeBytes } = generateTestLog(testCase.lines);
+
+    // Check if this would be considered a "large file"
+    const LARGE_FILE_THRESHOLD = 5 * 1024 * 1024; // 5MB
+    const isLargeFile = sizeBytes > LARGE_FILE_THRESHOLD;
+    console.log(`\n Large file threshold: ${isLargeFile ? '⚠️ EXCEEDS' : '✓ Within'} (${(LARGE_FILE_THRESHOLD / 1024 / 1024).toFixed(0)}MB)`);
+
+    // Test ValueExtractor
+    const { lossResults, gradResults } = testValueExtractor(content);
+
+    // Memory after processing
+    testMemoryUsage('After processing');
+
+    // Summary
+    console.log('\n📈 Results Summary:');
+    console.log(` Lines processed: ${testCase.lines.toLocaleString()}`);
+    console.log(` Loss data points: ${lossResults.length.toLocaleString()}`);
+    console.log(` Grad norm data points: ${gradResults.length.toLocaleString()}`);
+
+    // Verify first and last values
+    if (lossResults.length > 0) {
+      console.log(` First loss value: ${lossResults[0].value.toFixed(6)} (line ${lossResults[0].line})`);
+      console.log(` Last loss value: ${lossResults[lossResults.length - 1].value.toFixed(6)} (line ${lossResults[lossResults.length - 1].line})`);
+    }
+
+    // Force GC if available
+    if (global.gc) {
+      global.gc();
+      console.log('\n 🧹 Garbage collection triggered');
+    }
+  }
+
+  console.log('\n' + '═'.repeat(60));
+  console.log('✅ STRESS TEST COMPLETE');
+  console.log('═'.repeat(60));
+
+  testMemoryUsage('Final');
+}
+
+// Run the test
+runStressTest()
+  .then(() => {
+    process.exit(0);
+  })
+  .catch((error) => {
+    console.error('❌ Stress test failed:', error);
+    process.exit(1);
+  });
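One note on the global.gc() branch above: Node only defines global.gc when the process is started with the --expose-gc flag, so a plain npm run test:stress will skip that branch. Running node --expose-gc scripts/stress-test.js enables the explicit collection between test cases, which makes the reported memory numbers more comparable from one case to the next.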

src/App.jsx

Lines changed: 37 additions & 9 deletions
@@ -11,6 +11,9 @@ import { Header } from './components/Header';
 import { PanelLeftClose, PanelLeftOpen } from 'lucide-react';
 import { mergeFilesWithReplacement } from './utils/mergeFiles.js';

+// Threshold for "large file" - files above this won't have content persisted
+const LARGE_FILE_THRESHOLD = 5 * 1024 * 1024; // 5MB of content
+
 // Default global parsing configuration
 export const DEFAULT_GLOBAL_PARSING_CONFIG = {
   metrics: [
@@ -35,7 +38,22 @@ function App() {
   const { t } = useTranslation();
   const [uploadedFiles, setUploadedFiles] = useState(() => {
     const stored = localStorage.getItem('uploadedFiles');
-    return stored ? JSON.parse(stored) : [];
+    if (!stored) return [];
+    try {
+      const parsed = JSON.parse(stored);
+      // Restore files with proper defaults for large files that have metricsData
+      return parsed.map(file => ({
+        ...file,
+        enabled: file.enabled ?? true,
+        isParsing: false,
+        // For large files, metricsData is already stored; for small files it will be re-parsed
+        metricsData: file.metricsData || {},
+        // Mark large files that need re-upload for re-parsing
+        needsReupload: file.isLargeFile && !file.content
+      }));
+    } catch {
+      return [];
+    }
   });

   // Global parsing configuration state
@@ -118,16 +136,26 @@ function App() {
   useEffect(() => {
     if (savingDisabledRef.current) return;
     try {
-      const serialized = uploadedFiles.map(({ id, name, enabled, content, config }) => ({
-        id,
-        name,
-        enabled,
-        content,
-        config
-      }));
+      // Smart serialization: for large files, only store metricsData (not raw content)
+      // This allows the app to still display charts after refresh, but re-parsing will need re-upload
+      const serialized = uploadedFiles.map(({ id, name, enabled, content, config, metricsData }) => {
+        const isLargeFile = content && content.length > LARGE_FILE_THRESHOLD;
+        return {
+          id,
+          name,
+          enabled,
+          // For large files, don't store content to save memory/storage
+          content: isLargeFile ? null : content,
+          config,
+          // Store metricsData for large files so charts still work after refresh
+          metricsData: isLargeFile ? metricsData : undefined,
+          // Flag to indicate this file needs re-upload for re-parsing
+          isLargeFile
+        };
+      });
       if (serialized.length > 0) {
         const json = JSON.stringify(serialized);
-        // Avoid filling localStorage with very large files
+        // Avoid filling localStorage with very large data
         if (json.length > 5 * 1024 * 1024) {
           savingDisabledRef.current = true;
           console.warn('Uploaded files exceed storage limit; persistence disabled.');
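Worth noting about the persistence path above: the json.length > 5 * 1024 * 1024 guard still applies to the metricsData-only payload, so a log whose extracted points alone serialize to more than roughly 5 MB of JSON will still trip the guard and disable persistence entirely. Stripping raw content widens the range of files that fit under the LocalStorage quota rather than removing the cap.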
