@@ -34,8 +34,8 @@ export async function edaComprehensive(filePath, options = {}) {
3434 // Structured data mode for LLM consumption
3535 const structuredMode = options . structuredOutput || options . llmMode ;
3636
37- // Set timeout for analysis (default 30 seconds)
38- const timeoutMs = options . timeout || 30000 ;
37+ // Set timeout for analysis (default 60 seconds for large datasets )
38+ const timeoutMs = options . timeout || 60000 ;
3939
4040 const analysisPromise = performAnalysis ( ) ;
4141 const timeoutPromise = new Promise ( ( _ , reject ) => {
@@ -48,7 +48,7 @@ export async function edaComprehensive(filePath, options = {}) {
4848 return await Promise . race ( [ analysisPromise , timeoutPromise ] ) ;
4949 } catch ( error ) {
5050 outputHandler . restore ( ) ;
51- if ( spinner ) spinner . error ( { text : 'Analysis failed or timed out' } ) ;
51+ if ( spinner ) spinner . fail ( 'Analysis failed or timed out' ) ;
5252
5353 if ( error . message . includes ( 'timed out' ) ) {
5454 console . error ( chalk . red ( '🚨 EDA Analysis Timeout' ) ) ;
@@ -87,8 +87,10 @@ export async function edaComprehensive(filePath, options = {}) {
8787 }
8888
8989 if ( spinner ) spinner . text = 'Detecting column types...' ;
90+ const typeStart = Date . now ( ) ;
9091 try {
9192 columnTypes = detectColumnTypes ( records ) ;
93+ console . log ( `Column type detection took ${ Date . now ( ) - typeStart } ms` ) ;
9294 } catch ( typeError ) {
9395 throw new Error ( `Column type detection failed: ${ typeError . message } ` ) ;
9496 }
@@ -99,6 +101,17 @@ export async function edaComprehensive(filePath, options = {}) {
99101 const fileName = basename ( filePath ) ;
100102 const columns = Object . keys ( columnTypes ) ;
101103
104+ // Apply sampling for large datasets
105+ const originalRecordCount = records . length ;
106+ if ( records . length > 10000 ) {
107+ if ( spinner ) spinner . text = `Sampling large dataset (${ records . length } rows)...` ;
108+ const samplingStrategy = createSamplingStrategy ( records , 'basic' ) ;
109+ // For EDA, use max 5000 rows for analysis
110+ samplingStrategy . sampleSize = Math . min ( 5000 , samplingStrategy . sampleSize ) ;
111+ records = performSampling ( records , samplingStrategy ) ;
112+ if ( spinner ) spinner . text = `Analyzing sample of ${ records . length } rows from ${ originalRecordCount } total rows...` ;
113+ }
114+
102115 // Handle empty dataset
103116 if ( records . length === 0 ) {
104117 const report = formatComprehensiveEDAReport ( {
@@ -124,12 +137,22 @@ export async function edaComprehensive(filePath, options = {}) {
124137 if ( spinner ) spinner . text = 'Detecting analysis requirements...' ;
125138 const analysisNeeds = detectAnalysisNeeds ( records , columnTypes ) ;
126139
140+ // For very large datasets, disable expensive analyses
141+ if ( records . length > 10000 ) {
142+ analysisNeeds . regression = false ;
143+ analysisNeeds . cart = false ;
144+ analysisNeeds . correlationAnalysis = false ;
145+ analysisNeeds . timeSeries = false ;
146+ analysisNeeds . mlReadiness = false ;
147+ }
148+
127149 // Initialize analysis object
128150 const analysis = {
129151 fileName,
130152 fileSize : formatFileSize ( fileStats . size ) ,
131- rowCount : records . length ,
153+ rowCount : originalRecordCount ,
132154 columnCount : columns . length ,
155+ sampledRows : records . length < originalRecordCount ? records . length : undefined ,
133156 columns : [ ] ,
134157 numericColumnCount : 0 ,
135158 categoricalColumnCount : 0 ,
@@ -147,19 +170,28 @@ export async function edaComprehensive(filePath, options = {}) {
147170 const columnAnalyses = { } ;
148171 let totalNonNull = 0 ;
149172
150- // Process columns with timeout protection
173+ // Process columns with timeout protection
174+ const sampleForStats = records . slice ( 0 , Math . min ( 5000 , records . length ) ) ;
175+
151176 for ( const column of columns ) {
152177 try {
153178 const type = columnTypes [ column ] ;
154- const values = records . map ( r => r [ column ] ) . filter ( v => v !== null && v !== undefined ) ;
179+ // For large datasets, estimate non-null ratio from sample
180+ const sampleValues = sampleForStats . map ( r => r [ column ] ) ;
181+ const nonNullInSample = sampleValues . filter ( v => v !== null && v !== undefined ) . length ;
182+ const nonNullRatio = nonNullInSample / sampleForStats . length ;
183+ const estimatedNonNullCount = Math . round ( nonNullRatio * records . length ) ;
184+
185+ // Use sampled values for stats
186+ const values = sampleValues . filter ( v => v !== null && v !== undefined ) ;
155187
156188 const columnAnalysis = {
157189 name : column ,
158190 type : type . type ,
159- nonNullRatio : values . length / records . length
191+ nonNullRatio : nonNullRatio
160192 } ;
161193
162- totalNonNull += values . length ;
194+ totalNonNull += estimatedNonNullCount ;
163195
164196 // Add timeout protection for expensive calculations
165197 if ( [ 'integer' , 'float' ] . includes ( type . type ) && values . length > 0 ) {
@@ -239,7 +271,7 @@ export async function edaComprehensive(filePath, options = {}) {
239271
240272 for ( const col of numericColumns ) {
241273 const values = records . map ( r => r [ col ] ) ;
242- analysis . distributionAnalysis [ col ] = analyzeDistribution ( values ) ;
274+ analysis . distributionAnalysis [ col ] = await analyzeDistribution ( values ) ;
243275 }
244276 }
245277
@@ -265,8 +297,8 @@ export async function edaComprehensive(filePath, options = {}) {
265297 analysis . outlierRate = totalOutliers / ( records . length * numericColumns . length ) ;
266298 }
267299
268- // CART analysis
269- if ( analysisNeeds . cart ) {
300+ // CART analysis (skip for large datasets)
301+ if ( analysisNeeds . cart && records . length < 5000 ) {
270302 if ( spinner ) spinner . text = 'Performing CART analysis...' ;
271303 const targets = findPotentialTargets ( records , columnTypes ) ;
272304 if ( targets . length > 0 ) {
@@ -277,32 +309,43 @@ export async function edaComprehensive(filePath, options = {}) {
277309 targets [ 0 ] . column
278310 ) ;
279311 }
312+ } else if ( analysisNeeds . cart ) {
313+ analysis . cartAnalysis = { skipped : true , reason : 'Dataset too large' } ;
280314 }
281315
282- // Regression analysis
283- if ( analysisNeeds . regression ) {
316+ // Regression analysis (skip for large datasets)
317+ if ( analysisNeeds . regression && records . length < 5000 ) {
284318 if ( spinner ) spinner . text = 'Performing regression analysis...' ;
285319 analysis . regressionAnalysis = performRegressionAnalysis (
286320 records ,
287321 columns ,
288322 columnTypes
289323 ) ;
324+ } else if ( analysisNeeds . regression ) {
325+ analysis . regressionAnalysis = { skipped : true , reason : 'Dataset too large' } ;
290326 }
291327
292- // Correlation analysis
293- if ( analysisNeeds . correlationAnalysis ) {
328+ // Correlation analysis (skip for large datasets)
329+ if ( analysisNeeds . correlationAnalysis && records . length < 5000 ) {
294330 if ( spinner ) spinner . text = 'Analyzing correlations...' ;
295331 analysis . correlationAnalysis = performCorrelationAnalysis ( records , columns , columnTypes ) ;
332+ } else if ( analysisNeeds . correlationAnalysis ) {
333+ if ( spinner ) spinner . text = 'Skipping correlation analysis for large dataset...' ;
334+ analysis . correlationAnalysis = { skipped : true , reason : 'Dataset too large' } ;
296335 }
297336
298- // Pattern detection
337+ // Pattern detection (limit for large datasets)
299338 if ( analysisNeeds . patternDetection ) {
300339 if ( spinner ) spinner . text = 'Detecting patterns...' ;
301- analysis . patterns = detectPatterns ( records , columns , columnTypes ) ;
340+ const patternRecords = records . length > 5000 ? records . slice ( 0 , 5000 ) : records ;
341+ analysis . patterns = detectPatterns ( patternRecords , columns , columnTypes ) ;
342+ if ( records . length > 5000 ) {
343+ analysis . patterns . note = 'Analyzed first 5000 rows for patterns' ;
344+ }
302345 }
303346
304- // Time series analysis
305- if ( analysisNeeds . timeSeries ) {
347+ // Time series analysis (limit for large datasets)
348+ if ( analysisNeeds . timeSeries && records . length < 10000 ) {
306349 if ( spinner ) spinner . text = 'Analyzing time series...' ;
307350 const dateColumn = analysis . dateColumns [ 0 ] ; // Use first date column
308351 const numericColumns = columns . filter ( col =>
@@ -316,6 +359,8 @@ export async function edaComprehensive(filePath, options = {}) {
316359 numericColumns
317360 ) ;
318361 }
362+ } else if ( analysisNeeds . timeSeries ) {
363+ analysis . timeSeriesAnalysis = { skipped : true , reason : 'Dataset too large for time series analysis' } ;
319364 }
320365
321366 // Australian data validation
@@ -385,7 +430,7 @@ export async function edaComprehensive(filePath, options = {}) {
385430
386431 } catch ( error ) {
387432 outputHandler . restore ( ) ;
388- if ( spinner ) spinner . error ( { text : 'Error during analysis' } ) ;
433+ if ( spinner ) spinner . fail ( 'Error during analysis' ) ;
389434 console . error ( error . message ) ;
390435 if ( ! options . quiet ) process . exit ( 1 ) ;
391436 throw error ;
0 commit comments