Skip to content

Commit 5cd467b

Browse files
committed
fix: complete minor issue resolution for v1.6.5+ perfection
🎯 SECTION 2: Quality Scoring Calibration - Timeliness: 50→85 for datasets without dates (more realistic) - Precision: More lenient thresholds (6 vs 3 decimal variations) - Reduced penalty: 10→5 points per issue, max 30 points 🧠 SECTION 6: Business Logic Intelligence - Filter out ID columns from modeling targets (product_id, etc.) - Enhanced business appropriateness checks for all target selection - Applied to both main and fallback task identification - Now suggests only practical modeling tasks Results: Clean datasets now score appropriately, and Section 6 recommends sensible targets like 'price' instead of 'product_id' Addresses final v1.6.5 review feedback for 10/10 perfection
1 parent 27c0a24 commit 5cd467b

File tree

138 files changed

+1328
-30
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

138 files changed

+1328
-30
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"key":"section1_corrupted_add214f61f953bfb","data":{"overview":{"fileDetails":{"originalFilename":"corrupted.csv","fullResolvedPath":"/Users/[user]/plum/corrupted.csv","fileSizeBytes":50,"fileSizeMB":0.000048,"mimeType":"text/csv","lastModified":"2025-06-28T23:06:45.172Z","sha256Hash":"d1ececc260032d0f77f454d09dc8ad40ba53943b7eec69ec66fe77d1f3cc1a9a","compressionAnalysis":{"originalSizeBytes":50,"estimatedGzipSizeBytes":67,"estimatedGzipReduction":-34,"estimatedParquetSizeBytes":25,"estimatedParquetReduction":50,"columnEntropy":[{"columnName":"corrupted","entropy":1,"compressionPotential":"high"},{"columnName":"data","entropy":1,"compressionPotential":"high"}],"recommendedFormat":"none","analysisMethod":"Sample-based analysis (0KB sample)"},"healthCheck":{"bomDetected":false,"lineEndingConsistency":"consistent","nullBytesDetected":false,"validEncodingThroughout":true,"largeFileWarning":false,"recommendations":[],"healthScore":100}},"parsingMetadata":{"dataSourceType":"Local File System","parsingEngine":"DataPilot Advanced CSV Parser v1.0.0","parsingTimeSeconds":0.001,"encoding":{"encoding":"utf8","detectionMethod":"Statistical Character Pattern Analysis","confidence":90,"bomDetected":false},"delimiter":{"delimiter":",","detectionMethod":"Character Frequency Analysis with Field Consistency Scoring","confidence":90,"alternativesConsidered":[{"delimiter":"TAB","score":70},{"delimiter":";","score":70},{"delimiter":"|","score":70},{"delimiter":":","score":70},{"delimiter":",","score":50}]},"lineEnding":"LF","quotingCharacter":"\"","emptyLinesEncountered":1,"headerProcessing":{"headerPresence":"Detected","headerRowNumbers":[1],"columnNamesSource":"First row interpreted as column headers"},"initialScanLimit":{"method":"First 50 bytes or 1000 
lines","linesScanned":3,"bytesScanned":50}},"structuralDimensions":{"totalRowsRead":3,"totalDataRows":2,"totalColumns":3,"totalDataCells":6,"columnInventory":[{"index":1,"name":"corrupted","originalIndex":0},{"index":2,"name":"data","originalIndex":1},{"index":3,"name":"file","originalIndex":2}],"estimatedInMemorySizeMB":0,"averageRowLengthBytes":17,"sparsityAnalysis":{"sparsityPercentage":20,"method":"Full dataset analysis","sampleSize":2,"description":"Moderately sparse with significant missing values"},"quickStatistics":{"numericColumns":0,"textColumns":3,"dateColumns":0,"booleanColumns":0,"emptyColumns":1,"highCardinalityColumns":3,"lowCardinalityColumns":0,"potentialIdColumns":["corrupted","data"],"columnTypes":[{"columnName":"corrupted","detectedType":"text","uniqueValueCount":2,"cardinality":"high"},{"columnName":"data","detectedType":"text","uniqueValueCount":2,"cardinality":"high"},{"columnName":"file","detectedType":"empty","uniqueValueCount":1,"cardinality":"high"}],"analysisMethod":"Sample-based analysis (2 rows)","sampleSize":2}},"executionContext":{"fullCommandExecuted":"datapilot modeling corrupted.csv","analysisMode":"Comprehensive Deep Scan","analysisStartTimestamp":"2025-06-28T23:06:49.851Z","globalSamplingStrategy":"Full dataset analysis (No record sampling applied for initial overview)","activatedModules":["File I/O Manager","Advanced CSV Parser","Metadata Collector","Structural Analyzer","Report Generator"],"processingTimeSeconds":0.005,"hostEnvironment":{"operatingSystem":"macOS (Unknown Version)","systemArchitecture":"ARM64 (Apple Silicon/ARM 64-bit)","executionRuntime":"Node.js v23.6.1 (V8 12.9.202.28-node.12) on 
darwin","availableCpuCores":8,"availableMemoryGB":8,"nodeVersion":"v23.6.1"}},"dataPreview":{"headerRow":["corrupted","data","file"],"sampleRows":[["with","incomplete",""],["missing","data"]],"totalRowsShown":2,"totalRowsInFile":2,"truncated":false,"previewMethod":"head","generationTimeMs":1},"generatedAt":"2025-06-28T23:06:49.856Z","version":"1.0.0"},"warnings":[{"category":"structural","severity":"low","message":"Very small dataset (2 rows)","impact":"Limited statistical analysis capability","suggestion":"Statistical tests may have low power"}],"performanceMetrics":{"totalAnalysisTime":0.005,"peakMemoryUsage":62.89,"phases":{"file-analysis":0.002,"parsing":0.001,"structural-analysis":0.001,"data-preview":0.001}}},"size":4103,"timestamp":"2025-06-28T23:06:49.857Z","lastAccessed":"2025-06-28T23:06:49.857Z","accessCount":1,"checksum":"fae173fc14ed3fdf669520bae59b9542","dependencies":[],"options":{"cacheVersion":"1.0.0","enableHashing":true},"ttl":20000,"version":"1.0.0","filePath":"corrupted.csv","sectionName":"section1"}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"key":"section1_customers_f9528a995765ceea","data":{"overview":{"fileDetails":{"originalFilename":"customers.csv","fullResolvedPath":"/Users/[user]/plum/customers.csv","fileSizeBytes":239,"fileSizeMB":0.000228,"mimeType":"text/csv","lastModified":"2025-06-28T22:47:53.779Z","sha256Hash":"0a73273399af364b82fef159e0d305cf18d0d2fe23c3a4c7402e71c9a6c1f79b","compressionAnalysis":{"originalSizeBytes":239,"estimatedGzipSizeBytes":156,"estimatedGzipReduction":35,"estimatedParquetSizeBytes":120,"estimatedParquetReduction":50,"columnEntropy":[{"columnName":"customer_id","entropy":2.32,"compressionPotential":"medium"},{"columnName":"customer_name","entropy":2.32,"compressionPotential":"medium"},{"columnName":"email","entropy":2.32,"compressionPotential":"medium"},{"columnName":"phone","entropy":2.32,"compressionPotential":"medium"}],"recommendedFormat":"parquet","analysisMethod":"Sample-based analysis (0KB sample)"},"healthCheck":{"bomDetected":false,"lineEndingConsistency":"consistent","nullBytesDetected":false,"validEncodingThroughout":true,"largeFileWarning":false,"recommendations":[],"healthScore":100}},"parsingMetadata":{"dataSourceType":"Local File System","parsingEngine":"DataPilot Advanced CSV Parser v1.0.0","parsingTimeSeconds":0.007,"encoding":{"encoding":"utf8","detectionMethod":"Statistical Character Pattern Analysis","confidence":95,"bomDetected":false},"delimiter":{"delimiter":",","detectionMethod":"Character Frequency Analysis with Field Consistency Scoring","confidence":100,"alternativesConsidered":[{"delimiter":",","score":75},{"delimiter":"TAB","score":70},{"delimiter":";","score":70},{"delimiter":"|","score":70},{"delimiter":":","score":70}]},"lineEnding":"LF","quotingCharacter":"\"","emptyLinesEncountered":0,"headerProcessing":{"headerPresence":"Detected","headerRowNumbers":[1],"columnNamesSource":"First row interpreted as column headers"},"initialScanLimit":{"method":"First 239 bytes or 1000 
lines","linesScanned":5,"bytesScanned":239}},"structuralDimensions":{"totalRowsRead":6,"totalDataRows":5,"totalColumns":4,"totalDataCells":20,"columnInventory":[{"index":1,"name":"customer_id","originalIndex":0},{"index":2,"name":"customer_name","originalIndex":1},{"index":3,"name":"email","originalIndex":2},{"index":4,"name":"phone","originalIndex":3}],"estimatedInMemorySizeMB":0,"averageRowLengthBytes":40,"sparsityAnalysis":{"sparsityPercentage":0,"method":"Full dataset analysis","sampleSize":5,"description":"Dense dataset with minimal missing values"},"quickStatistics":{"numericColumns":1,"textColumns":3,"dateColumns":0,"booleanColumns":0,"emptyColumns":0,"highCardinalityColumns":4,"lowCardinalityColumns":0,"potentialIdColumns":["customer_id","phone"],"columnTypes":[{"columnName":"customer_id","detectedType":"numeric","uniqueValueCount":5,"cardinality":"high"},{"columnName":"customer_name","detectedType":"text","uniqueValueCount":5,"cardinality":"high"},{"columnName":"email","detectedType":"text","uniqueValueCount":5,"cardinality":"high"},{"columnName":"phone","detectedType":"text","uniqueValueCount":5,"cardinality":"high"}],"analysisMethod":"Sample-based analysis (5 rows)","sampleSize":5}},"executionContext":{"fullCommandExecuted":"datapilot engineering customers.csv","analysisMode":"Comprehensive Deep Scan","analysisStartTimestamp":"2025-06-28T22:48:16.748Z","globalSamplingStrategy":"Full dataset analysis (No record sampling applied for initial overview)","activatedModules":["File I/O Manager","Advanced CSV Parser","Metadata Collector","Structural Analyzer","Report Generator"],"processingTimeSeconds":0.014,"hostEnvironment":{"operatingSystem":"macOS (Unknown Version)","systemArchitecture":"ARM64 (Apple Silicon/ARM 64-bit)","executionRuntime":"Node.js v23.6.1 (V8 12.9.202.28-node.12) on 
darwin","availableCpuCores":8,"availableMemoryGB":8,"nodeVersion":"v23.6.1"}},"dataPreview":{"headerRow":["customer_id","customer_name","email","phone"],"sampleRows":[["1","Alice Johnson","alice@example.com","555-0101"],["2","Bob Smith","bob@example.com","555-0102"],["3","Carol Williams","carol@example.com","555-0103"],["4","David Brown","david@example.com","555-0104"],["5","Eva Davis","eva@example.com","555-0105"]],"totalRowsShown":5,"totalRowsInFile":5,"truncated":false,"previewMethod":"head","generationTimeMs":2},"generatedAt":"2025-06-28T22:48:16.763Z","version":"1.0.0"},"warnings":[{"category":"parsing","severity":"low","message":"Processing rate: 857 rows/second","impact":"Slower than optimal performance"},{"category":"structural","severity":"low","message":"Very small dataset (5 rows)","impact":"Limited statistical analysis capability","suggestion":"Statistical tests may have low power"}],"performanceMetrics":{"totalAnalysisTime":0.015,"phases":{"file-analysis":0.003,"parsing":0.007,"structural-analysis":0.001,"data-preview":0.002}}},"size":4761,"timestamp":"2025-06-28T22:48:16.768Z","lastAccessed":"2025-06-28T22:48:16.768Z","accessCount":1,"checksum":"c61b4f31bc4ad46c222596c342d16233","dependencies":[],"options":{"cacheVersion":"1.0.0","enableHashing":true},"ttl":20000,"version":"1.0.0","filePath":"customers.csv","sectionName":"section1"}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"key":"section1_demo_error_a1dc40dca35693a0","data":{"overview":{"fileDetails":{"originalFilename":"demo_error.csv","fullResolvedPath":"/Users/[user]/plum/demo_error.csv","fileSizeBytes":52,"fileSizeMB":0.00005,"mimeType":"text/csv","lastModified":"2025-06-28T22:58:39.140Z","sha256Hash":"3cad50cb29fbf1385e80da8ba49e7f320cf56e1d14d99d7178e47db5aff1470f","compressionAnalysis":{"originalSizeBytes":52,"estimatedGzipSizeBytes":68,"estimatedGzipReduction":-31,"estimatedParquetSizeBytes":26,"estimatedParquetReduction":50,"columnEntropy":[{"columnName":"name","entropy":1,"compressionPotential":"high"},{"columnName":"age","entropy":0,"compressionPotential":"high"},{"columnName":"invalid","entropy":0,"compressionPotential":"high"}],"recommendedFormat":"none","analysisMethod":"Sample-based analysis (0KB sample)"},"healthCheck":{"bomDetected":false,"lineEndingConsistency":"consistent","nullBytesDetected":false,"validEncodingThroughout":true,"largeFileWarning":false,"recommendations":[],"healthScore":100}},"parsingMetadata":{"dataSourceType":"Local File System","parsingEngine":"DataPilot Advanced CSV Parser v1.0.0","parsingTimeSeconds":0.001,"encoding":{"encoding":"utf8","detectionMethod":"Statistical Character Pattern Analysis","confidence":90,"bomDetected":false},"delimiter":{"delimiter":",","detectionMethod":"Character Frequency Analysis with Field Consistency Scoring","confidence":70,"alternativesConsidered":[{"delimiter":"TAB","score":70},{"delimiter":";","score":70},{"delimiter":"|","score":70},{"delimiter":":","score":70},{"delimiter":",","score":36}]},"lineEnding":"LF","quotingCharacter":"\"","emptyLinesEncountered":1,"headerProcessing":{"headerPresence":"Detected","headerRowNumbers":[1],"columnNamesSource":"First row interpreted as column headers"},"initialScanLimit":{"method":"First 52 bytes or 1000 
lines","linesScanned":3,"bytesScanned":52}},"structuralDimensions":{"totalRowsRead":3,"totalDataRows":2,"totalColumns":3,"totalDataCells":6,"columnInventory":[{"index":1,"name":"name","originalIndex":0},{"index":2,"name":"age","originalIndex":1},{"index":3,"name":"invalid","originalIndex":2}],"estimatedInMemorySizeMB":0,"averageRowLengthBytes":17,"sparsityAnalysis":{"sparsityPercentage":0,"method":"Full dataset analysis","sampleSize":2,"description":"Dense dataset with minimal missing values"},"quickStatistics":{"numericColumns":1,"textColumns":2,"dateColumns":0,"booleanColumns":0,"emptyColumns":0,"highCardinalityColumns":3,"lowCardinalityColumns":0,"potentialIdColumns":["invalid"],"columnTypes":[{"columnName":"name","detectedType":"text","uniqueValueCount":2,"cardinality":"high"},{"columnName":"age","detectedType":"numeric","uniqueValueCount":1,"cardinality":"high"},{"columnName":"invalid","detectedType":"text","uniqueValueCount":1,"cardinality":"high"}],"analysisMethod":"Sample-based analysis (2 rows)","sampleSize":2}},"executionContext":{"fullCommandExecuted":"datapilot all demo_error.csv","analysisMode":"Comprehensive Deep Scan","analysisStartTimestamp":"2025-06-28T22:58:39.390Z","globalSamplingStrategy":"Full dataset analysis (No record sampling applied for initial overview)","activatedModules":["File I/O Manager","Advanced CSV Parser","Metadata Collector","Structural Analyzer","Report Generator"],"processingTimeSeconds":0.004,"hostEnvironment":{"operatingSystem":"macOS (Unknown Version)","systemArchitecture":"ARM64 (Apple Silicon/ARM 64-bit)","executionRuntime":"Node.js v23.6.1 (V8 12.9.202.28-node.12) on darwin","availableCpuCores":8,"availableMemoryGB":8,"nodeVersion":"v23.6.1"}},"dataPreview":{"headerRow":["name","age","invalid"],"sampleRows":[["John","25","data"],["Invalid row 
structure"]],"totalRowsShown":2,"totalRowsInFile":2,"truncated":false,"previewMethod":"head","generationTimeMs":0},"generatedAt":"2025-06-28T22:58:39.394Z","version":"1.0.0"},"warnings":[{"category":"structural","severity":"low","message":"Very small dataset (2 rows)","impact":"Limited statistical analysis capability","suggestion":"Statistical tests may have low power"}],"performanceMetrics":{"totalAnalysisTime":0.005,"peakMemoryUsage":62.48,"phases":{"file-analysis":0.002,"parsing":0.001,"structural-analysis":0.001,"data-preview":0.001}}},"size":4139,"timestamp":"2025-06-28T22:58:39.398Z","lastAccessed":"2025-06-28T22:58:39.398Z","accessCount":1,"checksum":"f1aaee045829988ec7a0916fcf20a859","dependencies":[],"options":{"cacheVersion":"1.0.0","enableHashing":true,"privacyMode":"redacted","sampleMethod":"random"},"ttl":20000,"version":"1.0.0","filePath":"demo_error.csv","sectionName":"section1"}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"key":"section1_sales_data_0e3617c74cdf4f48","data":{"overview":{"fileDetails":{"originalFilename":"sales_data.csv","fullResolvedPath":"/Users/[user]/plum/examples/sales_data.csv","fileSizeBytes":355,"fileSizeMB":0.000339,"mimeType":"text/csv","lastModified":"2025-06-28T22:37:57.836Z","sha256Hash":"6a0196787fc4bddd3fc38d4843830f99117a0bfd4a94cd36fcf293e34194b62b","compressionAnalysis":{"originalSizeBytes":355,"estimatedGzipSizeBytes":210,"estimatedGzipReduction":41,"estimatedParquetSizeBytes":178,"estimatedParquetReduction":50,"columnEntropy":[{"columnName":"order_id","entropy":2.32,"compressionPotential":"medium"},{"columnName":"customer_id","entropy":1.52,"compressionPotential":"high"},{"columnName":"product_name","entropy":1.37,"compressionPotential":"high"},{"columnName":"quantity","entropy":2.32,"compressionPotential":"medium"},{"columnName":"unit_price","entropy":1.37,"compressionPotential":"high"},{"columnName":"total_amount","entropy":2.32,"compressionPotential":"medium"},{"columnName":"order_date","entropy":2.32,"compressionPotential":"medium"},{"columnName":"status","entropy":0.72,"compressionPotential":"high"}],"recommendedFormat":"parquet","analysisMethod":"Sample-based analysis (0KB sample)"},"healthCheck":{"bomDetected":false,"lineEndingConsistency":"consistent","nullBytesDetected":false,"validEncodingThroughout":true,"largeFileWarning":false,"recommendations":[],"healthScore":100}},"parsingMetadata":{"dataSourceType":"Local File System","parsingEngine":"DataPilot Advanced CSV Parser v1.0.0","parsingTimeSeconds":0.001,"encoding":{"encoding":"utf8","detectionMethod":"Statistical Character Pattern Analysis","confidence":95,"bomDetected":false},"delimiter":{"delimiter":",","detectionMethod":"Character Frequency Analysis with Field Consistency 
Scoring","confidence":100,"alternativesConsidered":[{"delimiter":"TAB","score":70},{"delimiter":";","score":70},{"delimiter":"|","score":70},{"delimiter":":","score":70},{"delimiter":",","score":23}]},"lineEnding":"LF","quotingCharacter":"\"","emptyLinesEncountered":1,"headerProcessing":{"headerPresence":"Detected","headerRowNumbers":[1],"columnNamesSource":"First row interpreted as column headers"},"initialScanLimit":{"method":"First 355 bytes or 1000 lines","linesScanned":6,"bytesScanned":355}},"structuralDimensions":{"totalRowsRead":6,"totalDataRows":5,"totalColumns":8,"totalDataCells":40,"columnInventory":[{"index":1,"name":"order_id","originalIndex":0},{"index":2,"name":"customer_id","originalIndex":1},{"index":3,"name":"product_name","originalIndex":2},{"index":4,"name":"quantity","originalIndex":3},{"index":5,"name":"unit_price","originalIndex":4},{"index":6,"name":"total_amount","originalIndex":5},{"index":7,"name":"order_date","originalIndex":6},{"index":8,"name":"status","originalIndex":7}],"estimatedInMemorySizeMB":0,"averageRowLengthBytes":59,"sparsityAnalysis":{"sparsityPercentage":0,"method":"Full dataset analysis","sampleSize":5,"description":"Dense dataset with minimal missing 
values"},"quickStatistics":{"numericColumns":4,"textColumns":3,"dateColumns":1,"booleanColumns":0,"emptyColumns":0,"highCardinalityColumns":7,"lowCardinalityColumns":0,"potentialIdColumns":["order_id","customer_id","order_date"],"columnTypes":[{"columnName":"order_id","detectedType":"numeric","uniqueValueCount":5,"cardinality":"high"},{"columnName":"customer_id","detectedType":"text","uniqueValueCount":3,"cardinality":"high"},{"columnName":"product_name","detectedType":"text","uniqueValueCount":3,"cardinality":"high"},{"columnName":"quantity","detectedType":"numeric","uniqueValueCount":5,"cardinality":"high"},{"columnName":"unit_price","detectedType":"numeric","uniqueValueCount":3,"cardinality":"high"},{"columnName":"total_amount","detectedType":"numeric","uniqueValueCount":5,"cardinality":"high"},{"columnName":"order_date","detectedType":"date","uniqueValueCount":5,"cardinality":"high"},{"columnName":"status","detectedType":"text","uniqueValueCount":2,"cardinality":"medium"}],"analysisMethod":"Sample-based analysis (5 rows)","sampleSize":5}},"executionContext":{"fullCommandExecuted":"datapilot modeling examples/sales_data.csv","analysisMode":"Comprehensive Deep Scan","analysisStartTimestamp":"2025-06-28T22:37:58.280Z","globalSamplingStrategy":"Full dataset analysis (No record sampling applied for initial overview)","activatedModules":["File I/O Manager","Advanced CSV Parser","Metadata Collector","Structural Analyzer","Report Generator"],"processingTimeSeconds":0.005,"hostEnvironment":{"operatingSystem":"macOS (Unknown Version)","systemArchitecture":"ARM64 (Apple Silicon/ARM 64-bit)","executionRuntime":"Node.js v23.6.1 (V8 12.9.202.28-node.12) on darwin","availableCpuCores":8,"availableMemoryGB":8,"nodeVersion":"v23.6.1"}},"dataPreview":{"headerRow":["order_id","customer_id","product_name","quantity","unit_price","total_amount","order_date","status"],"sampleRows":[["1001","C001","Widget A","5","19.99","99.95","2024-01-15","completed"],["1002","C002","Gadget 
B","2","49.99","99.98","2024-01-16","completed"],["1003","C001","Widget A","3","19.99","59.97","2024-01-17","pending"],["1004","C003","Tool C","1","149.99","149.99","2024-01-18","completed"],["1005","C002","Widget A","10","19.99","199.90","2024-01-19","completed"]],"totalRowsShown":5,"totalRowsInFile":5,"truncated":false,"previewMethod":"head","generationTimeMs":1},"generatedAt":"2025-06-28T22:37:58.285Z","version":"1.0.0"},"warnings":[{"category":"structural","severity":"low","message":"Very small dataset (5 rows)","impact":"Limited statistical analysis capability","suggestion":"Statistical tests may have low power"}],"performanceMetrics":{"totalAnalysisTime":0.005,"phases":{"file-analysis":0.001,"parsing":0.002,"structural-analysis":0.001,"data-preview":0.001}}},"size":5710,"timestamp":"2025-06-28T22:37:58.287Z","lastAccessed":"2025-06-28T22:37:58.287Z","accessCount":1,"checksum":"08aaae42649a1c7f40bbc71fbfb44b12","dependencies":[],"options":{"cacheVersion":"1.0.0","enableHashing":true},"ttl":20000,"version":"1.0.0","filePath":"examples/sales_data.csv","sectionName":"section1"}

0 commit comments

Comments
(0)