22using CsvHelper . Configuration ;
33using KeeperData . Core . Database ;
44using KeeperData . Core . ETL . Abstract ;
5+ using KeeperData . Core . ETL . Utils ;
56using KeeperData . Core . Reporting ;
67using KeeperData . Core . Reporting . Dtos ;
78using KeeperData . Core . Storage ;
@@ -25,12 +26,14 @@ public class IngestionPipeline(
2526 IMongoClient mongoClient ,
2627 IOptions < IDatabaseConfig > databaseConfig ,
2728 IImportReportingService reportingService ,
29+ CsvRowCounter csvRowCounter ,
2830 ILogger < IngestionPipeline > logger ) : IIngestionPipeline
2931{
3032 private const int BatchSize = 1000 ;
3133 private const int LogInterval = 100 ;
3234 private const int LineageEventBatchSize = 500 ;
3335 private readonly IDatabaseConfig _databaseConfig = databaseConfig . Value ;
36+ private readonly CsvRowCounter _rowCounter = csvRowCounter ;
3437
3538 // MongoDB field name constants
3639 private const string FieldId = "_id" ;
@@ -112,7 +115,8 @@ private async Task UpdateIngestionPhaseStartedAsync(Guid importId, CancellationT
112115 FilesProcessed = 0 ,
113116 RecordsCreated = 0 ,
114117 RecordsUpdated = 0 ,
115- RecordsDeleted = 0
118+ RecordsDeleted = 0 ,
119+ CurrentFileStatus = null
116120 } , ct ) ;
117121 }
118122
@@ -151,11 +155,12 @@ CancellationToken ct
151155
152156 totals = totals . Add ( fileResult ) ;
153157
154- // Update progress after each file
158+ // Clear current file status after completion and update overall progress
155159 await UpdateIngestionPhaseProgressAsync (
156160 importId ,
157161 processedFileCount ,
158162 totals ,
163+ null , // Clear current file status
159164 ct ) ;
160165 }
161166 }
@@ -236,6 +241,8 @@ private async Task<FileIngestionMetrics> IngestFileAsync(
236241 await EnsureWildcardIndexExistsAsync ( collection , ct ) ;
237242
238243 string ? tempFilePath = null ;
244+ IngestionProgressTracker ? progressTracker = null ;
245+
239246 try
240247 {
241248 // Track S3 download time
@@ -246,6 +253,13 @@ private async Task<FileIngestionMetrics> IngestFileAsync(
246253 logger . LogInformation ( "Downloaded file {FileKey} to temp storage: {TempPath} in {DownloadDuration}ms" ,
247254 file . Key , tempFilePath , downloadStopwatch . ElapsedMilliseconds ) ;
248255
256+ // Count rows for progress tracking
257+ var estimatedRowCount = await _rowCounter . CountRowsAsync ( tempFilePath , ct ) ;
258+ progressTracker = new IngestionProgressTracker ( file . Key , estimatedRowCount ) ;
259+
260+ logger . LogInformation ( "File {FileKey} has approximately {RowCount} data rows to process" ,
261+ file . Key , estimatedRowCount ) ;
262+
249263 // Track MongoDB ingestion time
250264 var mongoIngestionStopwatch = Stopwatch . StartNew ( ) ;
251265
@@ -265,6 +279,7 @@ private async Task<FileIngestionMetrics> IngestFileAsync(
265279 file . Key ,
266280 collectionName ,
267281 fileSet . Definition ,
282+ progressTracker ,
268283 ct ) ;
269284
270285 await csvContext . DisposeAsync ( ) ;
@@ -418,12 +433,14 @@ private async Task<FileIngestionMetrics> ProcessCsvRecordsAsync(
418433 string fileKey ,
419434 string collectionName ,
420435 DataSetDefinition definition ,
436+ IngestionProgressTracker progressTracker ,
421437 CancellationToken ct )
422438 {
423439 var metrics = new RecordMetricsAccumulator ( ) ;
424440 var batch = new List < ( BsonDocument Document , string ChangeType ) > ( ) ;
425441 var lineageEvents = new List < RecordLineageEvent > ( ) ;
426442 var totalMongoProcessingMs = 0L ;
443+ var totals = new IngestionTotals ( ) ;
427444
428445 while ( await csv . ReadAsync ( ) )
429446 {
@@ -459,8 +476,27 @@ private async Task<FileIngestionMetrics> ProcessCsvRecordsAsync(
459476 totalMongoProcessingMs += batchStopwatch . ElapsedMilliseconds ;
460477
461478 metrics . AddBatch ( batchMetrics ) ;
479+ totals = totals . Add ( new IngestionTotals
480+ {
481+ RecordsCreated = batchMetrics . RecordsCreated ,
482+ RecordsUpdated = batchMetrics . RecordsUpdated ,
483+ RecordsDeleted = batchMetrics . RecordsDeleted
484+ } ) ;
485+
486+ // Update progress tracking and report every 100 records
487+ progressTracker . UpdateProgress ( metrics . RecordsProcessed ) ;
462488
463- LogProgressIfNeeded ( metrics . RecordsProcessed , fileKey ) ;
489+ if ( metrics . RecordsProcessed % LogInterval == 0 )
490+ {
491+ LogProgressIfNeeded ( metrics . RecordsProcessed , fileKey ) ;
492+
493+ var currentStatus = progressTracker . GetCurrentStatus ( ) ;
494+ await UpdateIngestionPhaseProgressWithFileStatusAsync (
495+ importId ,
496+ totals ,
497+ currentStatus ,
498+ ct ) ;
499+ }
464500
465501 batch . Clear ( ) ;
466502
@@ -482,6 +518,16 @@ private async Task<FileIngestionMetrics> ProcessCsvRecordsAsync(
482518 ct ) ;
483519
484520 metrics . AddBatch ( batchMetrics ) ;
521+
522+ // Final progress update
523+ progressTracker . UpdateProgress ( metrics . RecordsProcessed ) ;
524+ var finalStatus = progressTracker . Complete ( ) ;
525+
526+ await UpdateIngestionPhaseProgressWithFileStatusAsync (
527+ importId ,
528+ totals ,
529+ finalStatus ,
530+ ct ) ;
485531 }
486532
487533 // Flush remaining lineage events
@@ -597,7 +643,7 @@ private async Task CreateWildcardIndexAsync(
597643 logger . LogInformation ( "Creating wildcard index on collection {CollectionName}" ,
598644 collection . CollectionNamespace . CollectionName ) ;
599645
600- var wildcardIndexKeys = Builders < BsonDocument > . IndexKeys . Wildcard ( "$**" ) ;
646+ var wildcardIndexKeys = Builders < BsonDocument > . IndexKeys . Wildcard ( ) ;
601647 var indexModel = new CreateIndexModel < BsonDocument > ( wildcardIndexKeys ) ;
602648
603649 await collection . Indexes . CreateOneAsync ( indexModel , cancellationToken : ct ) ;
@@ -944,6 +990,7 @@ private async Task UpdateIngestionPhaseProgressAsync(
944990 Guid importId ,
945991 int filesProcessed ,
946992 IngestionTotals totals ,
993+ IngestionCurrentFileStatus ? currentFileStatus ,
947994 CancellationToken ct )
948995 {
949996 await reportingService . UpdateIngestionPhaseAsync ( importId , new IngestionPhaseUpdate
@@ -952,7 +999,25 @@ private async Task UpdateIngestionPhaseProgressAsync(
952999 FilesProcessed = filesProcessed ,
9531000 RecordsCreated = totals . RecordsCreated ,
9541001 RecordsUpdated = totals . RecordsUpdated ,
955- RecordsDeleted = totals . RecordsDeleted
1002+ RecordsDeleted = totals . RecordsDeleted ,
1003+ CurrentFileStatus = currentFileStatus
1004+ } , ct ) ;
1005+ }
1006+
1007+ private async Task UpdateIngestionPhaseProgressWithFileStatusAsync (
1008+ Guid importId ,
1009+ IngestionTotals totals ,
1010+ IngestionCurrentFileStatus currentFileStatus ,
1011+ CancellationToken ct )
1012+ {
1013+ await reportingService . UpdateIngestionPhaseAsync ( importId , new IngestionPhaseUpdate
1014+ {
1015+ Status = PhaseStatus . Started ,
1016+ FilesProcessed = 0 , // Not updated during file processing
1017+ RecordsCreated = totals . RecordsCreated ,
1018+ RecordsUpdated = totals . RecordsUpdated ,
1019+ RecordsDeleted = totals . RecordsDeleted ,
1020+ CurrentFileStatus = currentFileStatus
9561021 } , ct ) ;
9571022 }
9581023
0 commit comments