@@ -23,51 +23,63 @@ def _validate_work_dir(self, work_dir: Path | str) -> Path:
         return work_dir_path

     def build(self) -> Path:
-        event_logs = self.extractor.extract()
+        paths = self.extractor.extract()

-        self.event_log = self._concat(event_logs)
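+        # Fail fast when extraction yields no files at all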
+        if not paths:
+            raise ValueError("No files found")

-        return self.event_log
+        self.event_log = self._get_event_log(paths)

-    def _concat(self, event_logs: list[Path]) -> Path:
-        if len(event_logs) == 1:
-            return event_logs[0]
+        return self.event_log

-        dat = []
-        for log in event_logs:
-            with open(log) as log_file:
+    def _get_event_log(self, paths: list[Path]) -> Path:
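+        # Identify event logs among the extracted files and pick (or build) the one to use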
+        log_files = []
+        rollover_dat = []
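+        # Peek at the first JSON line of each file: Spark event logs carry an
+        # "Event" key; Databricks rollover logs open with a DBC metadata event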
+        for path in paths:
+            with open(path) as fobj:
                 try:
-                    line = json.loads(log_file.readline())
+                    line = json.loads(fobj.readline())
                 except ValueError:
-                    continue  # Maybe a Databricks pricing file
-                if line["Event"] == "DBCEventLoggingListenerMetadata":
-                    dat.append((line["Rollover Number"], line["SparkContext Id"], log))
-                else:
-                    raise ValueError("Expected DBC event not found")
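+                    # First line is not JSON (e.g. a Databricks pricing file); skip it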
+                    continue
+                if "Event" in line:
+                    log_files.append(path)
+                    if line["Event"] == "DBCEventLoggingListenerMetadata":
+                        rollover_dat.append(
+                            (line["Rollover Number"], line["SparkContext Id"], path)
+                        )
+
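+        # Rollover and non-rollover logs must not be mixed; without rollover
+        # metadata, only a single event log is acceptable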
+        if rollover_dat:
+            if len(log_files) > len(rollover_dat):
+                raise ValueError("No rollover properties found in log file")
+
+            return self._concat(rollover_dat)
+
+        if len(log_files) > 1:
+            raise ValueError("No rollover properties found in log file")
+
+        if not log_files:
+            raise ValueError("No event log found")
+
+        return log_files[0]
+
+    def _concat(self, rollover_dat: list[tuple[int, str, Path]]) -> Path:
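+        # Validate the rollover sequence, then stitch the parts into one file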
+        rollover_df = pd.DataFrame(
+            rollover_dat, columns=["rollover_index", "context_id", "path"]
+        ).sort_values("rollover_index")
+
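+        # Every part must come from the same SparkContext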
+        if len(rollover_df.context_id.unique()) != 1:
+            raise ValueError("Not all rollover log files have the same Spark context ID")

-        df = pd.DataFrame(dat, columns=["rollover_index", "context_id", "path"]).sort_values(
-            "rollover_index"
-        )
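+        # Once sorted, consecutive rollover indices should differ by exactly 1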
+        diffs = rollover_df.rollover_index.diff()

-        self._validate_rollover_logs(df)
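+        # The first diff is NaN, which compares False; .iloc[0] is the smallest index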
+        if any(diffs > 1) or rollover_df.rollover_index.iloc[0] > 0:
+            raise ValueError("Rollover log file appears to be missing")
+
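+        # A difference of zero means two parts share the same rollover index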
+        if any(diffs < 1):
+            raise ValueError("Duplicate rollover log file detected")

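+        # Concatenate the parts in rollover order into a temp file under work_dir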
         event_log = Path(tempfile.mkstemp(suffix="-concatenated.json", dir=str(self.work_dir))[1])
         with open(event_log, "w") as fobj:
-            for path in df.path:
+            for path in rollover_df.path:
                 with open(path) as part_fobj:
                     for line in part_fobj:
                         fobj.write(line)

         return event_log
-
-    def _validate_rollover_logs(self, df: pd.DataFrame):
-        if not len(df.context_id.unique()) == 1:
-            raise ValueError("Not all rollover files have the same Spark context ID")
-
-        diffs = df.rollover_index.diff()[1:]
-
-        if any(diffs > 1) or df.rollover_index[0] > 0:
-            raise ValueError("Rollover file appears to be missing")
-
-        if any(diffs < 1):
-            raise ValueError("Duplicate rollover file detected")