|
11 | 11 | import boto3 |
12 | 12 | from collections import defaultdict |
13 | 13 |
|
14 | | -logging.basicConfig(format='%(levelname)s:%(message)s') |
15 | | - |
16 | 14 | class sparkApplication(): |
17 | 15 |
|
18 | 16 | def __init__(self, |
19 | 17 | objfile = None, # Previously saved object. This is the fastest and best option |
20 | 18 | appobj = None, # application_model object |
21 | 19 | eventlog = None, # spark eventlog path, |
22 | 20 | stdout = None, |
23 | | - debug = False |
24 | 21 | ): |
25 | 22 |
|
26 | 23 | self.eventlog = eventlog |
27 | 24 | self.existsSQL = False |
28 | 25 | self.existsExecutors = False |
29 | | - self.sparkMetadata = {} |
| 26 | + #self.sparkMetadata = {} |
30 | 27 | self.metadata = {} |
31 | 28 | self.stdout = stdout |
32 | 29 |
|
@@ -157,13 +154,13 @@ def getExecutorInfo(self, appobj): |
157 | 154 | df = defaultdict(lambda: []) |
158 | 155 | for xid, executor in appobj.executors.items(): |
159 | 156 |
|
160 | | - #print(executor.end_time) |
| 157 | + # print(executor.end_time) |
161 | 158 | # Special case for handling end_time |
162 | 159 | if executor.end_time is not None: |
163 | 160 | end_time = executor.end_time/1000 - appobj.start_time |
164 | 161 | else: |
165 | | - #print('None detected') |
166 | | - end_time = appobj.finish_time - appobj.start_time |
| 162 | + # print('None detected') |
| 163 | + end_time = executor.end_time |
167 | 164 |
|
168 | 165 | df['executor_id'].append(xid) |
169 | 166 | df['cores'] .append(executor.cores) |
@@ -336,7 +333,9 @@ def getAllTaskData(self, appobj): |
336 | 333 | 'end_time' : end_time, |
337 | 334 | 'duration' : duration, |
338 | 335 | #'input_mb' : input_mb, |
339 | | - 'remote_mb_read': remote_mb_read, |
| 336 | + |
| 337 | + # Duplicate entry: |
| 338 | + # 'remote_mb_read': remote_mb_read, |
340 | 339 | 'locality' : locality, |
341 | 340 |
|
342 | 341 | # Disk-based performance metrics |
@@ -560,11 +559,24 @@ def getAllDriverAccumData(self, appobj): |
560 | 559 | self.accumData = df |
561 | 560 |
|
562 | 561 | def getAllMetaData(self, appobj): |
563 | | - self.sparkMetadata = (appobj.spark_metadata) |
564 | | - self.metadata = {"app_name": appobj.app_name, |
565 | | - "start_time": appobj.start_time} |
566 | | - |
567 | 562 |
|
| 563 | + #self.sparkMetadata = (appobj.spark_metadata) |
| 564 | + |
| 565 | + self.metadata = { |
| 566 | + 'application_info' : { |
| 567 | + 'timestamp_start_ms' : int(appobj.start_time*1000), |
| 568 | + 'timestamp_end_ms' : int(appobj.finish_time*1000), |
| 569 | + 'runtime_sec' : appobj.finish_time - appobj.start_time, |
| 570 | + 'name' : appobj.app_name, |
| 571 | + 'id' : appobj.spark_metadata['spark.app.id'], |
| 572 | + 'spark_version' : appobj.spark_version, |
| 573 | + 'cloud_platform' : appobj.cloud_platform, |
| 574 | + 'cloud_provider' : appobj.cloud_provider |
| 575 | + |
| 576 | + }, |
| 577 | + 'spark_params' : appobj.spark_metadata |
| 578 | + } |
| 579 | + |
568 | 580 | def addMetadata(self, key=None, value=None): |
569 | 581 |
|
570 | 582 | if (key is None) or (value is None): |
@@ -640,7 +652,7 @@ def save(self, filepath=None, compress=False): |
640 | 652 | saveDat['executors'] = self.executorData.reset_index().to_dict('list') |
641 | 653 |
|
642 | 654 | saveDat['metadata'] = self.metadata |
643 | | - saveDat['sparkMetadata'] = self.sparkMetadata |
| 655 | + #saveDat['sparkMetadata'] = self.sparkMetadata |
644 | 656 | saveDat['metadata']['existsSQL'] = self.existsSQL |
645 | 657 | saveDat['metadata']['existsExecutors'] = self.existsExecutors |
646 | 658 |
|
@@ -703,21 +715,36 @@ def load(self, filepath=None): |
703 | 715 | self.metadata = saveDat['metadata'] |
704 | 716 | self.existsSQL = self.metadata.pop('existsSQL') |
705 | 717 | self.existsExecutors = self.metadata.pop('existsExecutors') |
706 | | - self.sparkMetadata = saveDat.pop('sparkMetadata') |
| 718 | + |
| 719 | + # This is for legacy support and should be removed after it is in production for a few |
| 720 | + # weeks. Introduced 3/9/2022 by SDG. |
| 721 | + if 'sparkMetadata' in saveDat: |
| 722 | + self.sparkMetadata = saveDat.pop('sparkMetadata') |
707 | 723 |
|
| 724 | + # SPC113 - SDG |
| 725 | + # Because of the way jobData is created, if there are no job Events in the eventlog then the |
| 726 | + # correct fields will not exist. A second condition checking for the 'job_id' field is |
| 727 | + # necessary here to ensure this method will run if this is the case. |
| 728 | + # |
| 729 | + # Note: stageData is initialized differently so this same issue does not exist for that |
| 730 | + # structure. Furthermore, in the event that 'jobData' has no values within, 'stageData' will |
| 731 | + # also have no values and an invalidLog exception will be thrown during log validation |
| 732 | + # in SparkApplicationAdvanced. |
| 733 | + if ('jobData' in saveDat) and ('job_id' in saveDat['jobData']): |
| 734 | + self.jobData = pd.DataFrame.from_dict(saveDat['jobData']) |
| 735 | + self.jobData = self.jobData.set_index('job_id') |
708 | 736 |
|
709 | | - if 'jobData' in saveDat: self.jobData = pd.DataFrame.from_dict(saveDat['jobData'] ).set_index('job_id') |
710 | 737 | if 'stageData' in saveDat: self.stageData = pd.DataFrame.from_dict(saveDat['stageData']).set_index('stage_id') |
711 | 738 | if 'taskData' in saveDat: self.taskData = pd.DataFrame.from_dict(saveDat['taskData'] ).set_index('task_id') |
712 | | - if 'accumData' in saveDat: |
| 739 | + if 'accumData' in saveDat: |
713 | 740 | self.accumData = pd.DataFrame.from_dict(saveDat['accumData'] ) |
714 | 741 | if 'sql_id' in self.accumData.columns: |
715 | 742 | self.accumData = self.accumData.set_index('sql_id') |
716 | 743 |
|
717 | 744 | if self.existsSQL: |
718 | 745 | self.sqlData = pd.DataFrame.from_dict(saveDat['sqlData']).set_index('sql_id') |
719 | 746 | if self.existsExecutors: |
720 | | - self.executorData = pd.DataFrame.from_dict(saveDat['executors']).set_index('executor_id') |
| 747 | + self.executorData = pd.DataFrame.from_dict(saveDat['executors']).set_index('executor_id') |
721 | 748 |
|
722 | 749 | logging.info('Loaded object from: %s [%.2f]' % (filepath, (time.time()-t1))) |
723 | 750 |
|
|
0 commit comments