import common


# Simple median calculation
class SimpleMedian:

    def __init__(self):
        self.elements = []

    def add(self, n: float):
        self.elements.append(n)

    def get_median(self) -> float:
        return statistics.median(self.elements)

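
# Example usage (sketch): SimpleMedian stores every sample and recomputes the
# median from scratch on each query, so get_median() is O(n log n) per call
# but trivially correct:
#
#     m = SimpleMedian()
#     for x in (3.0, 1.0, 2.0):
#         m.add(x)
#     m.get_median()  # -> 2.0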


# Calculate medians incrementally using a heap: useful when dealing with a
# large number of samples arriving one at a time. This was written
# with precommit in mind, but if this only runs nightly, it would actually be
# faster to do a normal median calculation.
class StreamingMedian:

    def __init__(self):
        # Gist: we keep a max-heap of the smaller half of the elements and a
        # min-heap of the larger half. A new element goes onto one heap or the
        # other depending on whether it is bigger than the current median, and
        # the heaps are then rebalanced so that their sizes never differ by
        # more than one. The median is read from the heap tops.
        self.minheap_larger = []
        self.maxheap_smaller = []
        # Note: numbers on the maxheap are stored negated, as heapq
        # is a min-heap by default
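        # For example (sketch): after adding 5, 1 and 3, maxheap_smaller holds
        # [-3, -1] and minheap_larger holds [5]; -maxheap_smaller[0] == 3 is
        # the largest element of the smaller half and the median so far.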

    def add(self, n: float):
        if len(self.maxheap_smaller) == 0 or -self.maxheap_smaller[0] >= n:
            heapq.heappush(self.maxheap_smaller, -n)
        else:
            heapq.heappush(self.minheap_larger, n)

        # Rebalance: maxheap_smaller may hold at most one element more than
        # minheap_larger, and never fewer
        if len(self.maxheap_smaller) > len(self.minheap_larger) + 1:
            heapq.heappush(self.minheap_larger, -heapq.heappop(self.maxheap_smaller))
        elif len(self.maxheap_smaller) < len(self.minheap_larger):
            heapq.heappush(self.maxheap_smaller, -heapq.heappop(self.minheap_larger))

    def get_median(self) -> float:
        if len(self.maxheap_smaller) == len(self.minheap_larger):
            # Equal number of elements smaller and larger than the "median":
            # there are two middle values, so the median is their average.
            return (-self.maxheap_smaller[0] + self.minheap_larger[0]) / 2.0
        else:
            # Otherwise maxheap_smaller holds one extra element, so the median
            # is its top.
            return -self.maxheap_smaller[0]

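
# Example usage (sketch): the same interface as SimpleMedian, but each add()
# is O(log n) and get_median() is O(1). With an even number of elements the
# two heap tops are averaged:
#
#     m = StreamingMedian()
#     for x in (5.0, 1.0, 3.0, 4.0):
#         m.add(x)
#     m.get_median()  # -> 3.5
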
def aggregate_median(runner: str, benchmark: str, cutoff: str):

    # Get all .csv benchmark samples for the requested runner + benchmark
    def csv_samples() -> list[Path]:
        # TODO check that the path below is a valid directory
        cache_dir = Path(f"{common.PERF_RES_PATH}/{runner}/{benchmark}")
        # TODO check for time range; what time range do I want?
        return list(
            filter(
                lambda f: f.is_file()
                and common.valid_timestamp(str(f)[-19:-4])
                and str(f)[-19:-4] > cutoff,
                cache_dir.glob(f"{benchmark}-*_*.csv"),
            )
        )
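
    # Sample filenames are assumed to look like
    # "<benchmark>-<YYYYMMDD_HHMMSS>.csv", so str(f)[-19:-4] slices out the
    # 15-character timestamp just before the ".csv" suffix.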

    # Calculate median of every desired metric:
    aggregate_s = dict()
    for sample_path in csv_samples():
        with open(sample_path, "r") as sample_file:
            for s in csv.DictReader(sample_file):
                test_case = s["TestCase"]
                # Construct entry in aggregate_s for the test case if it does
                # not exist already:
                if test_case not in aggregate_s:
                    aggregate_s[test_case] = {
                        metric: SimpleMedian() for metric in common.metrics_variance
                    }

                for metric in common.metrics_variance:
                    aggregate_s[test_case][metric].add(common.sanitize(s[metric]))
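
    # At this point aggregate_s maps each test case to one SimpleMedian per
    # metric in common.metrics_variance, roughly (metric names illustrative):
    #     {"<TestCase>": {"<metric>": SimpleMedian, ...}, ...}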

    # Write calculated median (aggregate_s) as a new .csv file:
    with open(
        f"{common.PERF_RES_PATH}/{runner}/{benchmark}/{benchmark}-median.csv", "w"
    ) as output_csv:
        writer = csv.DictWriter(
            output_csv, fieldnames=["TestCase", *common.metrics_variance.keys()]
        )
        writer.writeheader()
        for test_case in aggregate_s:
            writer.writerow(
                {"TestCase": test_case}
                | {
                    metric: aggregate_s[test_case][metric].get_median()
                    for metric in common.metrics_variance
                }
            )

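
# Example invocation (sketch; runner/benchmark names and the script filename
# are hypothetical):
#
#     python aggregate_median.py my_runner my_benchmark 20240101_000000
#
# which would write my_benchmark-median.csv under
# {common.PERF_RES_PATH}/my_runner/my_benchmark/.
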
if __name__ == "__main__":
    if len(sys.argv) < 4:
        print(
            f"Usage: {sys.argv[0]} <runner name> <test case name> <cutoff date YYYYMMDD_HHMMSS>"
        )
        exit(1)
    if not common.valid_timestamp(sys.argv[3]):
        print(sys.argv)
        print("Bad cutoff timestamp, please use YYYYMMDD_HHMMSS.")
        exit(1)
    common.load_configs()
    #                <runner>,    <test case>, <cutoff>
    aggregate_median(sys.argv[1], sys.argv[2], sys.argv[3])