@@ -195,6 +195,63 @@ def print_comparison_summary(results: List[Dict]) -> None:
195195 click .echo (f"Std Dev: { statistics .stdev (times ):.4f} seconds" if len (times ) > 1 else "" )
196196
197197
def _print_header(n_tx: int, tx_len: int, vocab: int, min_support: float, rounds: int) -> None:
    """Echo the benchmark banner followed by the dataset parameters in use.

    Args:
        n_tx: Number of synthetic transactions to generate.
        tx_len: Average number of items per transaction.
        vocab: Size of the item vocabulary.
        min_support: Minimum support threshold for the GSP run.
        rounds: How many benchmark rounds will be executed.
    """
    rule = "=" * 60
    click.echo(rule)
    click.echo("GSP PRUNING STRATEGIES BENCHMARK")
    click.echo(rule)
    click.echo("\nDataset Parameters:")
    # NOTE(review): label column widths reconstructed from a mangled paste —
    # confirm alignment against the original output.
    click.echo(f"  Transactions:     {n_tx:,}")
    click.echo(f"  Transaction Len:  {tx_len}")
    click.echo(f"  Vocabulary Size:  {vocab:,}")
    click.echo(f"  Min Support:      {min_support}")
    click.echo(f"  Benchmark Rounds: {rounds}")
209+
210+
def _generate_and_report_data(n_tx: int, tx_len: int, vocab: int) -> List[List[str]]:
    """Create the synthetic transaction set, echoing progress to the console.

    Args:
        n_tx: Number of transactions to generate.
        tx_len: Average number of items per transaction.
        vocab: Size of the item vocabulary.

    Returns:
        The generated list of transactions (each a list of item strings).
    """
    click.echo("\nGenerating synthetic data...")
    dataset = generate_synthetic_data(n_tx, tx_len, vocab)
    click.echo(f"Generated {len(dataset):,} transactions")
    return dataset
217+
218+
219+ def _get_strategies_to_test (strategy : str , n_tx : int , min_support : float ) -> List [Tuple [str , Optional [Dict ]]]:
220+ """Determine which strategies to test based on user input."""
221+ if strategy == "all" :
222+ return [
223+ ("default" , None ),
224+ ("support" , None ),
225+ ("frequency" , {"min_frequency" : max (2 , int (n_tx * min_support ))}),
226+ ("combined" , {"min_frequency" : max (2 , int (n_tx * min_support * 0.8 ))}),
227+ ]
228+ return [(strategy , None )]
229+
230+
231+ def _aggregate_results (all_results : List [Dict ]) -> List [Dict ]:
232+ """Aggregate results across multiple rounds by averaging."""
233+ strategy_results : Dict [str , Dict [str , List ]] = {}
234+ for result in all_results :
235+ strat_name = result ["strategy" ]
236+ if strat_name not in strategy_results :
237+ strategy_results [strat_name ] = {"times" : [], "patterns" : []}
238+ strategy_results [strat_name ]["times" ].append (result ["time" ])
239+ strategy_results [strat_name ]["patterns" ].append (result ["total_patterns" ])
240+
241+ averaged_results = []
242+ for strat_name , data in strategy_results .items ():
243+ averaged_results .append (
244+ {
245+ "strategy" : strat_name ,
246+ "time" : statistics .mean (data ["times" ]),
247+ "total_patterns" : int (statistics .mean (data ["patterns" ])),
248+ "patterns_per_level" : [], # Not averaged for simplicity
249+ "max_level" : 0 , # Not averaged
250+ }
251+ )
252+ return averaged_results
253+
254+
198255@click .command ()
199256@click .option ("--n_tx" , default = 1000 , show_default = True , type = int , help = "Number of transactions" )
200257@click .option ("--tx_len" , default = 8 , show_default = True , type = int , help = "Average items per transaction" )
@@ -214,31 +271,9 @@ def main(n_tx: int, tx_len: int, vocab: int, min_support: float, strategy: str,
214271 This script generates synthetic transactional data and evaluates the performance
215272 of different pruning strategies. Use --strategy all to compare all available strategies.
216273 """
217- click .echo ("=" * 60 )
218- click .echo ("GSP PRUNING STRATEGIES BENCHMARK" )
219- click .echo ("=" * 60 )
220- click .echo (f"\n Dataset Parameters:" )
221- click .echo (f" Transactions: { n_tx :,} " )
222- click .echo (f" Transaction Len: { tx_len } " )
223- click .echo (f" Vocabulary Size: { vocab :,} " )
224- click .echo (f" Min Support: { min_support } " )
225- click .echo (f" Benchmark Rounds: { rounds } " )
226-
227- # Generate data
228- click .echo (f"\n Generating synthetic data..." )
229- transactions = generate_synthetic_data (n_tx , tx_len , vocab )
230- click .echo (f"Generated { len (transactions ):,} transactions" )
231-
232- # Define strategies to test
233- if strategy == "all" :
234- strategies_to_test = [
235- ("default" , None ),
236- ("support" , None ),
237- ("frequency" , {"min_frequency" : max (2 , int (n_tx * min_support ))}),
238- ("combined" , {"min_frequency" : max (2 , int (n_tx * min_support * 0.8 ))}),
239- ]
240- else :
241- strategies_to_test = [(strategy , None )]
274+ _print_header (n_tx , tx_len , vocab , min_support , rounds )
275+ transactions = _generate_and_report_data (n_tx , tx_len , vocab )
276+ strategies_to_test = _get_strategies_to_test (strategy , n_tx , min_support )
242277
243278 # Run benchmarks multiple rounds if specified
244279 all_results = []
@@ -253,33 +288,11 @@ def main(n_tx: int, tx_len: int, vocab: int, min_support: float, strategy: str,
253288
254289 # Print summary
255290 if strategy == "all" :
256- # Aggregate results across rounds
257291 if rounds > 1 :
258292 click .echo (f"\n { '=' * 60 } " )
259293 click .echo ("AVERAGE RESULTS ACROSS ALL ROUNDS" )
260294 click .echo (f"{ '=' * 60 } " )
261-
262- # Group by strategy and average
263- strategy_results = {}
264- for result in all_results :
265- strat_name = result ["strategy" ]
266- if strat_name not in strategy_results :
267- strategy_results [strat_name ] = {"times" : [], "patterns" : []}
268- strategy_results [strat_name ]["times" ].append (result ["time" ])
269- strategy_results [strat_name ]["patterns" ].append (result ["total_patterns" ])
270-
271- averaged_results = []
272- for strat_name , data in strategy_results .items ():
273- averaged_results .append (
274- {
275- "strategy" : strat_name ,
276- "time" : statistics .mean (data ["times" ]),
277- "total_patterns" : int (statistics .mean (data ["patterns" ])),
278- "patterns_per_level" : [], # Not averaged for simplicity
279- "max_level" : 0 , # Not averaged
280- }
281- )
282-
295+ averaged_results = _aggregate_results (all_results )
283296 print_comparison_summary (averaged_results )
284297 else :
285298 print_comparison_summary (results )
0 commit comments