|
21 | 21 | --batches BATCHES [BATCHES ...] |
22 | 22 | --input-length INPUT_LENGTH |
23 | 23 | --out-dir OUT_DIR |
| 24 | + --iterations ITERATIONS |
| 25 | + --warmup-runs WARMUP_RUNS |
| 26 | + --output-length OUTPUT_LENGTH |
24 | 27 | """ |
25 | 28 |
|
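The three new flags expose the scenario's measurement controls on the command line: how many measured iterations to run, how many warmup runs to discard, and a fixed number of generated tokens. As an illustration only, a full run could be driven as below; the script filename, model id, and config name are assumptions, not taken from this change.

```python
# Illustrative sketch only: the script path, model id, and config name below
# are assumptions, not part of this diff.
import subprocess

subprocess.run(
    [
        "python", "inference_benchmark.py",  # hypothetical script name
        "facebook/opt-350m",                 # hypothetical model checkpoint
        "--configs", "fp16",                 # assumed key of WEIGHTS_CONFIGS
        "--batches", "1", "8",
        "--input-length", "64",
        "--output-length", "64",             # fixes min/max_new_tokens
        "--iterations", "10",                # measured iterations per benchmark
        "--warmup-runs", "10",               # warmup runs discarded before measurement
        "--out-dir", "reports",
    ],
    check=True,
)
```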
26 | 29 | import argparse |
|
30 | 33 | from optimum_benchmark.logging_utils import setup_logging |
31 | 34 | import torch |
32 | 35 |
|
| 36 | +torch.backends.cudnn.benchmark = False |
| 37 | +torch.backends.cudnn.deterministic = True |
| 38 | + |
33 | 39 | BFLOAT16_SUPPORT = torch.cuda.get_device_capability()[0] >= 8 |
34 | 40 |
|
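The two cuDNN flags added above trade a little raw speed for run-to-run consistency: with `benchmark = False` cuDNN stops autotuning a potentially different kernel on each invocation, and `deterministic = True` forces deterministic algorithm selection, so repeated benchmark runs exercise the same code paths. `BFLOAT16_SUPPORT` keys off the CUDA compute capability; major version 8 corresponds to Ampere and newer, the first generation with bfloat16 support. A minimal sketch, not part of the change, showing how that check resolves on the current device:

```python
# Minimal sketch (not part of the diff): resolve the same capability check
# on the current GPU. Compute capability 8.x (Ampere) and newer supports
# bfloat16; older devices would fall back to float16.
import torch

major, minor = torch.cuda.get_device_capability()
dtype = torch.bfloat16 if major >= 8 else torch.float16
print(f"compute capability {major}.{minor} -> preferred dtype {dtype}")
```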
35 | 41 | WEIGHTS_CONFIGS = { |
|
73 | 79 | }, |
74 | 80 | } |
75 | 81 |
|
76 | | -if __name__ == "__main__": |
77 | | - setup_logging(level="INFO") |
78 | 82 |
|
| 83 | +def parse_args(): |
79 | 84 | parser = argparse.ArgumentParser(description="bitsandbytes inference benchmark tool") |
80 | 85 |
|
81 | 86 | parser.add_argument("model_id", type=str, help="The model checkpoint to use.") |
|
98 | 103 |
|
99 | 104 | parser.add_argument("--out-dir", type=str, default="reports") |
100 | 105 |
|
101 | | - args = parser.parse_args() |
| 106 | + parser.add_argument("--iterations", type=int, default=10, help="Number of iterations for each benchmark run") |
| 107 | + parser.add_argument( |
| 108 | + "--warmup-runs", type=int, default=10, help="Number of warmup runs to discard before measurement" |
| 109 | + ) |
| 110 | + parser.add_argument( |
| 111 | + "--output-length", |
| 112 | + type=int, |
| 113 | + default=64, |
|  | 114 | +        help="Number of new tokens to generate; `max_new_tokens` and `min_new_tokens` are both set to this value.", |
| 115 | + ) |
| 116 | + |
| 117 | + return parser.parse_args() |
| 118 | + |
| 119 | + |
| 120 | +def run_benchmark(args, config, batch_size): |
| 121 | + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn", start_method="spawn") |
| 122 | + scenario_config = InferenceConfig( |
| 123 | + latency=True, |
| 124 | + memory=True, |
| 125 | + input_shapes={"batch_size": batch_size, "sequence_length": args.input_length}, |
| 126 | + iterations=args.iterations, |
| 127 | + warmup_runs=args.warmup_runs, |
| 128 | + # set duration to 0 to disable the duration-based stopping criterion |
| 129 | + # this is IMPORTANT to ensure that all benchmarks run the same number of operations, regardless of hardware speed/bottlenecks |
| 130 | + duration=0, |
| 131 | + # for consistent results, set a fixed min and max for output tokens |
| 132 | + generate_kwargs={"min_new_tokens": args.output_length, "max_new_tokens": args.output_length}, |
| 133 | + forward_kwargs={"min_new_tokens": args.output_length, "max_new_tokens": args.output_length}, |
| 134 | + ) |
| 135 | + |
| 136 | + backend_config = PyTorchConfig( |
| 137 | + device="cuda", |
| 138 | + device_ids="0", |
| 139 | + device_map="auto", |
| 140 | + no_weights=False, |
| 141 | + model=args.model_id, |
| 142 | + **WEIGHTS_CONFIGS[config], |
| 143 | + ) |
| 144 | + |
| 145 | + test_name = ( |
| 146 | + f"benchmark-{config}" |
| 147 | + f"-bsz-{batch_size}" |
| 148 | + f"-isz-{args.input_length}" |
| 149 | + f"-osz-{args.output_length}" |
| 150 | + f"-iter-{args.iterations}" |
| 151 | + f"-wrmup-{args.warmup_runs}" |
| 152 | + ) |
| 153 | + benchmark_config = BenchmarkConfig( |
| 154 | + name=test_name, |
| 155 | + scenario=scenario_config, |
| 156 | + launcher=launcher_config, |
| 157 | + backend=backend_config, |
| 158 | + ) |
| 159 | + |
| 160 | + out_path = out_dir / (test_name + ".json") |
| 161 | + print(f"[{test_name}] Starting:") |
| 162 | + benchmark_report = Benchmark.launch(benchmark_config) |
| 163 | + benchmark_report.save_json(out_path) |
| 164 | + |
| 165 | + |
| 166 | +if __name__ == "__main__": |
| 167 | + setup_logging(level="INFO") |
| 168 | + args = parse_args() |
102 | 169 |
|
103 | 170 | out_dir = Path(args.out_dir) |
104 | 171 | out_dir.mkdir(parents=True, exist_ok=True) |
105 | 172 |
|
106 | 173 | for batch_size in args.batches: |
107 | | - print(f"Benchmarking batch size: {batch_size}") |
108 | 174 | for config in args.configs: |
109 | | - launcher_config = ProcessConfig(device_isolation=True, start_method="spawn") |
110 | | - scenario_config = InferenceConfig( |
111 | | - latency=True, |
112 | | - memory=True, |
113 | | - input_shapes={"batch_size": batch_size, "sequence_length": args.input_length}, |
114 | | - ) |
115 | | - backend_config = PyTorchConfig( |
116 | | - device="cuda", |
117 | | - device_ids="0", |
118 | | - device_map="auto", |
119 | | - no_weights=False, |
120 | | - model=args.model_id, |
121 | | - **WEIGHTS_CONFIGS[config], |
122 | | - ) |
123 | | - benchmark_config = BenchmarkConfig( |
124 | | - name=f"benchmark-{config}-bsz{batch_size}", |
125 | | - scenario=scenario_config, |
126 | | - launcher=launcher_config, |
127 | | - backend=backend_config, |
128 | | - ) |
129 | | - |
130 | | - out_path = out_dir / f"benchmark_{config}_bsz{batch_size}.json" |
131 | | - |
132 | | - benchmark_report = Benchmark.launch(benchmark_config) |
133 | | - benchmark_report.log() |
134 | | - benchmark_report.save_json(out_path) |
| 175 | + run_benchmark(args, config, batch_size) |
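With the per-configuration logic moved into `run_benchmark`, each (config, batch size) pair now writes one JSON report whose filename encodes the full parameter set. A minimal sketch for collecting those reports afterwards, assuming only that `--out-dir` was left at its default of `reports`; the report's internal layout is defined by optimum-benchmark and is not assumed here:

```python
# Minimal sketch, not part of the diff: list the generated reports and peek
# at their top-level sections. The fields inside each report come from
# optimum-benchmark and are deliberately not assumed here.
import json
from pathlib import Path

out_dir = Path("reports")  # the script's default --out-dir
for report_path in sorted(out_dir.glob("benchmark-*.json")):
    with report_path.open() as f:
        report = json.load(f)
    print(f"{report_path.name}: {sorted(report)}")  # top-level keys only
```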