 def main(args):
     processor = AutoProcessor.from_pretrained(args.model_id)
     tokenizer = processor.tokenizer
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_id).to(args.device)
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_id, torch_dtype=torch.bfloat16).to(args.device)
 
     # create text prompt
     chat = [
@@ -45,24 +45,24 @@ def benchmark(batch, min_new_tokens=None):
     # START TIMING
     start_time = time.time()
 
-    with torch.autocast(model.device.type, enabled=True):
-        model_inputs = processor(
-            texts,
-            audios,
-            device=args.device,  # Computation device; returned tensors are put on CPU
-            return_tensors="pt",
-        ).to(args.device)
-
-        # Model Inference
-        model_outputs = model.generate(
-            **model_inputs,
-            bos_token_id=tokenizer.bos_token_id,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-            repetition_penalty=1.0,
-            **gen_kwargs,
-            min_new_tokens=min_new_tokens,
-        )
+    # with torch.autocast(model.device.type, enabled=True):
+    model_inputs = processor(
+        texts,
+        audios,
+        device=args.device,  # Computation device; returned tensors are put on CPU
+        return_tensors="pt",
+    ).to(args.device)
+
+    # Model Inference
+    model_outputs = model.generate(
+        **model_inputs,
+        bos_token_id=tokenizer.bos_token_id,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        repetition_penalty=1.0,
+        **gen_kwargs,
+        min_new_tokens=min_new_tokens,
+    )
 
     # Transformers includes the input IDs in the response.
     num_input_tokens = model_inputs["input_ids"].shape[-1]
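
For context, a minimal sketch of what this change does, with openai/whisper-tiny standing in for args.model_id (the real script's processor arguments and gen_kwargs are omitted): loading with torch_dtype=torch.bfloat16 stores the weights in bfloat16 from the start, so the runtime torch.autocast wrapper becomes unnecessary. The explicit cast of the inputs in the second half is an assumption of this sketch; whether a given checkpoint casts its inputs internally varies by model.

import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

model_id = "openai/whisper-tiny"  # stand-in checkpoint; the script uses args.model_id
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

# Before this PR: fp32 weights; autocast downcasts matmuls/convs at runtime.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)
with torch.autocast(device, enabled=True):
    out = model.generate(**inputs.to(device))

# After this PR: weights are loaded directly in bfloat16 (roughly half the
# memory of fp32 weights), so no autocast context is needed; here the input
# features are cast explicitly to match the weight dtype.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
).to(device)
out = model.generate(**inputs.to(device=device, dtype=torch.bfloat16))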