Skip to content

Commit 3a89c7b

Browse files
minor changes per Derrick's feedback
Signed-off-by: cliu-us <[email protected]>
1 parent bc9155d commit 3a89c7b

File tree

2 files changed

+4
-3
lines changed

2 files changed

+4
-3
lines changed

examples/QAT_INT8/run_qa_no_trainer_qat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ def parse_args():
389389
parser.add_argument(
390390
"--do_lowering",
391391
type=str,
392-
default=None,
392+
default="triton",
393393
help="convert QAT model to utilize real INT8 GPU kernel, 'cutlass' or 'triton'",
394394
)
395395

fms_mo/modules/linear.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,8 +1049,9 @@ def iaddmm_int(self, bias, m1, m2):
10491049
imm_out = torch.bitwise_right_shift(
10501050
imm_out + round_bit, self.truncate_lsb
10511051
)
1052-
# imm_out = imm_out.to(torch.int16)
1053-
# only cast to i16 when truncating 8b from both side
1052+
# could cast to smaller data type to further simulate HW behavior, for example,
1053+
# if HW truncates 8b from both sides of i32 accumulator, the remaining data can
1054+
# be cast to i16 to be more realistic. pay attention to overflow handling
10541055
fp16_out += imm_out.to(torch.float16)
10551056

10561057
return (

0 commit comments

Comments (0)