Merge pull request #64 from tsisw/FIR-1031

akapoor3518 · web-flow · commit 22d8b0067cd8 · 2025-10-17T15:07:26.000-07:00
@FIR-1031 - GGML: Add TSI kernel count for each op to the perf table
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -662,9 +662,10 @@ extern "C" {
 
 #if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t perf_runs;
+        int64_t tsi_kernel_runs;
         int64_t perf_time_us;
         enum ggml_compute_backend_type ggml_compute_backend;
-        char padding[4];
+        char padding[12];
 #else
         char padding[8];
 #endif /* GML_PERF-related flag */
@@ -2561,11 +2562,13 @@ extern "C" {
 struct ggml_perf_backend_subtotals {
     int64_t total_us;
     int64_t runs;
+    int64_t tsi_kernel_count;
 };
 
 struct ggml_perf_unary_subtotals {
     int64_t total_us;
     int64_t runs;
+    int64_t tsi_kernel_count;
 };
 // internal perf accumulation struct
 struct ggml_perf_totals {
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -1225,6 +1225,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                         val[0] = scale;
                         ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, glob_buf);
                         ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+                        ++node->tsi_kernel_runs;
 	            }
 	        }
 	    }
@@ -1258,6 +1259,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                     // kernel call
                     ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);
                     ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+                    ++node->tsi_kernel_runs;
                 }
             }
         }
@@ -1372,6 +1374,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input[kernel_sub_type](srcP0, nodeP);
 	}
         ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+        ++node->tsi_kernel_runs;
 
         if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
           log_data.data_type = GGML_TSAVORITE_TENSOR_NODE;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -7249,12 +7249,14 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
         if (be >= GGML_COMPUTE_BACKEND_CPU && be < GGML_COMPUTE_BACKEND_COUNT) {
             totals[op].backend_subtotals[be].total_us += node->perf_time_us;
 	    totals[op].backend_subtotals[be].runs     += node->perf_runs;
+	    totals[op].backend_subtotals[be].tsi_kernel_count   += node->tsi_kernel_runs;
         }
 
         if (op == GGML_OP_UNARY) {
             enum ggml_unary_op subop = ggml_get_unary_op(node);
             totals[op].unary_subtotals[subop].total_us += node->perf_time_us;
             totals[op].unary_subtotals[subop].runs     += node->perf_runs;
+            totals[op].unary_subtotals[subop].tsi_kernel_count   += node->tsi_kernel_runs;
         }
     }
 }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -2791,7 +2791,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
 #elif defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
-    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7s  %14s  %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
+    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7s  %14s  %16s  %16s\n", "Op", "Target", "Runs", "TSI_KERNEL-RUN", "Total us", "Avg us");
 
     for (int i = 0; i < GGML_OP_COUNT; ++i) {
         if (totals[i].runs > 0) {
@@ -2801,10 +2801,11 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
                     char padded_backend[7] = {0}; // 6 chars + null terminator
                     snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name);
 
-                    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7ld  %14ld  %16.2f\n",
+                    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7ld  %14ld  %16ld  %16.2f\n",
                         totals[i].op_name ? totals[i].op_name : "UNKNOWN",
                         padded_backend,
                         totals[i].backend_subtotals[b].runs,
+                        totals[i].backend_subtotals[b].tsi_kernel_count,
                         totals[i].backend_subtotals[b].total_us,
                         (double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
                 }
@@ -2826,10 +2827,11 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
                         char padded_backend[7] = {0};
                         snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name ? backend_name : "UNK");
 
-                        LLAMA_LOG_TSAVORITE("    -> %-11s %-8s %7ld  %14ld  %16.2f\n",
+                        LLAMA_LOG_TSAVORITE("    -> %-11s %-8s %7ld  %14ld  %16ld  %16.2f\n",
                             ggml_unary_op_name((enum ggml_unary_op) j),
                             padded_backend,
                             totals[i].unary_subtotals[j].runs,
+                            totals[i].unary_subtotals[j].tsi_kernel_count,
                             totals[i].unary_subtotals[j].total_us,
                             (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
                     }

Original file line number	Diff line number	Diff line change
`@@ -1225,6 +1225,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,`
`1225`	`1225`	`val[0] = scale;`
`1226`	`1226`	`ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, glob_buf);`
`1227`	`1227`	`++device->stats.op_run_count[kernel_type].num_of_kernel_call;`
	`1228`	`+ ++node->tsi_kernel_runs;`
`1228`	`1229`	`}`
`1229`	`1230`	`}`
`1230`	`1231`	`}`
`@@ -1258,6 +1259,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,`
`1258`	`1259`	`// kernel call`
`1259`	`1260`	`ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);`
`1260`	`1261`	`++device->stats.op_run_count[kernel_type].num_of_kernel_call;`
	`1262`	`+ ++node->tsi_kernel_runs;`
`1261`	`1263`	`}`
`1262`	`1264`	`}`
`1263`	`1265`	`}`
`@@ -1372,6 +1374,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,`
`1372`	`1374`	`ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input[kernel_sub_type](srcP0, nodeP);`
`1373`	`1375`	`}`
`1374`	`1376`	`++device->stats.op_run_count[kernel_type].num_of_kernel_call;`
	`1377`	`+ ++node->tsi_kernel_runs;`
`1375`	`1378`
`1376`	`1379`	`if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {`
`1377`	`1380`	`log_data.data_type = GGML_TSAVORITE_TENSOR_NODE;`
Original file line number	Diff line number	Diff line change
`@@ -7249,12 +7249,14 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct`
`7249`	`7249`	`if (be >= GGML_COMPUTE_BACKEND_CPU && be < GGML_COMPUTE_BACKEND_COUNT) {`
`7250`	`7250`	`totals[op].backend_subtotals[be].total_us += node->perf_time_us;`
`7251`	`7251`	`totals[op].backend_subtotals[be].runs += node->perf_runs;`
	`7252`	`+ totals[op].backend_subtotals[be].tsi_kernel_count += node->tsi_kernel_runs;`
`7252`	`7253`	`}`
`7253`	`7254`
`7254`	`7255`	`if (op == GGML_OP_UNARY) {`
`7255`	`7256`	`enum ggml_unary_op subop = ggml_get_unary_op(node);`
`7256`	`7257`	`totals[op].unary_subtotals[subop].total_us += node->perf_time_us;`
`7257`	`7258`	`totals[op].unary_subtotals[subop].runs += node->perf_runs;`
	`7259`	`+ totals[op].unary_subtotals[subop].tsi_kernel_count += node->tsi_kernel_runs;`
`7258`	`7260`	`}`
`7259`	`7261`	`}`
`7260`	`7262`	`}`