@@ -583,6 +583,133 @@ void run_long_kernel(ze_context_handle_t context, ze_device_handle_t device,
583
583
}
584
584
}
585
585
586
+ void run_long_kernel_scratch (ze_context_handle_t context,
587
+ ze_device_handle_t device,
588
+ process_synchro &synchro, debug_options &options) {
589
+
590
+ auto command_list = lzt::create_command_list (device);
591
+ auto command_queue = lzt::create_command_queue (device);
592
+ std::string module_name = options.module_name_in ;
593
+
594
+ std::string kernel_name = " long_kernel_slm" ;
595
+ size_t slm_buffer_size = 512 ; // NOTE: Not all SKUs have same SLM so can go too big.
596
+
597
+ synchro.wait_for_debugger_signal ();
598
+ const char *build_flags =" -g -igc_opts 'VISAOptions=-forcespills'" ;
599
+ auto module =
600
+ lzt::create_module (device, module_name, ZE_MODULE_FORMAT_IL_SPIRV,
601
+ build_flags /* include debug symbols*/ , nullptr );
602
+
603
+ auto kernel = lzt::create_function (module , kernel_name);
604
+ auto size = slm_buffer_size;
605
+
606
+ ze_kernel_properties_t kernel_properties = {
607
+ ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES, nullptr };
608
+ EXPECT_EQ (ZE_RESULT_SUCCESS,
609
+ zeKernelGetProperties (kernel, &kernel_properties));
610
+ int threadCount = std::ceil (size / kernel_properties.maxSubgroupSize );
611
+
612
+ LOG_INFO << " [Application] Problem size: " << size
613
+ << " . Kernel maxSubGroupSize: " << kernel_properties.maxSubgroupSize
614
+ << " . GPU thread count: ceil (P size/maxSubGroupSize) = "
615
+ << threadCount;
616
+
617
+ auto dest_buffer_d =
618
+ lzt::allocate_device_memory (size, size, 0 , 0 , device, context);
619
+ auto dest_buffer_s =
620
+ lzt::allocate_shared_memory (size, size, 0 , 0 , device, context);
621
+ auto src_buffer_d =
622
+ lzt::allocate_device_memory (size, size, 0 , 0 , device, context);
623
+ auto src_buffer_s =
624
+ lzt::allocate_shared_memory (size, size, 0 , 0 , device, context);
625
+
626
+ void *slm_output_s = nullptr ;
627
+ slm_output_s = lzt::allocate_shared_memory (slm_buffer_size, slm_buffer_size,
628
+ 0 , 0 , device, context);
629
+
630
+ unsigned long loop_max = 1000000000 ;
631
+
632
+ auto loop_counter_d = lzt::allocate_device_memory (
633
+ loop_counter_alloc_size, loop_counter_alloc_size, 0 , 0 , device, context);
634
+ auto loop_counter_s = lzt::allocate_shared_memory (
635
+ loop_counter_alloc_size, loop_counter_alloc_size, 0 , 0 , device, context);
636
+
637
+ LOG_DEBUG << " [Application] Allocated source device memory at: " << std::hex
638
+ << src_buffer_d;
639
+ LOG_DEBUG << " [Application] Allocated destination device memory at: "
640
+ << std::hex << dest_buffer_d;
641
+
642
+ std::memset (dest_buffer_s, 1 , size);
643
+ std::memset (src_buffer_s, 0 , size);
644
+ std::memset (loop_counter_s, 0 , loop_counter_alloc_size);
645
+ for (size_t i = 0 ; i < size; i++) {
646
+ static_cast <uint8_t *>(src_buffer_s)[i] = (i + 1 & 0xFF );
647
+ }
648
+
649
+ lzt::set_argument_value (kernel, 0 , sizeof (dest_buffer_d), &dest_buffer_d);
650
+ lzt::set_argument_value (kernel, 1 , sizeof (src_buffer_d), &src_buffer_d);
651
+ lzt::set_argument_value (kernel, 2 , sizeof (loop_counter_d), &loop_counter_d);
652
+ lzt::set_argument_value (kernel, 3 , sizeof (loop_max), &loop_max);
653
+ lzt::set_argument_value (kernel, 4 , sizeof (slm_output_s), &slm_output_s);
654
+
655
+ uint32_t group_size_x = 1 ;
656
+ uint32_t group_size_y = 1 ;
657
+ uint32_t group_size_z = 1 ;
658
+ lzt::suggest_group_size (kernel, size, 1 , 1 , group_size_x, group_size_y,
659
+ group_size_z);
660
+ lzt::set_group_size (kernel, group_size_x, 1 , 1 );
661
+ ze_group_count_t group_count = {};
662
+ group_count.groupCountX = size / group_size_x;
663
+ group_count.groupCountY = 1 ;
664
+ group_count.groupCountZ = 1 ;
665
+
666
+ lzt::append_memory_copy (command_list, src_buffer_d, src_buffer_s, size);
667
+ lzt::append_barrier (command_list);
668
+ lzt::append_launch_function (command_list, kernel, &group_count, nullptr , 0 ,
669
+ nullptr );
670
+ lzt::append_barrier (command_list);
671
+ lzt::append_memory_copy (command_list, dest_buffer_s, dest_buffer_d, size);
672
+ lzt::append_memory_copy (command_list, loop_counter_s, loop_counter_d,
673
+ loop_counter_alloc_size);
674
+ lzt::close_command_list (command_list);
675
+
676
+ LOG_DEBUG << " [Application] launching execution of " << kernel_name;
677
+
678
+ synchro.update_gpu_buffer_address (reinterpret_cast <uint64_t >(src_buffer_d));
679
+ synchro.notify_debugger ();
680
+
681
+ lzt::execute_command_lists (command_queue, 1 , &command_list, nullptr );
682
+ lzt::synchronize (command_queue, UINT64_MAX);
683
+
684
+ for (size_t i = 1 ; i < size; i++) {
685
+ EXPECT_EQ (static_cast <uint8_t *>(dest_buffer_s)[i],
686
+ static_cast <uint8_t *>(src_buffer_s)[i]);
687
+ if (static_cast <uint8_t *>(dest_buffer_s)[i] !=
688
+ static_cast <uint8_t *>(src_buffer_s)[i]) {
689
+ LOG_ERROR << " [Application] Buffer Sanity check did not pass" ;
690
+ break ;
691
+ }
692
+ }
693
+
694
+ // cleanup
695
+ lzt::free_memory (context, dest_buffer_s);
696
+ lzt::free_memory (context, dest_buffer_d);
697
+ lzt::free_memory (context, src_buffer_s);
698
+ lzt::free_memory (context, src_buffer_d);
699
+ lzt::free_memory (context, loop_counter_s);
700
+ lzt::free_memory (context, loop_counter_d);
701
+ lzt::free_memory (context, slm_output_s);
702
+
703
+ lzt::destroy_function (kernel);
704
+ lzt::destroy_module (module );
705
+ lzt::destroy_command_list (command_list);
706
+ lzt::destroy_command_queue (command_queue);
707
+
708
+ if (::testing::Test::HasFailure ()) {
709
+ exit (1 );
710
+ }
711
+ }
712
+
586
713
void run_multiple_threads (ze_context_handle_t context,
587
714
ze_device_handle_t device, process_synchro &synchro,
588
715
debug_options &options) {
@@ -1227,6 +1354,11 @@ int main(int argc, char **argv) {
1227
1354
options.kernel_name_in = " long_kernel_slm" ;
1228
1355
run_long_kernel (context, device, synchro, options);
1229
1356
break ;
1357
+ case LONG_RUNNING_KERNEL_INTERRUPTED_SCRATCH:
1358
+ options.use_custom_module = true ;
1359
+ options.module_name_in = " debug_loop_slm.spv" ;
1360
+ run_long_kernel_scratch (context, device, synchro, options);
1361
+ break ;
1230
1362
case MULTIPLE_THREADS:
1231
1363
run_multiple_threads (context, device, synchro, options);
1232
1364
break ;
0 commit comments