|
6 | 6 | */ |
7 | 7 | // clang-format on |
8 | 8 |
|
9 | | -#include <runtime/compiled_kernel.h> |
10 | | - |
11 | | -#include <codegen.h> |
12 | | -#include <cuda_utils.h> |
13 | | -#include <debug.h> |
14 | | -#include <device_lower/analysis/bank_conflict.h> |
15 | | -#include <device_lower/lower2device.h> |
16 | | -#include <disjoint_set.h> |
17 | | -#include <driver_api.h> |
18 | | -#include <fusion_profiler.h> |
19 | | -#include <global_allocator.h> |
20 | | -#include <instrumentation.h> |
21 | | -#include <ir/all_nodes.h> |
22 | | -#include <ir/utils.h> |
23 | | -#include <iter_visitor.h> |
24 | | -#include <kernel_db/kernel_db.h> |
25 | | -#include <kernel_ir.h> |
26 | | -#include <multidevice/communication.h> |
27 | | -#include <multidevice/communicator.h> |
28 | | -#include <multidevice/utils.h> |
29 | | -#include <options.h> |
30 | | -#include <polymorphic_value.h> |
31 | | -#include <runtime/allocations.h> |
32 | | -#include <runtime/executor_kernel_arg.h> |
33 | | -#include <runtime/executor_utils.h> |
34 | | -#include <serde/utils.h> |
35 | | -#include <tensor_metadata.h> |
36 | | -#include "base.h" |
37 | | - |
38 | | -#include <ATen/core/LegacyTypeDispatch.h> |
39 | | -#include <ATen/cuda/CUDAContext.h> |
40 | | -#include <ATen/cuda/llvm_jit_strings.h> |
41 | | -#include <ATen/native/cuda/jit_utils.h> |
42 | | -#include <c10/core/DeviceGuard.h> |
43 | | -#include <c10/cuda/CUDAFunctions.h> |
44 | | -#include <c10/cuda/CUDAStream.h> |
45 | | -#include <torch/csrc/jit/resource_guard.h> |
| 9 | +#include "runtime/compiled_kernel.h" |
46 | 10 |
|
47 | 11 | #include <array> |
48 | 12 | #include <cmath> |
|
53 | 17 |
|
54 | 18 | #include <cuda_runtime.h> |
55 | 19 |
|
56 | | -#include <nvfuser_resources/argsort.h> |
57 | | -#include <nvfuser_resources/array.h> |
58 | | -#include <nvfuser_resources/basic_type_traits.h> |
59 | | -#include <nvfuser_resources/bf16_support.h> |
60 | | -#include <nvfuser_resources/bit.h> |
61 | | -#include <nvfuser_resources/block_quantization_kernels.h> |
62 | | -#include <nvfuser_resources/block_reduction.h> |
63 | | -#include <nvfuser_resources/block_sync_atomic.h> |
64 | | -#include <nvfuser_resources/block_sync_default.h> |
65 | | -#include <nvfuser_resources/block_welford_outer.h> |
66 | | -#include <nvfuser_resources/broadcast.h> |
67 | | -#include <nvfuser_resources/casts.h> |
68 | | -#include <nvfuser_resources/cluster.h> |
69 | | -#include <nvfuser_resources/complex_number.h> |
70 | | -#include <nvfuser_resources/cub_utils.h> |
71 | | -#include <nvfuser_resources/fp16_support.h> |
72 | | -#include <nvfuser_resources/fp4_support.h> |
73 | | -#include <nvfuser_resources/fp8_support.h> |
74 | | -#include <nvfuser_resources/fused_reduction.h> |
75 | | -#include <nvfuser_resources/fused_welford_helper.h> |
76 | | -#include <nvfuser_resources/fused_welford_impl.h> |
77 | | -#include <nvfuser_resources/fused_welford_impl_outer.h> |
78 | | -#include <nvfuser_resources/grid_broadcast.h> |
79 | | -#include <nvfuser_resources/grid_dependency_control.h> |
80 | | -#include <nvfuser_resources/grid_reduction.h> |
81 | | -#include <nvfuser_resources/grid_sync.h> |
82 | | -#include <nvfuser_resources/helpers.h> |
83 | | -#include <nvfuser_resources/index_utils.h> |
84 | | -#include <nvfuser_resources/mbarrier.h> |
85 | | -#include <nvfuser_resources/memory.h> |
86 | | -#include <nvfuser_resources/random_numbers.h> |
87 | | -#include <nvfuser_resources/scan.h> |
88 | | -#include <nvfuser_resources/tensor.h> |
89 | | -#include <nvfuser_resources/tensor_memory.h> |
90 | | -#include <nvfuser_resources/topk.h> |
91 | | -#include <nvfuser_resources/tuple.h> |
92 | | -#include <nvfuser_resources/type_traits.h> |
93 | | -#include <nvfuser_resources/warp.h> |
94 | | -#include <nvfuser_resources/welford.h> |
| 20 | +#include <ATen/core/LegacyTypeDispatch.h> |
| 21 | +#include <ATen/cuda/CUDAContext.h> |
| 22 | +#include <ATen/cuda/llvm_jit_strings.h> |
| 23 | +#include <ATen/native/cuda/jit_utils.h> |
| 24 | +#include <c10/core/DeviceGuard.h> |
| 25 | +#include <c10/cuda/CUDAFunctions.h> |
| 26 | +#include <c10/cuda/CUDAStream.h> |
| 27 | +#include <torch/csrc/jit/resource_guard.h> |
| 28 | + |
| 29 | +#include "base.h" |
| 30 | +#include "codegen.h" |
| 31 | +#include "cuda_utils.h" |
| 32 | +#include "debug.h" |
| 33 | +#include "device_lower/analysis/bank_conflict.h" |
| 34 | +#include "device_lower/lower2device.h" |
| 35 | +#include "disjoint_set.h" |
| 36 | +#include "driver_api.h" |
| 37 | +#include "fusion_profiler.h" |
| 38 | +#include "global_allocator.h" |
| 39 | +#include "instrumentation.h" |
| 40 | +#include "ir/all_nodes.h" |
| 41 | +#include "ir/utils.h" |
| 42 | +#include "iter_visitor.h" |
| 43 | +#include "kernel_db/kernel_db.h" |
| 44 | +#include "kernel_ir.h" |
| 45 | +#include "multidevice/communication.h" |
| 46 | +#include "multidevice/communicator.h" |
| 47 | +#include "multidevice/utils.h" |
| 48 | +#include "nvfuser_resources/argsort.h" |
| 49 | +#include "nvfuser_resources/array.h" |
| 50 | +#include "nvfuser_resources/basic_type_traits.h" |
| 51 | +#include "nvfuser_resources/bf16_support.h" |
| 52 | +#include "nvfuser_resources/bit.h" |
| 53 | +#include "nvfuser_resources/block_quantization_kernels.h" |
| 54 | +#include "nvfuser_resources/block_reduction.h" |
| 55 | +#include "nvfuser_resources/block_sync_atomic.h" |
| 56 | +#include "nvfuser_resources/block_sync_default.h" |
| 57 | +#include "nvfuser_resources/block_welford_outer.h" |
| 58 | +#include "nvfuser_resources/broadcast.h" |
| 59 | +#include "nvfuser_resources/casts.h" |
| 60 | +#include "nvfuser_resources/cluster.h" |
| 61 | +#include "nvfuser_resources/complex_number.h" |
| 62 | +#include "nvfuser_resources/cub_utils.h" |
| 63 | +#include "nvfuser_resources/fp16_support.h" |
| 64 | +#include "nvfuser_resources/fp4_support.h" |
| 65 | +#include "nvfuser_resources/fp8_support.h" |
| 66 | +#include "nvfuser_resources/fused_reduction.h" |
| 67 | +#include "nvfuser_resources/fused_welford_helper.h" |
| 68 | +#include "nvfuser_resources/fused_welford_impl.h" |
| 69 | +#include "nvfuser_resources/fused_welford_impl_outer.h" |
| 70 | +#include "nvfuser_resources/grid_broadcast.h" |
| 71 | +#include "nvfuser_resources/grid_dependency_control.h" |
| 72 | +#include "nvfuser_resources/grid_reduction.h" |
| 73 | +#include "nvfuser_resources/grid_sync.h" |
| 74 | +#include "nvfuser_resources/helpers.h" |
| 75 | +#include "nvfuser_resources/index_utils.h" |
| 76 | +#include "nvfuser_resources/mbarrier.h" |
| 77 | +#include "nvfuser_resources/memory.h" |
| 78 | +#include "nvfuser_resources/random_numbers.h" |
| 79 | +#include "nvfuser_resources/scan.h" |
| 80 | +#include "nvfuser_resources/tensor.h" |
| 81 | +#include "nvfuser_resources/tensor_memory.h" |
| 82 | +#include "nvfuser_resources/topk.h" |
| 83 | +#include "nvfuser_resources/tuple.h" |
| 84 | +#include "nvfuser_resources/type_traits.h" |
| 85 | +#include "nvfuser_resources/warp.h" |
| 86 | +#include "nvfuser_resources/welford.h" |
| 87 | +#include "options.h" |
| 88 | +#include "polymorphic_value.h" |
| 89 | +#include "runtime/allocations.h" |
| 90 | +#include "runtime/executor_kernel_arg.h" |
| 91 | +#include "runtime/executor_utils.h" |
| 92 | +#include "serde/utils.h" |
| 93 | +#include "tensor_metadata.h" |
95 | 94 |
|
96 | 95 | namespace nvfuser { |
97 | 96 |
|
|
0 commit comments