
Commit 054f249

gemma3 bf16 works

1 parent 132c342 commit 054f249

127 files changed, +11211 −1383 lines


AOTI_SYMBOL_FIX.md

Lines changed: 131 additions & 0 deletions
# AOTI Symbol Linking Fix

## Problem

Running `gemma3_e2e_runner` fails with a symbol lookup error:

```
./cmake-out/examples/models/gemma3/gemma3_e2e_runner: symbol lookup error: /tmp/token_embedding_so_blob68945.so: undefined symbol: aoti_torch_dtype_bfloat16
```

## Root Cause

1. **The AOTI shim functions are defined in a header**
   - `/home/gasoonjia/executorch/backends/cuda/runtime/shims/aoti_torch/c/shim.h` contains the definitions of all AOTI functions (including `aoti_torch_dtype_bfloat16`).

2. **The symbol-export macro is misconfigured** (see the sketch after this list)
   - In `/home/gasoonjia/executorch/backends/cuda/runtime/shims/aoti_torch/c/macros.h`:

   ```c
   #ifdef EXPORT_AOTI_FUNCTIONS
   #define AOTI_TORCH_EXPORT __attribute__((visibility("default")))
   #else
   #define AOTI_TORCH_EXPORT inline
   #endif
   ```

   - When `EXPORT_AOTI_FUNCTIONS` is **not defined**, every function is marked `inline`.
   - `inline` functions are not exported as external symbols into the dynamic symbol table.
   - The AOT-compiled `.so` files (such as `/tmp/token_embedding_so_blob68945.so`) therefore cannot find these symbols at runtime.

3. **The main executable does not export its symbols**
   - The `gemma3_e2e_runner` executable must be linked with `--export-dynamic`.
   - Only then can dynamically loaded libraries resolve symbols defined in the main program.

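To make the export behavior concrete, here is a minimal, self-contained sketch (not part of this commit; `aoti_demo_exported_dtype` and `aoti_demo_inline_dtype` are made-up names). Built as a shared library with hidden default visibility, only the function annotated with `visibility("default")` lands in the dynamic symbol table, which is exactly what the AOT-compiled blobs need in order to resolve the shim functions.

```cpp
// visibility_demo.cpp -- illustrative sketch only, not part of this commit.
// Build:  g++ -shared -fPIC -fvisibility=hidden visibility_demo.cpp -o libdemo.so
// Check:  nm -D libdemo.so   # aoti_demo_exported_dtype is listed, the inline one is not

#include <cstdint>

// What AOTI_TORCH_EXPORT expands to when EXPORT_AOTI_FUNCTIONS is defined:
// the function is exported into the dynamic symbol table, so a dlopen()'d
// blob that references it can be resolved at load time.
extern "C" __attribute__((visibility("default"))) int32_t
aoti_demo_exported_dtype() {
  return 15; // arbitrary value, stands in for a dtype code
}

// What the broken configuration produced: an inline function defined in a
// header normally leaves no externally visible symbol behind, so a blob that
// references it fails with "undefined symbol" at runtime.
extern "C" inline int32_t aoti_demo_inline_dtype() {
  return 15;
}
```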
## Fix

### Change 1: Add the EXPORT_AOTI_FUNCTIONS compile definition

**File**: `/home/gasoonjia/executorch/backends/cuda/CMakeLists.txt`

Add a compile definition to the `aoti_cuda` target:

```cmake
# Define EXPORT_AOTI_FUNCTIONS to export AOTI shim symbols
target_compile_definitions(aoti_cuda PUBLIC EXPORT_AOTI_FUNCTIONS)
```

**Effect**:
- `AOTI_TORCH_EXPORT` now expands to `__attribute__((visibility("default")))`.
- All AOTI shim functions are exported into the dynamic symbol table.
- The AOT-compiled libraries can find these symbols at runtime.

### Change 2: Add the --export-dynamic link option

**File**: `/home/gasoonjia/executorch/examples/models/gemma3/CMakeLists.txt`

Add a link option to the `gemma3_e2e_runner` target:

```cmake
# Export dynamic symbols for AOTI runtime linking
if(NOT APPLE)
  target_link_options(gemma3_e2e_runner PRIVATE "LINKER:--export-dynamic")
else()
  target_link_options(gemma3_e2e_runner PRIVATE "LINKER:-export_dynamic")
endif()
```

**Effect**:
- The main program's symbols become visible in its dynamic symbol table.
- Dynamically loaded `.so` files can resolve against them.
- Works on both Linux and macOS.

## How to Rebuild

After making the changes, rebuild the project:

```bash
# Clean the previous build (optional but recommended)
rm -rf cmake-out/examples/models/gemma3

# Re-configure CMake
cmake -DCMAKE_BUILD_TYPE=Release \
  -DEXECUTORCH_BUILD_CUDA=ON \
  -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
  -B cmake-out/examples/models/gemma3

# Build
cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release -j
```

**Note**: If VSCode's clangd shows hint-level diagnostics such as "Unknown type name 'AOTI_TORCH_EXPORT'" in `shim.h`, that is expected: clangd's index simply has not been refreshed yet. The hints disappear after a rebuild, and the actual compilation is unaffected.

## Verifying the Fix

Re-run `verify.sh`, or run the runner directly:

```bash
./cmake-out/examples/models/gemma3/gemma3_e2e_runner \
  --model_path=<your_model_path> \
  --tokenizer_path=<your_tokenizer_path>
```

If the fix works, the `undefined symbol: aoti_torch_dtype_bfloat16` error no longer appears.

## Technical Details

### Why EXPORT_AOTI_FUNCTIONS?

Code generated by AOT Inductor calls functions provided by the AOTI runtime at execution time, including:

- dtype helpers: `aoti_torch_dtype_*`
- device-type helpers: `aoti_torch_device_type_*`
- tensor operations: `aoti_torch_create_tensor_from_blob`, `aoti_torch_empty_strided`, etc.
- memory management: `aoti_torch_delete_tensor_object`

These functions must be visible in the dynamic symbol table so that the `.so` files loaded via dlopen can resolve them.

### Why --export-dynamic?

By default on Linux, an executable's symbols are not placed in the dynamic symbol table. The `--export-dynamic` option (a runtime check is sketched after this list):

1. adds all global symbols to the dynamic symbol table;
2. lets dlopen-loaded libraries look up those symbols (via dlsym or normal relocation);
3. establishes the correct symbol bindings at runtime.

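As a quick runtime diagnostic (again not part of the commit), the sketch below asks the dynamic loader whether a symbol is visible in the process's global lookup scope, which is the same scope a dlopen()'d AOTI blob resolves against; without `--export-dynamic`, a shim symbol such as `aoti_torch_dtype_bfloat16` defined in the executable would not be found. A Linux/glibc environment is assumed.

```cpp
// symbol_check.cpp -- diagnostic sketch, assuming Linux/glibc.
// Build: g++ symbol_check.cpp -o symbol_check -ldl
// Try building it with and without -Wl,--export-dynamic to see the difference
// for symbols defined in the executable itself.
#include <dlfcn.h>
#include <cstdio>

int main(int argc, char** argv) {
  // Name of the symbol to look up; defaults to the one from the error message.
  const char* name = (argc > 1) ? argv[1] : "aoti_torch_dtype_bfloat16";

  // RTLD_DEFAULT searches the global scope: the executable's dynamic symbol
  // table plus every shared library loaded so far.
  void* addr = dlsym(RTLD_DEFAULT, name);
  if (addr != nullptr) {
    std::printf("%s is visible at %p\n", name, addr);
  } else {
    std::printf("%s is NOT visible: %s\n", name, dlerror());
  }
  return addr != nullptr ? 0 : 1;
}
```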
## Related Files

- `/home/gasoonjia/executorch/backends/cuda/runtime/shims/aoti_torch/c/shim.h` - AOTI function definitions
- `/home/gasoonjia/executorch/backends/cuda/runtime/shims/aoti_torch/c/macros.h` - export macro definitions
- `/home/gasoonjia/executorch/backends/cuda/runtime/cuda_backend.cpp` - CUDA backend implementation
- `/home/gasoonjia/executorch/backends/cuda/CMakeLists.txt` - CUDA backend build configuration
- `/home/gasoonjia/executorch/examples/models/gemma3/CMakeLists.txt` - Gemma3 runner build configuration

backends/cuda/CMakeLists.txt

Lines changed: 7 additions & 2 deletions
```diff
@@ -32,7 +32,7 @@ find_package(CUDAToolkit REQUIRED)
 
 # Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
-find_package_torch()
+find_package_torch_headers()
 
 # CUDA tensor maker for backends that support incontiguous tensors
 set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
@@ -70,7 +70,6 @@ install(
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp
-    runtime/guard.cpp
     runtime/platform/platform.cpp
 )
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
@@ -86,6 +85,12 @@ target_compile_options(
   aoti_cuda PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
                    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
 )
+# Define EXPORT_AOTI_FUNCTIONS to export AOTI shim symbols
+target_compile_definitions(aoti_cuda PUBLIC EXPORT_AOTI_FUNCTIONS)
+
+# Define USE_CUDA to enable CUDA-specific functionality
+target_compile_definitions(aoti_cuda PUBLIC USE_CUDA)
+
 # Ensure symbols are exported properly
 target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
```

backends/cuda/runtime/TARGETS

Lines changed: 0 additions & 2 deletions
```diff
@@ -49,10 +49,8 @@ runtime.cxx_library(
 runtime.cxx_library(
     name = "runtime_shims",
     srcs = [
-        "guard.cpp",
     ],
     headers = [
-        "guard.h",
         "utils.h",
     ],
     # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
```

backends/cuda/runtime/aoti_delegate_handle.h

Lines changed: 5 additions & 5 deletions
```diff
@@ -8,9 +8,9 @@
 
 #pragma once
 
+#include <executorch/backends/cuda/runtime/slim/core/SlimTensor.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
-#include <executorch/backends/cuda/runtime/slim/core/SlimTensor.h>
 
 namespace executorch {
 namespace backends {
@@ -22,7 +22,7 @@ using executorch::runtime::etensor::Tensor;
 extern "C" {
 
 // Type definitions
-using AOTITensorHandle = standalone::slim::SlimTensor*;
+using AOTITensorHandle = executorch::backends::cuda::slim::SlimTensor*;
 using AOTIRuntimeError = Error;
 
 // Forward declarations for AOT Inductor model container
@@ -52,11 +52,11 @@ using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)(
 using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     AOTITensorHandle* input_handles, // array of input SlimTensor*; handles
-    // are stolen; the array itself is borrowed
+                                     // are stolen; the array itself is borrowed
     size_t num_inputs,
     AOTITensorHandle* output_handles, // array for writing SlimTensor*; handles
-    // will be stolen by the caller; the array itself
-    // is borrowed
+                                      // will be stolen by the caller; the array
+                                      // itself is borrowed
     size_t n_outputs,
     AOTInductorStreamHandle stream_handle,
     AOTIProxyExecutorHandle proxy_executor_handle);
```
Lines changed: 148 additions & 0 deletions
```cpp
#pragma once

#include <executorch/backends/cuda/runtime/c10/util/ArrayRef.h>
#include <executorch/backends/cuda/runtime/c10/util/irange.h>

#include <algorithm>
#include <cstdint>
#include <vector>

namespace executorch::backends::cuda::c10 {

template <typename T>
bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
  if (numel == 0) {
    return true;
  }

  T expected_stride = 1;
  // NB: make sure we do signed arithmetic
  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
    const auto& size_d = sizes[d];
    if (size_d == 1) {
      continue;
    }

    if (strides[d] != expected_stride) {
      return false;
    }
    expected_stride *= size_d;
  }
  return true;
}

// This function returns true if the tensor is contiguous, and false if it is
// not or if we can't determine whether it is contiguous due to unbacked
// symbols (it could be either in that case based on the actual runtime data).
template <typename T>
bool definitely_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
  if (numel == 0) {
    return true;
  }

  T expected_stride = 1;
  // NB: make sure we do signed arithmetic
  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
    const auto& size_d = sizes[d];
    if (size_d == 1) {
      continue;
    }

    if (strides[d] != expected_stride) {
      return false;
    }
    expected_stride *= size_d;
  }
  return true;
}

template <typename T>
bool _compute_channels_last_contiguous_2d(ArrayRef<T> sizes,
                                          ArrayRef<T> strides) {
  // Please don't combine these cases; the constant array is used here to let
  // the compiler fully unroll the loop to get better performance.
  switch (sizes.size()) {
    case 4: {
      T expected = 1;
      for (auto& d : {1, 3, 2, 0}) {
        const auto& size_d = sizes[d];
        if (size_d != 1) {
          if (strides[d] != expected) {
            return false;
          }
          expected *= size_d;
        }
      }
      return true;
    }
    // NOLINTNEXTLINE(bugprone-branch-clone)
    case 3:
      // TODO dim == 3 case will be enabled once it is fully tested
      return false;
    default:
      return false;
  }
}

template <typename T>
bool _compute_channels_last_contiguous_3d(ArrayRef<T> sizes,
                                          ArrayRef<T> strides) {
  // Please don't combine these cases; the constant array is used here to let
  // the compiler fully unroll the loop to get better performance.
  switch (sizes.size()) {
    case 5: {
      T expected = 1;
      for (auto& d : {1, 4, 3, 2, 0}) {
        const auto& size_d = sizes[d];
        if (size_d != 1) {
          if (strides[d] != expected) {
            return false;
          }
          expected *= size_d;
        }
      }
      return true;
    }
    // NOLINTNEXTLINE(bugprone-branch-clone)
    case 4:
      // TODO dim == 4 case will be enabled once it is fully tested
      return false;
    default:
      return false;
  }
}

template <typename T>
bool _compute_non_overlapping_and_dense(ArrayRef<T> sizes,
                                        ArrayRef<T> strides) {
  auto dim = sizes.size();
  if (dim == 1) {
    return sizes[0] < 2 || strides[0] == 1;
  }
  std::vector<int64_t> perm(dim);
  for (const auto i : irange(dim)) {
    perm[i] = i;
  }
  // Sort by strides, leaving 0 and 1 sized dims at the end of the array
  std::sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) {
    if (sizes[a] < 2) {
      return false;
    } else if (sizes[b] < 2) {
      return true;
    }
    return strides[a] < strides[b];
  });
  T require_stride = 1;
  for (const auto i : irange(dim)) {
    const auto& size_perm_i = sizes[perm[i]];
    if (size_perm_i < 2) {
      return true;
    }
    if (strides[perm[i]] != require_stride) {
      return false;
    }
    require_stride *= size_perm_i;
  }
  return true;
}

} // namespace executorch::backends::cuda::c10
```
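As a quick illustration of what the helpers above compute (not part of the commit), the following standalone snippet mirrors the `_compute_contiguous` stride check using plain `std::vector` instead of the internal `ArrayRef` type, and checks a row-major versus a column-major stride layout for a 2x3x4 tensor.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Standalone mirror of _compute_contiguous above, using std::vector so the
// example is self-contained and does not depend on the internal ArrayRef.
bool is_row_major_contiguous(const std::vector<int64_t>& sizes,
                             const std::vector<int64_t>& strides,
                             int64_t numel) {
  if (numel == 0) {
    return true;
  }
  int64_t expected_stride = 1;
  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
    if (sizes[d] == 1) {
      continue; // size-1 dims may carry arbitrary strides
    }
    if (strides[d] != expected_stride) {
      return false;
    }
    expected_stride *= sizes[d];
  }
  return true;
}

int main() {
  const std::vector<int64_t> sizes = {2, 3, 4};
  // Row-major strides for a 2x3x4 tensor: {3*4, 4, 1}.
  assert(is_row_major_contiguous(sizes, {12, 4, 1}, 24));
  // Column-major strides {1, 2, 6} are dense but not row-major contiguous.
  assert(!is_row_major_contiguous(sizes, {1, 2, 6}, 24));
  return 0;
}
```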
