Skip to content

Commit 1a61a03

Browse files
committed
Update base for Update on "[slimtensor] Add SlimTensor class with basic properties and CPU copy operation"
**Key components:** 1. **`c10/core/Contiguity.h`** - Contiguity checking utility: - `_compute_contiguous<T>()` - computes whether a tensor with given sizes/strides is contiguous in memory (row-major order) 2. **`core/SlimTensor.h`** - Main SlimTensor class with: - **Constructors**: Default (undefined tensor) and full constructor with storage, sizes, strides, dtype, and storage_offset - **Property accessors**: - `sizes()`, `size(dim)` - get tensor dimensions with negative indexing support - `strides()`, `stride(dim)` - get tensor strides with negative indexing support - `dtype()`, `device()`, `device_type()`, `device_index()` - `numel()`, `dim()`, `nbytes()`, `itemsize()` - `data_ptr()` - returns pointer to tensor data (adjusted for storage_offset) - `storage_offset()`, `storage()` - **State queries**: `defined()`, `is_cpu()`, `is_contiguous()`, `is_empty()` - **Copy operation**: `copy_(other)` - copies data from another tensor - Fast path: uses memcpy when both tensors are contiguous - Slow path: element-wise copy respecting strides for non-contiguous tensors - **Setters**: `reset()`, `set_storage()`, `set_sizes_and_strides()` **Current constraints:** - Only CPU device supported - Only Float32 dtype tested - copy_() only supports CPU-to-CPU copy These constraints will be relaxed in the following diffs. Differential Revision: [D89750150](https://our.internmc.facebook.com/intern/diff/D89750150/) [ghstack-poisoned]
2 parents d525c32 + 9a30bd3 commit 1a61a03

File tree

99 files changed

+5509
-1443
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+5509
-1443
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
7a064ed3eafa43f17412d434b395240c727b3000
1+
7a79b41e29a790ebb4b530eb98a89381e2d7de29

backends/aoti/common_shims.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
6464
AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided();
6565
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32();
6666
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16();
67+
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bool();
6768
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
6869
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
6970
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32();

backends/aoti/slim/c10/core/Device.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,27 +26,25 @@ using DeviceIndex = int8_t;
2626
/// Represents a compute device on which a tensor is located.
2727
/// A device is uniquely identified by a type (e.g., CPU) and a device index.
2828
struct Device final {
29-
using Type = DeviceType;
30-
3129
/// Constructs a new Device from a DeviceType and an optional device index.
3230
/// @param type The type of device.
3331
/// @param index The device index. For CPU, this should be -1 or 0.
3432
/* implicit */
35-
Device(DeviceType type, DeviceIndex index = -1) : type_(type), index_(index) {
33+
explicit Device(DeviceType type, DeviceIndex index = -1)
34+
: type_(type), index_(index) {
3635
validate();
3736
}
3837

3938
/// Constructs a Device from a string description.
4039
/// The string must be "cpu" or "cpu:0".
41-
/* implicit */ Device(const std::string& device_string) : Device(Type::CPU) {
40+
/* implicit */ Device(const std::string& device_string)
41+
: Device(DeviceType::CPU) {
4242
ET_CHECK_MSG(!device_string.empty(), "Device string must not be empty");
4343

4444
if (device_string == "cpu" || device_string == "CPU") {
4545
type_ = DeviceType::CPU;
4646
index_ = -1;
47-
} else if (
48-
device_string == "cpu:0" || device_string == "CPU:0" ||
49-
device_string == "cpu:1" || device_string == "CPU:1") {
47+
} else if (device_string == "cpu:0" || device_string == "CPU:0") {
5048
type_ = DeviceType::CPU;
5149
index_ = static_cast<DeviceIndex>(device_string.back() - '0');
5250
} else {

backends/aoti/slim/core/Storage.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ inline void noop(void*) {}
2727
} // namespace detail
2828

2929
/// Default CPU device constant.
30-
const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
30+
inline const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
3131

3232
/// DeviceTraits template for device-specific operations.
3333
/// Device-specific implementations provide allocate(), free(), and memcpy().

backends/aoti/slim/core/test/test_storage_cpu.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ TEST(StorageSharedPtrTest, SharedOwnership) {
190190
Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
191191
void* data_ptr = storage1->data();
192192

193-
Storage storage2 = storage1;
193+
const Storage& storage2 = storage1;
194194

195195
EXPECT_EQ(storage1.use_count(), 2);
196196
EXPECT_EQ(storage2.use_count(), 2);
@@ -208,7 +208,7 @@ TEST(StorageSharedPtrTest, SharedOwnershipModification) {
208208
data[i] = 0.0f;
209209
}
210210

211-
Storage storage2 = storage1;
211+
const Storage& storage2 = storage1;
212212

213213
float* data2 = static_cast<float*>(storage2->data());
214214
for (size_t i = 0; i < kNumFloats; ++i) {
@@ -226,10 +226,7 @@ TEST(StorageSharedPtrTest, ReferenceCountDecrement) {
226226
Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
227227
EXPECT_EQ(storage1.use_count(), 1);
228228

229-
{
230-
Storage storage2 = storage1;
231-
EXPECT_EQ(storage1.use_count(), 2);
232-
}
229+
{ EXPECT_EQ(storage1.use_count(), 2); }
233230

234231
EXPECT_EQ(storage1.use_count(), 1);
235232
}

backends/arm/MODELS.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# This file lists all models that have been confirmed to be functional and tested for the Arm backend:
2+
- Conformer
3+
- Deit Tiny
4+
- DeepLab v3 (DL3)
5+
- Inception v3 (IC3)
6+
- Llama
7+
- Long Short-Term Memory (LSTM)
8+
- MobileNet v2 (MV2)
9+
- MobileNet v3 (MV3)
10+
- Some popular torch.nn.functional models (NN functional)
11+
- Some popular torch.nn.modules models (NN modules)
12+
- Some popular torch ops (Torch Functions)
13+
- Neural Super Sampler (NSS)
14+
- ResNet 18
15+
- Wav2Letter (W2L)
16+
- Stable Diffusion:
17+
* CLIP Text Encoder (CLIP Text with Projection)
18+
* Stable Diffusion 3 Transformer (SD3 Transformer)
19+
* T5 Encoder
20+
* VAE Encoder/Decoder (VAE)

backends/arm/_passes/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@
113113
from .replace_scalar_with_tensor_pass import ( # noqa
114114
ReplaceScalarWithTensorByProfilePass,
115115
)
116+
from .rewrite_bool_to_fp32_cast_via_int8_pass import ( # noqa
117+
RewriteBoolToFp32CastViaInt8Pass,
118+
)
116119
from .rewrite_conv_pass import RewriteConvPass # noqa
117120
from .rewrite_matmul import RewriteMatmulPass # noqa
118121
from .rewrite_upsample import RewriteUpsamplePass # noqa

backends/arm/_passes/arm_pass.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from abc import abstractmethod
99
from typing import Any, List, Optional, Set, Type
1010

11+
from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
1112
from executorch.exir.pass_base import ExportPass, NodeMetadata
1213
from torch.fx import GraphModule
1314
from torch.fx.passes.infra.pass_base import PassResult
@@ -16,9 +17,23 @@
1617
class ArmPass(ExportPass):
1718
"""Base class for Arm passes"""
1819

19-
def __init__(self) -> None:
20-
super().__init__()
20+
def __init__(self, tfa_pass: bool = False, *args, **kwargs) -> None:
21+
super().__init__(*args, **kwargs)
2122
self.submodule_depth = 0
23+
self.is_tfa_pass = tfa_pass
24+
25+
def allowed_to_transform(self, meta: NodeMetadata | dict[str, Any]) -> bool:
26+
if not self.is_tfa_pass:
27+
return True
28+
29+
if isinstance(meta, NodeMetadata):
30+
meta_dict = meta.data
31+
else:
32+
meta_dict = meta
33+
34+
disallow_tfa = meta_dict.get(DISALLOW_TFA_META_KEY, False)
35+
36+
return not disallow_tfa
2237

2338
@property
2439
@abstractmethod

backends/arm/_passes/arm_pass_manager.py

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@
103103
RemoveNoopPass,
104104
ReplaceInfAndLimitValuesPass,
105105
ReplaceScalarWithTensorByProfilePass,
106+
RewriteBoolToFp32CastViaInt8Pass,
106107
RewriteConvPass,
107108
RewriteMatmulPass,
108109
RewriteUpsamplePass,
@@ -221,6 +222,7 @@ def _tosa_pipeline(
221222
self.add_passes(
222223
[
223224
FuseQuantizedActivationPass(),
225+
RewriteBoolToFp32CastViaInt8Pass(),
224226
ConvertToClampPass(),
225227
DecomposeTOSAUnsupportedClampPass(),
226228
DecomposeGroupNormPass(),
@@ -374,65 +376,65 @@ def transform_to_backend_pipeline(
374376

375377
def transform_for_annotation_pipeline(self, graph_module: GraphModule):
376378
# Preprocessing passes
377-
self.add_pass(RemoveGraphAssertsPass())
379+
self.add_pass(RemoveGraphAssertsPass(tfa_pass=True))
378380

379381
# Transformation passes (pre scalar -> tensor)
380382
self.add_passes(
381383
[
382-
DecomposeSelectScatterPass(),
383-
ConvertInt64ConstOpsToInt32Pass(),
384-
ConvertInt64OutputOpsToInt32Pass(),
385-
InsertInt32CastsAfterInt64PlaceholdersPass(),
386-
DecomposeEmbeddingPass(),
387-
DecomposeScaledDotProductAttentionPass(),
388-
DecomposeRoundPass(),
389-
DecomposeLogitPass(),
390-
PromoteBoolOperandsPass(),
391-
DecomposeSignPass(),
392-
DecomposeAddmmPass(),
393-
DecomposeRemainderPass(),
394-
DecomposeFloorDividePass(),
395-
DecomposeDivTensorModePass(),
384+
DecomposeSelectScatterPass(tfa_pass=True),
385+
ConvertInt64ConstOpsToInt32Pass(tfa_pass=True),
386+
ConvertInt64OutputOpsToInt32Pass(tfa_pass=True),
387+
InsertInt32CastsAfterInt64PlaceholdersPass(tfa_pass=True),
388+
DecomposeEmbeddingPass(tfa_pass=True),
389+
DecomposeScaledDotProductAttentionPass(tfa_pass=True),
390+
DecomposeRoundPass(tfa_pass=True),
391+
DecomposeLogitPass(tfa_pass=True),
392+
PromoteBoolOperandsPass(tfa_pass=True),
393+
DecomposeSignPass(tfa_pass=True),
394+
DecomposeAddmmPass(tfa_pass=True),
395+
DecomposeRemainderPass(tfa_pass=True),
396+
DecomposeFloorDividePass(tfa_pass=True),
397+
DecomposeDivTensorModePass(tfa_pass=True),
396398
]
397399
)
398400

399401
# Scalars -> tensors
400402
self.add_passes(
401403
[
402-
ReplaceScalarWithTensorByProfilePass(),
403-
ScalarsToAttributePass(),
404+
ReplaceScalarWithTensorByProfilePass(tfa_pass=True),
405+
ScalarsToAttributePass(tfa_pass=True),
404406
]
405407
)
406408

407409
# Transformation passes (post scalar removal)
408410
self.add_passes(
409411
[
410-
NormalizeWhileInitialArgsPass(use_exir_clone=False),
411-
DecomposeAddSubAlphaPass(),
412-
DecomposeGroupNormPass(),
413-
DecomposeLayerNormPass(),
414-
DecomposeVarPass(),
415-
DecomposeMeanDimPass(graph_module, self.tosa_spec),
416-
DecomposeNotEqualPass(),
417-
DecomposeCosineSimilarityPass(),
418-
DecomposeGluPass(),
419-
DecomposeDivPass(),
420-
DecomposeLeakyReLUPass(),
421-
DecomposeLinalgVectorNormPass(),
422-
DecomposeSqrtPass(),
423-
DecomposeSiluPass(),
424-
DecomposeAvgPool2dPass(),
425-
DecomposeSoftmaxUnstablePass(),
426-
DecomposeSoftmaxPass(),
427-
ConvertMinMaxPass(),
412+
NormalizeWhileInitialArgsPass(use_exir_clone=False, tfa_pass=True),
413+
DecomposeAddSubAlphaPass(tfa_pass=True),
414+
DecomposeGroupNormPass(tfa_pass=True),
415+
DecomposeLayerNormPass(tfa_pass=True),
416+
DecomposeVarPass(tfa_pass=True),
417+
DecomposeMeanDimPass(graph_module, self.tosa_spec, tfa_pass=True),
418+
DecomposeNotEqualPass(tfa_pass=True),
419+
DecomposeCosineSimilarityPass(tfa_pass=True),
420+
DecomposeGluPass(tfa_pass=True),
421+
DecomposeDivPass(tfa_pass=True),
422+
DecomposeLeakyReLUPass(tfa_pass=True),
423+
DecomposeLinalgVectorNormPass(tfa_pass=True),
424+
DecomposeSqrtPass(tfa_pass=True),
425+
DecomposeSiluPass(tfa_pass=True),
426+
DecomposeAvgPool2dPass(tfa_pass=True),
427+
DecomposeSoftmaxUnstablePass(tfa_pass=True),
428+
DecomposeSoftmaxPass(tfa_pass=True),
429+
ConvertMinMaxPass(tfa_pass=True),
428430
]
429431
)
430432

431433
# Postprocessing passes
432434
self.add_passes(
433435
[
434-
ReplaceInfAndLimitValuesPass(),
435-
DecomposeMaskedFillPass(),
436+
ReplaceInfAndLimitValuesPass(tfa_pass=True),
437+
DecomposeMaskedFillPass(tfa_pass=True),
436438
]
437439
)
438440

backends/arm/_passes/cast_int64_pass.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ class CastInt64BuffersToInt32Pass(ArmPass):
2323

2424
_passes_required_after: Set[Type[ExportPass]] = set()
2525

26-
def __init__(self, exported_program: ExportedProgram):
27-
super().__init__()
26+
def __init__(self, exported_program: ExportedProgram, *args, **kwargs):
27+
super().__init__(*args, **kwargs)
2828
self.exported_program = exported_program
2929

3030
def _assert_within_int32(self, tensor: torch.Tensor, node: torch.fx.Node):

0 commit comments

Comments
 (0)