@@ -10,8 +10,6 @@
 from typing import Tuple
 
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from torch.testing._internal import common_utils
 from torch.testing._internal.common_utils import (
     TestCase,
@@ -28,6 +26,7 @@
 )
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.quantization.utils import compute_error
+from torchao.testing.model_architectures import Experts
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     _is_fbgemm_genai_gpu_available,
@@ -39,66 +38,6 @@
 torch._dynamo.config.cache_size_limit = 128
 
 
-class Experts(nn.Module):
-    def __init__(
-        self,
-        num_local_experts: int,
-        dim: int,
-        hidden_dim: int,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> None:
-        super().__init__()
-
-        self.num_local_experts = num_local_experts
-        self.dim = dim
-
-        self.w1: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                dim,
-                hidden_dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-        self.w2: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                hidden_dim,
-                dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-        self.w3: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                dim,
-                hidden_dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-    def forward(
-        self,
-        routed_in_egD: torch.Tensor,  # noqa: N803
-    ) -> torch.Tensor:
-        e = self.num_local_experts
-        D = self.dim
-
-        x_egD = routed_in_egD.view(e, -1, D)
-
-        middle_out_egF = F.silu(torch.bmm(x_egD, self.w1)) * torch.bmm(x_egD, self.w3)
-        out_egD = torch.bmm(middle_out_egF, self.w2)
-        out_egD = out_egD.view(-1, D)
-
-        return out_egD
-
-
 class ToyLinearModel(torch.nn.Module):
     def __init__(self, in_features, out_features):
         super().__init__()
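The local Experts definition is replaced by the shared copy in torchao.testing.model_architectures. Below is a minimal usage sketch, assuming the relocated class keeps the constructor and forward signature of the removed code; the sizes are illustrative only.

import torch

from torchao.testing.model_architectures import Experts

# Grouped-expert MLP: each of the e experts applies a SwiGLU-style
# feed-forward, (silu(x @ w1) * (x @ w3)) @ w2, via batched matmuls.
experts = Experts(
    num_local_experts=4,
    dim=16,
    hidden_dim=32,
    dtype=torch.float32,
    device=torch.device("cpu"),
)

# forward expects tokens already routed and grouped per expert: (e * g, D).
routed_in = torch.randn(4 * 8, 16)
routed_out = experts(routed_in)  # same layout back: (e * g, D) == (32, 16)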