1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(common)
add_subdirectory(handlers)
add_subdirectory(kernels)
add_subdirectory(tokenizer)
+add_subdirectory(module)
add_subdirectory(layers)
add_subdirectory(quantization)
add_subdirectory(models)
19 changes: 11 additions & 8 deletions src/layers/CMakeLists.txt
@@ -3,7 +3,7 @@ include(cc_test)

cc_library(
NAME
-linear
+linear
HDRS
linear.h
qkv_linear.h
@@ -21,35 +21,37 @@ cc_library(
:model_parallel
:quantization
:kernels
+:module
glog::glog
gflags::gflags
torch
)

cc_library(
-NAME
+NAME
pos_embedding
-HDRS
+HDRS
pos_embedding.h
-SRCS
+SRCS
pos_embedding.cpp
DEPS
:state_dict
:memory
:kernels
+:module
glog::glog
gflags::gflags
torch
)

cc_library(
-NAME
+NAME
layers
-HDRS
+HDRS
normalization.h
embedding.h
activation.h
-SRCS
+SRCS
activation.cpp
DEPS
:state_dict
@@ -58,6 +60,7 @@ cc_library(
:pos_embedding
:attention
:kernels
+:module
glog::glog
gflags::gflags
torch
@@ -80,4 +83,4 @@ cc_test(
)

add_subdirectory(attention)
-add_subdirectory(moe)
+add_subdirectory(moe)
6 changes: 4 additions & 2 deletions src/layers/attention/attention.h
@@ -5,10 +5,12 @@
#include "layers/attention/handler.h"
#include "memory/kv_cache.h"
#include "models/parameters.h"
#include "module/module.h"
#include "module/module_holder.h"

namespace llm {

-class AttentionImpl : public torch::nn::Module {
+class AttentionImpl : public llm::nn::Module {
public:
AttentionImpl(int64_t n_heads,
int64_t n_kv_heads,
@@ -38,6 +40,6 @@ class AttentionImpl : public torch::nn::Module {
// sliding window for self-attention, -1 means no sliding window
int32_t sliding_window_ = -1;
};
-TORCH_MODULE(Attention);
+LLM_MODULE(Attention);

} // namespace llm
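Note on this change: the PR replaces the libtorch base class (torch::nn::Module plus the TORCH_MODULE holder macro) with an in-tree equivalent (llm::nn::Module plus LLM_MODULE) provided by the new src/module target, and the same substitution repeats in every header below. The module headers themselves are not part of the hunks shown here, so the following is only a minimal sketch of what LLM_MODULE presumably expands to, modeled on libtorch's TORCH_MODULE macro and on the hand-written holders in linear.h further down; the macro body is an assumption, not code taken from this PR.

// Hypothetical expansion of LLM_MODULE; the real definition lives in
// src/module/module_holder.h, which is not shown in this diff.
#define LLM_MODULE_IMPL(Name, ImplType)                    \
  class Name : public llm::nn::ModuleHolder<ImplType> {    \
   public:                                                 \
    using llm::nn::ModuleHolder<ImplType>::ModuleHolder;   \
    using Impl __attribute__((__unused__)) = ImplType;     \
  }

// LLM_MODULE(Attention) would then declare Attention as a value-semantic
// handle that owns a shared AttentionImpl, just as TORCH_MODULE(Attention) did.
#define LLM_MODULE(Name) LLM_MODULE_IMPL(Name, Name##Impl)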
17 changes: 9 additions & 8 deletions src/layers/embedding.h
@@ -7,14 +7,16 @@

#include "model_loader/state_dict.h"
#include "model_parallel/model_parallel.h"
#include "module/module.h"
#include "module/module_holder.h"

namespace llm {

// A simple lookup table that stores embeddings of a fixed dictionary and size.
// This module is often used to store word embeddings and retrieve them using
// indices.

-class EmbeddingImpl : public torch::nn::Module {
+class EmbeddingImpl : public llm::nn::Module {
public:
EmbeddingImpl(int64_t num_embeddings,
int64_t embedding_dim,
@@ -63,10 +65,10 @@ class EmbeddingImpl : public torch::nn::Module {
// whether the weight is loaded
bool is_loaded_ = false;
};
-TORCH_MODULE(Embedding);
+LLM_MODULE(Embedding);

// Embedding parallelized in the embedding dimension.
-class ParallelEmbeddingImpl : public torch::nn::Module {
+class ParallelEmbeddingImpl : public llm::nn::Module {
public:
ParallelEmbeddingImpl(int64_t num_embeddings,
int64_t embedding_dim,
@@ -134,10 +136,10 @@ class ParallelEmbeddingImpl : public torch::nn::Module {
// parallel args
ParallelArgs parallel_args_;
};
-TORCH_MODULE(ParallelEmbedding);
+LLM_MODULE(ParallelEmbedding);

// Embedding parallelized in the vocabulary dimension
-class VocabParallelEmbeddingImpl : public torch::nn::Module {
+class VocabParallelEmbeddingImpl : public llm::nn::Module {
public:
VocabParallelEmbeddingImpl(int64_t num_embeddings,
int64_t embedding_dim,
@@ -152,8 +154,7 @@ class VocabParallelEmbeddingImpl : public torch::nn::Module {
// register the weight parameter
weight_ = register_parameter(
"weight",
-torch::empty({num_embeddings_per_partition, embedding_dim},
-options),
+torch::empty({num_embeddings_per_partition, embedding_dim}, options),
/*requires_grad=*/false);
}

@@ -218,5 +219,5 @@ class VocabParallelEmbeddingImpl : public torch::nn::Module {
int64_t start_index_ = 0;
int64_t end_index_ = 0;
};
-TORCH_MODULE(VocabParallelEmbedding);
+LLM_MODULE(VocabParallelEmbedding);
} // namespace llm
6 changes: 4 additions & 2 deletions src/layers/fused_linear.h
@@ -6,11 +6,13 @@
#include "linear.h"
#include "model_loader/state_dict.h"
#include "model_parallel/parallel_args.h"
#include "module/module.h"
#include "module/module_holder.h"
#include "quantization/quant_args.h"

namespace llm {

-class FusedColumnParallelLinearImpl : public torch::nn::Module {
+class FusedColumnParallelLinearImpl : public llm::nn::Module {
public:
FusedColumnParallelLinearImpl(int64_t in_features,
const std::vector<int64_t>& out_features,
@@ -44,6 +46,6 @@ class FusedColumnParallelLinearImpl : public torch::nn::Module {
// whether the linear layer is fused
bool fused_ = false;
};
-TORCH_MODULE(FusedColumnParallelLinear);
+LLM_MODULE(FusedColumnParallelLinear);

} // namespace llm
13 changes: 7 additions & 6 deletions src/layers/linear.h
@@ -5,6 +5,8 @@

#include "model_loader/state_dict.h"
#include "model_parallel/parallel_args.h"
#include "module/module.h"
#include "module/module_holder.h"
#include "quantization/quant_args.h"

namespace llm {
@@ -14,7 +16,7 @@ using TensorTransform = std::function<torch::Tensor(const torch::Tensor&)>;
// an interface for parallel linear layer.
// all linear classes should inherit from this class and implement the forward
// function.
-class ParallelLinearImpl : public torch::nn::Module {
+class ParallelLinearImpl : public llm::nn::Module {
public:
~ParallelLinearImpl() override = default;

@@ -37,10 +39,9 @@ class ParallelLinearImpl : public torch::nn::Module {
}
};

-class ColumnParallelLinear
-: public torch::nn::ModuleHolder<ParallelLinearImpl> {
+class ColumnParallelLinear : public llm::nn::ModuleHolder<ParallelLinearImpl> {
public:
-using torch::nn::ModuleHolder<ParallelLinearImpl>::ModuleHolder;
+using llm::nn::ModuleHolder<ParallelLinearImpl>::ModuleHolder;
using Impl __attribute__((__unused__)) = ParallelLinearImpl;

// construct a column parallel linear layer.
@@ -61,9 +62,9 @@
const torch::TensorOptions& options);
};

-class RowParallelLinear : public torch::nn::ModuleHolder<ParallelLinearImpl> {
+class RowParallelLinear : public llm::nn::ModuleHolder<ParallelLinearImpl> {
public:
-using torch::nn::ModuleHolder<ParallelLinearImpl>::ModuleHolder;
+using llm::nn::ModuleHolder<ParallelLinearImpl>::ModuleHolder;
using Impl __attribute__((__unused__)) = ParallelLinearImpl;

// construct a row parallel linear layer.
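Unlike the layers declared with LLM_MODULE, ColumnParallelLinear and RowParallelLinear keep hand-written handle classes so that one handle type can own any ParallelLinearImpl subclass; the constructor declarations in the hunk above, together with the quantization/quant_args.h include, suggest the concrete implementation is chosen at construction time. Below is a minimal sketch of what llm::nn::ModuleHolder would need to provide for these handles to compile and behave like their torch::nn counterparts; the real class lives in src/module/module_holder.h, which is not shown in this diff, so treat every member here as an assumption.

// Hypothetical core of llm::nn::ModuleHolder, mirroring torch::nn::ModuleHolder.
#include <memory>
#include <utility>

namespace llm::nn {

template <typename Impl>
class ModuleHolder {
 public:
  // Wrap an already-constructed implementation.
  explicit ModuleHolder(std::shared_ptr<Impl> impl) : impl_(std::move(impl)) {}

  // Forward constructor arguments to the implementation type.
  template <typename... Args>
  explicit ModuleHolder(Args&&... args)
      : impl_(std::make_shared<Impl>(std::forward<Args>(args)...)) {}

  // Arrow access to the implementation object.
  Impl* operator->() { return impl_.get(); }
  const Impl* operator->() const { return impl_.get(); }

  // Calling the handle forwards to the implementation's forward().
  template <typename... Args>
  auto operator()(Args&&... args) {
    return impl_->forward(std::forward<Args>(args)...);
  }

 private:
  std::shared_ptr<Impl> impl_;
};

}  // namespace llm::nn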
18 changes: 10 additions & 8 deletions src/layers/normalization.h
@@ -7,6 +7,8 @@

#include "kernels/layernorm_kernels.h"
#include "model_loader/state_dict.h"
#include "module/module.h"
#include "module/module_holder.h"

DECLARE_bool(disable_custom_kernels);
namespace llm {
@@ -63,7 +65,7 @@ inline torch::Tensor layer_norm(torch::Tensor input,
// apply layer normalization over a mini-batch of inputs as described in
// the paper `Layer Normalization`: https://arxiv.org/abs/1607.06450
// x = ((x - mean(x)) / sqrt(std(x) + eps)) * weight + bias
-class LayerNormImpl : public torch::nn::Module {
+class LayerNormImpl : public llm::nn::Module {
public:
// dim: the dim over which the mean and std are calculated separately.
// eps: a value added to the denominator for numerical stability.
@@ -140,10 +142,10 @@ class LayerNormImpl : public torch::nn::Module {
float eps_;
std::vector<int64_t> normalized_shape_;
};
-TORCH_MODULE(LayerNorm);
+LLM_MODULE(LayerNorm);

// Root mean square normalization
-class RMSNormImpl : public torch::nn::Module {
+class RMSNormImpl : public llm::nn::Module {
public:
RMSNormImpl(int64_t dim, float eps, const torch::TensorOptions& options)
: eps_(eps) {
@@ -191,9 +193,9 @@ class RMSNormImpl : public torch::nn::Module {
// configs
float eps_;
};
-TORCH_MODULE(RMSNorm);
+LLM_MODULE(RMSNorm);

-class GemmaRMSNormImpl : public torch::nn::Module {
+class GemmaRMSNormImpl : public llm::nn::Module {
public:
GemmaRMSNormImpl(int64_t dim, float eps, const torch::TensorOptions& options)
: eps_(eps) {
@@ -241,10 +243,10 @@ class GemmaRMSNormImpl : public torch::nn::Module {
// configs
float eps_;
};
-TORCH_MODULE(GemmaRMSNorm);
+LLM_MODULE(GemmaRMSNorm);

// Root mean square normalization
-class RMSNormResidualImpl : public torch::nn::Module {
+class RMSNormResidualImpl : public llm::nn::Module {
public:
RMSNormResidualImpl(int64_t dim,
float eps,
@@ -304,6 +306,6 @@ class RMSNormResidualImpl : public torch::nn::Module {
// configs
float eps_;
};
-TORCH_MODULE(RMSNormResidual);
+LLM_MODULE(RMSNormResidual);

} // namespace llm
4 changes: 2 additions & 2 deletions src/layers/qkv_linear.h
@@ -12,7 +12,7 @@ namespace llm {

// a thin wrapper to handle state_dict loading for QKV with
// support of MQA/GQA
-class QKVColumnParallelLinearImpl : public torch::nn::Module {
+class QKVColumnParallelLinearImpl : public llm::nn::Module {
public:
QKVColumnParallelLinearImpl(int64_t hidden_size,
int64_t n_heads,
@@ -47,6 +47,6 @@ class QKVColumnParallelLinearImpl : public torch::nn::Module {

int64_t head_dim_ = 0;
};
-TORCH_MODULE(QKVColumnParallelLinear);
+LLM_MODULE(QKVColumnParallelLinear);

} // namespace llm
4 changes: 2 additions & 2 deletions src/models/CMakeLists.txt
@@ -2,7 +2,7 @@ include(cc_library)
include(cc_test)

cc_library(
-NAME
+NAME
models
HDRS
model_args.h
@@ -18,7 +18,7 @@ cc_library(
:quantization
:memory
:chat_template
+:module
glog::glog
torch
)
