diff --git a/include/infiniop.h b/include/infiniop.h index 0acad83f9..4a0773b32 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -4,21 +4,41 @@ #include "infiniop/handle.h" #include "infiniop/ops/add.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/averagepool.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/cross_entropy_loss.h" #include "infiniop/ops/dequantize.h" +#include "infiniop/ops/div.h" +#include "infiniop/ops/exp.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/interpolate_nearest.h" +#include "infiniop/ops/logical_and.h" +#include "infiniop/ops/logical_or.h" +#include "infiniop/ops/maxpool.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" +#include "infiniop/ops/reduce_max.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/sin.h" #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" +#include "infiniop/ops/where.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/index_copy_inplace.h" +#include "infiniop/ops/gather.h" +#include "infiniop/ops/scatter.h" +#include "infiniop/ops/batch_norm.h" +#include "infiniop/ops/equal.h" +#include "infiniop/ops/reduce_mean.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h new file mode 100644 index 000000000..87e857175 --- /dev/null +++ b/include/infiniop/ops/averagepool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_AVERAGEPOOL_H__ +#define __INFINIOP_AVERAGEPOOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); + +#endif // __INFINIOP_AVERAGEPOOL_H__ diff --git a/include/infiniop/ops/batch_norm.h b/include/infiniop/ops/batch_norm.h new file mode 100644 index 000000000..5487a1f69 --- /dev/null +++ b/include/infiniop/ops/batch_norm.h @@ -0,0 +1,37 @@ +#ifndef __INFINIOP_BATCH_NORM_API_H__ +#define __INFINIOP_BATCH_NORM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBatchNormDescriptor_t; + +__C __export infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +); + +__C __export infiniStatus_t
infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopBatchNorm(infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h new file mode 100644 index 000000000..8b59843c9 --- /dev/null +++ b/include/infiniop/ops/cross_entropy_loss.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ +#define __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t; + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc); + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream); + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); + +#endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..e539b440c --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..36a81984a --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__C __export infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +); + +__C __export infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +); + +__C __export infiniStatus_t 
infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h new file mode 100644 index 000000000..624bc5363 --- /dev/null +++ b/include/infiniop/ops/exp.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_EXP_API_H__ +#define __INFINIOP_EXP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; + +__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gather.h b/include/infiniop/ops/gather.h new file mode 100644 index 000000000..9ffe310c9 --- /dev/null +++ b/include/infiniop/ops/gather.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_GATHER_API_H__ +#define __INFINIOP_GATHER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..8d655fe82 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/index_copy_inplace.h b/include/infiniop/ops/index_copy_inplace.h new file mode 100644 index 000000000..e2266299a --- /dev/null +++ b/include/infiniop/ops/index_copy_inplace.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +__C __export infiniStatus_t 
infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopIndexCopyInplace(infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h new file mode 100644 index 000000000..7f970dc38 --- /dev/null +++ b/include/infiniop/ops/interpolate_nearest.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_INTERPOLATE_NEAREST_H__ +#define __INFINIOP_INTERPOLATE_NEAREST_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t; + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc); + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); + +#endif // __INFINIOP_INTERPOLATE_NEAREST_H__ diff --git a/include/infiniop/ops/layer_norm.h b/include/infiniop/ops/layer_norm.h new file mode 100644 index 000000000..e1d745723 --- /dev/null +++ b/include/infiniop/ops/layer_norm.h @@ -0,0 +1,36 @@ +#ifndef __INFINIOP_LAYER_NORM_API_H__ +#define __INFINIOP_LAYER_NORM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLayerNormDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLayerNormDescriptor( + infiniopHandle_t handle, + infiniopLayerNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +); + +__C __export infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLayerNorm(infiniopLayerNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_and.h b/include/infiniop/ops/logical_and.h new file mode 100644 index 000000000..5c237f79c --- /dev/null +++ b/include/infiniop/ops/logical_and.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_AND_API_H__ +#define __INFINIOP_LOGICAL_AND_API_H__ + +#include "../operator_descriptor.h" + 
+typedef struct InfiniopDescriptor *infiniopLogicalAndDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalAndDescriptor(infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalAnd(infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_or.h b/include/infiniop/ops/logical_or.h new file mode 100644 index 000000000..1c0066139 --- /dev/null +++ b/include/infiniop/ops/logical_or.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_OR_API_H__ +#define __INFINIOP_LOGICAL_OR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalOrDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalOrDescriptor(infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalOr(infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h new file mode 100644 index 000000000..e47a43aed --- /dev/null +++ b/include/infiniop/ops/maxpool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_MAX_POOL_H__ +#define __INFINIOP_MAX_POOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); + +#endif // __INFINIOP_MAX_POOL_H__ diff --git a/include/infiniop/ops/reduce_max.h b/include/infiniop/ops/reduce_max.h new file mode 100644 index 000000000..42a3dd62d --- /dev/null +++ b/include/infiniop/ops/reduce_max.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MAX_API_H__ +#define __INFINIOP_REDUCE_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size); + +__C 
__export infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reduce_mean.h b/include/infiniop/ops/reduce_mean.h new file mode 100644 index 000000000..5efd8b227 --- /dev/null +++ b/include/infiniop/ops/reduce_mean.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MEAN_API_H__ +#define __INFINIOP_REDUCE_MEAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h new file mode 100644 index 000000000..22e0eff83 --- /dev/null +++ b/include/infiniop/ops/scatter.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_SCATTER_API_H__ +#define __INFINIOP_SCATTER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; + +__C __export infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..640deccc0 --- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..742dba860 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define 
__INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..713db102f --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index 5348c8c69..de4cb268a 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -25,6 +25,11 @@ def run_tests(args): "sub.py", "swiglu.py", "softplus.py", + "where.py", + "hardswish.py", + "tanh.py", + "sin.py", + "exp.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..ef6ba8c39 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -15,7 +15,25 @@ DECLARE_INFINIOP_TEST(swiglu) DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) +DECLARE_INFINIOP_TEST(div) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(where) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(interpolate_nearest) +DECLARE_INFINIOP_TEST(maxpool) +DECLARE_INFINIOP_TEST(tanh) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(averagepool) +DECLARE_INFINIOP_TEST(cross_entropy_loss) +DECLARE_INFINIOP_TEST(logical_and) +DECLARE_INFINIOP_TEST(logical_or) +DECLARE_INFINIOP_TEST(layer_norm) +DECLARE_INFINIOP_TEST(index_copy_inplace) +DECLARE_INFINIOP_TEST(gather) +DECLARE_INFINIOP_TEST(scatter) +DECLARE_INFINIOP_TEST(batch_norm) +DECLARE_INFINIOP_TEST(equal) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +48,32 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - 
REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(averagepool) \ + REGISTER_INFINIOP_TEST(cross_entropy_loss) \ + REGISTER_INFINIOP_TEST(maxpool) \ + REGISTER_INFINIOP_TEST(interpolate_nearest) \ + REGISTER_INFINIOP_TEST(logical_and) \ + REGISTER_INFINIOP_TEST(logical_or) \ + REGISTER_INFINIOP_TEST(layer_norm) \ + REGISTER_INFINIOP_TEST(index_copy_inplace) \ + REGISTER_INFINIOP_TEST(gather) \ + REGISTER_INFINIOP_TEST(scatter) \ + REGISTER_INFINIOP_TEST(batch_norm) \ + REGISTER_INFINIOP_TEST(equal) \ + REGISTER_INFINIOP_TEST(div) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/averagepool.cpp b/src/infiniop-test/src/ops/averagepool.cpp new file mode 100644 index 000000000..4f6a80201 --- /dev/null +++ b/src/infiniop-test/src/ops/averagepool.cpp @@ -0,0 +1,265 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::averagepool { + +struct Test::Attributes { + // Input and expected output + std::shared_ptr input; + std::shared_ptr expected_output; + + // Average pooling parameters + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // (N, C, spatial...) 
→ number of pooled dims = rank - 2 + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error( + "Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // ---- Parse and broadcast kernel_size ---- + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; ++i) { + test->_attributes->kernel_size.push_back( + static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign( + pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // ---- Parse and broadcast stride ---- + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; ++i) { + test->_attributes->stride.push_back( + static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign( + pool_ndim, static_cast(stride_ptr[0])); + } + + // ---- Parse and broadcast padding ---- + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; ++i) { + test->_attributes->padding.push_back( + static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign( + pool_ndim, static_cast(padding_ptr[0])); + } + + // ---- Parse ceil_mode ---- + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopAvgPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + auto output_shape = expected_output->shape(); + + size_t output_size_bytes = 1; + for (auto d : output_shape) { + output_size_bytes *= d; + } + output_size_bytes *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size_bytes, device, device_id); + + std::vector output_strides(output_shape.size()); + if (!output_shape.empty()) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; --i) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // Parameter pointers (the underlying API takes void*) + void *kernel_size_ptr = 
_attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // ---- Create the operator descriptor ---- + CHECK_OR(infiniopCreateAvgPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to create avgpool descriptor.")); + + // ---- Query the workspace size ---- + size_t workspace_size = 0; + CHECK_OR(infiniopGetAvgPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to get workspace size.")); + + // ---- Allocate workspace (if needed) ---- + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to allocate workspace.")); + } + + // ---- Run average pooling ---- + CHECK_OR(infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, + "Failed during avgpool execution.")); + + // ---- Check accuracy ---- + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // ---- Benchmark ---- + double elapsed_time = benchmark( + [=]() { + infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // ---- Clean up resources ---- + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]\n- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]\n- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]\n- ceil_mode: " + << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::averagepool diff --git a/src/infiniop-test/src/ops/batch_norm.cpp b/src/infiniop-test/src/ops/batch_norm.cpp new file mode 100644 index 000000000..a44fee0f0 --- /dev/null +++ b/src/infiniop-test/src/ops/batch_norm.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::batch_norm { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr running_mean; + std::shared_ptr running_var; + std::shared_ptr input; + std::shared_ptr weight; + std::shared_ptr bias; + float momentum; + float eps; + std::shared_ptr ans_output; + std::shared_ptr ans_running_mean; + std::shared_ptr ans_running_var; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("running_mean") == tensors.end() + || tensors.find("running_var") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("weight") == tensors.end() + || tensors.find("bias") == tensors.end() + || tensors.find("ans_output") == tensors.end() + || tensors.find("ans_running_mean") == tensors.end() + || tensors.find("ans_running_var") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->running_mean = tensors["running_mean"]; + test->_attributes->running_var = tensors["running_var"]; + test->_attributes->input = tensors["input"]; + test->_attributes->weight = tensors["weight"]; + test->_attributes->bias = tensors["bias"]; + test->_attributes->ans_output = tensors["ans_output"]; + test->_attributes->ans_running_mean = tensors["ans_running_mean"]; + test->_attributes->ans_running_var = tensors["ans_running_var"]; + test->_attributes->momentum = *reinterpret_cast(attributes["momentum"].data()); + test->_attributes->eps = *reinterpret_cast(attributes["eps"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopBatchNormDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto running_mean = _attributes->running_mean->to(device, device_id); + auto running_var = _attributes->running_var->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto weight = _attributes->weight->to(device, device_id); + auto bias = _attributes->bias->to(device, device_id); + auto momentum = _attributes->momentum; + auto eps = _attributes->eps; + CHECK_OR(infiniopCreateBatchNormDescriptor(handle, &op_desc, + output->desc(), + running_mean->desc(), + running_var->desc(), + input->desc(), + weight->desc(), + bias->desc(), + momentum, + eps + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetBatchNormWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + 
CHECK_OR(infiniopBatchNorm(op_desc, workspace, workspace_size, + output->data(), + running_mean->data(), + running_var->data(), + input->data(), + weight->data(), + bias->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans_output, _rtol, _atol); + allClose(running_mean, _attributes->ans_running_mean, _rtol, _atol); + allClose(running_var, _attributes->ans_running_var, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopBatchNorm( + op_desc, workspace, workspace_size, + output->data(), + running_mean->data(), + running_var->data(), + input->data(), + weight->data(), + bias->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"momentum", "eps"}; +} + +std::vector Test::tensor_names() { + return {"output", "running_mean", "running_var", "input", "weight", "bias", "ans_output", "ans_running_mean", "ans_running_var"}; +} + +std::vector Test::output_names() { + return {"output", "running_mean", "running_var"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- running_mean: " << _attributes->running_mean->info() << std::endl; + oss << "- running_var: " << _attributes->running_var->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- weight: " << _attributes->weight->info() << std::endl; + oss << "- bias: " << _attributes->bias->info() << std::endl; + oss << "- momentum: " << _attributes->momentum << std::endl; + oss << "- eps: " << _attributes->eps << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::batch_norm diff --git a/src/infiniop-test/src/ops/cross_entropy_loss.cpp b/src/infiniop-test/src/ops/cross_entropy_loss.cpp new file mode 100644 index 000000000..7fac231e0 --- /dev/null +++ b/src/infiniop-test/src/ops/cross_entropy_loss.cpp @@ -0,0 +1,156 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cross_entropy_loss { + +struct Test::Attributes { + // Input tensors + std::shared_ptr logits; + std::shared_ptr target; + std::shared_ptr loss; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + // Check that all required tensors are present + if (!check_names(tensors, Test::tensor_names()) || !check_names(attributes, Test::attribute_names())) { + throw std::runtime_error("Invalid Test: Missing required tensors."); + } + + test->_attributes->logits = tensors["logits"]; + test->_attributes->target = tensors["target"]; + test->_attributes->loss = tensors["loss"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCrossEntropyLossDescriptor_t op_desc; + + // Move input tensors to the target device + auto logits = _attributes->logits->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto loss = _attributes->loss; + + // Create the actual output tensor from the expected output's shape + 
auto output_shape = loss->shape(); + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(logits->ggml_type()); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(static_cast(output_shape.size())); + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, logits->ggml_type()); + + // 1. Create the operator descriptor + CHECK_OR(infiniopCreateCrossEntropyLossDescriptor( + handle, &op_desc, + actual_output->desc(), + logits->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cross entropy loss descriptor.")); + + // 2. Query and allocate the workspace + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 3. Run the computation + CHECK_OR(infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr), // stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during cross entropy loss execution.")); + + // 4. Verify the result + try { + allClose(actual_output, loss, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 5. Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr); // stream + }, + warm_ups, iterations); + + // 6. 
Clean up resources + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +// List of attribute names required by this operator +std::vector Test::attribute_names() { + return {}; // CrossEntropyLoss has no extra attributes +} + +// List of tensor names required by this operator +std::vector Test::tensor_names() { + return {"logits", "target", "loss"}; +} + +std::vector Test::output_names() { + return {}; +} + +// Helper for printing test information +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- logits: " << _attributes->logits->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- loss: " << _attributes->loss->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cross_entropy_loss diff --git a/src/infiniop-test/src/ops/div.cpp b/src/infiniop-test/src/ops/div.cpp new file mode 100644 index 000000000..c1f49bda6 --- /dev/null +++ b/src/infiniop-test/src/ops/div.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::div { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopDivDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateDivDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetDivWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopDiv(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopDiv( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return 
{"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::div diff --git a/src/infiniop-test/src/ops/equal.cpp b/src/infiniop-test/src/ops/equal.cpp new file mode 100644 index 000000000..25bad7014 --- /dev/null +++ b/src/infiniop-test/src/ops/equal.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + 
+Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::equal diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp new file mode 100644 index 000000000..070f8ef6b --- /dev/null +++ b/src/infiniop-test/src/ops/exp.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::exp { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopExpDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopExp(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopExp( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/gather.cpp b/src/infiniop-test/src/ops/gather.cpp new file mode 100644 index 000000000..e1b998fe9 --- /dev/null +++ b/src/infiniop-test/src/ops/gather.cpp 
@@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gather { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGatherDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateGatherDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGatherWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGather(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGather( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gather diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..0ccf4f52a --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" 
+#include +#include +#include + +namespace infiniop_test::hardswish { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopHardswishDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopHardswish( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/index_copy_inplace.cpp b/src/infiniop-test/src/ops/index_copy_inplace.cpp new file mode 100644 index 000000000..9f5fb5be9 --- /dev/null +++ b/src/infiniop-test/src/ops/index_copy_inplace.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::index_copy_inplace { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + 
size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopIndexCopyInplaceDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateIndexCopyInplaceDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetIndexCopyInplaceWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopIndexCopyInplace(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopIndexCopyInplace( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::index_copy_inplace diff --git a/src/infiniop-test/src/ops/interpolate_nearest.cpp b/src/infiniop-test/src/ops/interpolate_nearest.cpp new file mode 100644 index 000000000..071527249 --- /dev/null +++ b/src/infiniop-test/src/ops/interpolate_nearest.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::interpolate_nearest { + +struct Test::Attributes { + std::shared_ptr input; + 
std::shared_ptr expected_output; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + std::cout << "DEBUG: Name check failed" << std::endl; + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; // F32 输入数据 + test->_attributes->expected_output = tensors["output"]; // F64 期望结果 + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopInterpolateNearestDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; // F64 期望结果 + + // 动态创建实际的输出张量,使用期望结果的形状,但使用输入的数据类型 + auto output_shape = expected_output->shape(); + auto input_dtype = input->ggml_type(); + + // 创建输出张量的内存 + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + // 计算连续的步长 + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // Create operator descriptor + CHECK_OR(infiniopCreateInterpolateNearestDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + // Get workspace size + size_t workspace_size; + CHECK_OR(infiniopGetInterpolateNearestWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // Allocate workspace if needed + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // Execute interpolate nearest + CHECK_OR(infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // Verify result - 比较实际输出和期望结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // Cleanup + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- 
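// Worked example of the contiguous-stride computation in Test::run above (the same
// pattern reappears in the maxpool test): for output_shape = {2, 3, 4, 5} the loop
// fills output_strides back to front as {60, 20, 5, 1}, i.e. stride[i] =
// stride[i + 1] * shape[i + 1] with the innermost stride fixed at 1.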
input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::interpolate_nearest diff --git a/src/infiniop-test/src/ops/layer_norm.cpp b/src/infiniop-test/src/ops/layer_norm.cpp new file mode 100644 index 000000000..b2dfa3e1b --- /dev/null +++ b/src/infiniop-test/src/ops/layer_norm.cpp @@ -0,0 +1,147 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::layer_norm { +struct Test::Attributes { + bool bias_exist; + std::shared_ptr output; + std::shared_ptr input_standardization; + std::shared_ptr input_std_deviation; + std::shared_ptr input; + std::shared_ptr weight; + std::shared_ptr bias; + float eps; + std::shared_ptr ans_output; + std::shared_ptr ans_input_standardization; + std::shared_ptr ans_input_std_deviation; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input_standardization") == tensors.end() + || tensors.find("input_std_deviation") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("weight") == tensors.end() + || tensors.find("bias") == tensors.end() + || tensors.find("ans_output") == tensors.end() + || tensors.find("ans_input_standardization") == tensors.end() + || tensors.find("ans_input_std_deviation") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input_standardization = tensors["input_standardization"]; + test->_attributes->input_std_deviation = tensors["input_std_deviation"]; + test->_attributes->input = tensors["input"]; + test->_attributes->weight = tensors["weight"]; + test->_attributes->bias = tensors["bias"]; + test->_attributes->ans_output = tensors["ans_output"]; + test->_attributes->ans_input_standardization = tensors["ans_input_standardization"]; + test->_attributes->ans_input_std_deviation = tensors["ans_input_std_deviation"]; + test->_attributes->eps = *reinterpret_cast(attributes["eps"].data()); + test->_attributes->bias_exist = *reinterpret_cast(attributes["bias_exist"].data()); + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLayerNormDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input_standardization = _attributes->input_standardization->to(device, device_id); + auto input_std_deviation = _attributes->input_std_deviation->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto weight = _attributes->weight->to(device, device_id); + auto bias = _attributes->bias->to(device, device_id); + auto eps = _attributes->eps; + CHECK_OR(infiniopCreateLayerNormDescriptor(handle, &op_desc, + output->desc(), + input_standardization->desc(), + input_std_deviation->desc(), + input->desc(), + weight->desc(), + (_attributes->bias_exist) ? 
bias->desc() : nullptr, + eps + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLayerNormWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLayerNorm(op_desc, workspace, workspace_size, + output->data(), + input_standardization->data(), + input_std_deviation->data(), + input->data(), + weight->data(), + (_attributes->bias_exist) ? bias->data() : nullptr, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans_output, _rtol, _atol); + allClose(input_standardization, _attributes->ans_input_standardization, _rtol, _atol); + allClose(input_std_deviation, _attributes->ans_input_std_deviation, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLayerNorm( + op_desc, workspace, workspace_size, + output->data(), + input_standardization->data(), + input_std_deviation->data(), + input->data(), + weight->data(), + (_attributes->bias_exist) ? bias->data() : nullptr, + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"bias_exist", "eps"}; +} + +std::vector Test::tensor_names() { + return {"output", "input_standardization", "input_std_deviation", "input", "weight", "bias", "ans_output", "ans_input_standardization", "ans_input_std_deviation"}; +} + +std::vector Test::output_names() { + return {"output", "input_standardization", "input_std_deviation"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input_standardization: " << _attributes->input_standardization->info() << std::endl; + oss << "- input_std_deviation: " << _attributes->input_std_deviation->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- weight: " << _attributes->weight->info() << std::endl; + oss << "- bias: " << (_attributes->bias_exist ? 
_attributes->bias->info() : "null") << std::endl; + oss << "- eps: " << _attributes->eps << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::layer_norm diff --git a/src/infiniop-test/src/ops/logical_and.cpp b/src/infiniop-test/src/ops/logical_and.cpp new file mode 100644 index 000000000..152a3027d --- /dev/null +++ b/src/infiniop-test/src/ops/logical_and.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_and { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalAndDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalAndDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalAndWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalAnd(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalAnd( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_and diff --git 
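Note: logical_and above (and logical_or below) exercise the standard create, query-workspace, execute flow. A condensed caller-side sketch of that sequence, including the teardown the tests omit; the infiniopDestroyLogicalAndDescriptor name and the status return of infinirtMalloc are assumptions by analogy with the other operators and calls in this patch, and error handling is abbreviated:

infiniStatus_t runLogicalAnd(infiniopHandle_t handle,
                             infiniopTensorDescriptor_t c_desc, void *c,
                             infiniopTensorDescriptor_t a_desc, const void *a,
                             infiniopTensorDescriptor_t b_desc, const void *b,
                             void *stream) {
    infiniopLogicalAndDescriptor_t desc;
    infiniStatus_t status = infiniopCreateLogicalAndDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    infiniopGetLogicalAndWorkspaceSize(desc, &workspace_size);

    void *workspace = nullptr;
    if (workspace_size > 0) {
        infinirtMalloc(&workspace, workspace_size); // assumed to report failure via a status return
    }

    status = infiniopLogicalAnd(desc, workspace, workspace_size, c, a, b, stream);

    if (workspace) {
        infinirtFree(workspace);
    }
    infiniopDestroyLogicalAndDescriptor(desc); // assumed name, by analogy with the other ops
    return status;
}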
a/src/infiniop-test/src/ops/logical_or.cpp b/src/infiniop-test/src/ops/logical_or.cpp new file mode 100644 index 000000000..8f7a261d5 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_or.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_or { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalOrDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalOrDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalOrWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalOr(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalOr( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_or diff --git a/src/infiniop-test/src/ops/maxpool.cpp b/src/infiniop-test/src/ops/maxpool.cpp new file mode 100644 index 000000000..698c5ad89 --- /dev/null +++ b/src/infiniop-test/src/ops/maxpool.cpp @@ -0,0 +1,263 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::maxpool { + +struct Test::Attributes { + // 输入张量 + std::shared_ptr input; + 
std::shared_ptr expected_output; + + // 最大池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + auto input_tensor = tensors["input"]; + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // 获取池化维度(输入张量维度 - 2,去掉batch和channel维度) + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // 解析并广播 kernel_size - 修复类型转换 + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; i++) { + test->_attributes->kernel_size.push_back(static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign(pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // 解析并广播 stride + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + // 直接使用提供的值 + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + // 广播单个值到所有维度 + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + // 解析并广播 padding + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + // 解析 ceil_mode + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopMaxPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + + auto output_shape = expected_output->shape(); + + 
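// Note: Test::build above broadcasts a scalar kernel_size / stride / padding to every
// pooled dimension. A standalone sketch of that rule (hypothetical helper, not part of
// this patch): for a 4-D NCHW input pool_ndim is 2, so a kernel_size attribute of {3}
// becomes {3, 3}, while {3, 2} is used as given.

#include <cstddef>
#include <vector>

std::vector<size_t> broadcastPoolParam(const std::vector<int> &values, size_t pool_ndim) {
    std::vector<size_t> out;
    if (values.size() == pool_ndim) {
        for (int v : values) {
            out.push_back(static_cast<size_t>(v)); // per-dimension values, used as-is
        }
    } else {
        out.assign(pool_ndim, static_cast<size_t>(values[0])); // broadcast the single value
    }
    return out;
}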
size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // 准备参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // 创建算子描述符 + CHECK_OR(infiniopCreateMaxPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create maxpool descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetMaxPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 执行最大池化 + CHECK_OR(infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during maxpool execution.")); + + // 验证结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]" << std::endl; + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::maxpool diff --git a/src/infiniop-test/src/ops/scatter.cpp b/src/infiniop-test/src/ops/scatter.cpp new file mode 100644 index 000000000..691e338da --- /dev/null +++ b/src/infiniop-test/src/ops/scatter.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::scatter { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopScatterDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateScatterDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetScatterWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopScatter(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopScatter( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << 
std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::scatter diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..e1406e588 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sin { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSinDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSin(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSin( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/tanh.cpp 
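Note: sin above, and tanh and hardswish elsewhere in this patch, all override the default tolerances based on the input element type. A possible consolidation of that repeated pattern; this helper is a sketch, not part of the patch, and assumes the ggml_type enum from ggml.h that the tensor wrapper already exposes via ggml_type():

#include <utility>
// Requires ggml.h for ggml_type and the GGML_TYPE_* constants.
// Returns {rtol, atol}. f32_tol differs per operator (1e-7 for sin, 1e-6 for tanh and
// hardswish); any other type keeps the values passed to Test::build.
std::pair<double, double> toleranceFor(ggml_type type,
                                       std::pair<double, double> defaults,
                                       std::pair<double, double> f32_tol) {
    switch (type) {
    case GGML_TYPE_BF16:
        return {1e-2, 1e-2};
    case GGML_TYPE_F16:
        return {1e-3, 1e-3};
    case GGML_TYPE_F32:
        return f32_tol;
    default:
        return defaults;
    }
}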
b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..6f966de09 --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTanhDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTanh( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..fea9cba92 --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where 
{ +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr condition; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("condition") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->condition = tensors["condition"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->a->ggml_type(); + if (elemType == GGML_TYPE_I8) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopWhereDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto condition = _attributes->condition->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc(), + condition->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopWhere( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "condition", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- condition: " << 
_attributes->condition->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::where diff --git a/src/infiniop/ops/averagepool/averagepool.h b/src/infiniop/ops/averagepool/averagepool.h new file mode 100644 index 000000000..7762826ab --- /dev/null +++ b/src/infiniop/ops/averagepool/averagepool.h @@ -0,0 +1,52 @@ +#ifndef __AVERAGEPOOL_H__ +#define __AVERAGEPOOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::averagepool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_H__ diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc new file mode 100644 index 000000000..2e8fa6851 --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc @@ -0,0 +1,362 @@ +#include "averagepool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + workspace_size = 0; + } + + template + void _avgpool_1d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { 
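// Worked example of the averaging rule implemented here (the CUDA/metax kernels later
// in this patch follow the same rule): input_width = 4, kernel = 3, stride = 2,
// pad = 1, ceil_mode = true gives output_width = 3.
//   ow = 0: window covers positions {-1, 0, 1}; -1 lies in the explicit padding range
//           [-pad, input_width + pad), so sum = x0 + x1 and valid_count = 3.
//   ow = 1: window covers {1, 2, 3}, all in range, so valid_count = 3.
//   ow = 2: window covers {3, 4, 5}; 4 is explicit padding, 5 falls beyond the padded
//           extent (implicit ceil_mode padding) and is excluded, so valid_count = 2
//           and output[2] = x3 / 2.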
+ if (iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + iw]); + valid_count++; + } else if (iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + ow] = utils::cast(result); + } + } + } + } + + template + void _avgpool_2d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + ih * input_width + iw]); + valid_count++; + } else if (ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + oh * output_width + ow] = utils::cast(result); + } + } + } + } + } + + template + void _avgpool_3d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * 
channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kd = 0; kd < static_cast(kernel_depth); ++kd) { + const int id = start_d + kd; + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + const int ih = start_h + kh; + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + const size_t idx = id * (input_height * input_width) + + ih * input_width + iw; + sum += utils::cast(input[input_offset + idx]); + valid_count++; + } else if (id >= -static_cast(pad_depth) && + id < static_cast(input_depth + pad_depth) && + ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + + const size_t out_idx = od * (output_height * output_width) + + oh * output_width + ow; + output[output_offset + out_idx] = utils::cast(result); + } + } + } + } + } + } + + template + void _avgpool_cpu(Ydata *output, const T *input) const { + switch (info.ndim) { + case 1: + _avgpool_1d(output, input); + break; + case 2: + _avgpool_2d(output, input); + break; + case 3: + _avgpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + AvgPoolInfo &info) { + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + output_size *= info.output_dims[i]; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + _avgpool_cpu(typed_output, typed_input); + break; + } + case INFINI_DTYPE_F16: { + float *typed_output_f32 = static_cast(workspace); + const fp16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + fp16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + float *typed_output_f32 = static_cast(workspace); + const bf16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + bf16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + 
typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +inline size_t calculateOutputSize(const AvgPoolInfo &info) { + size_t size = info.batch * info.channels; + for(size_t i = 0; i < info.ndim; ++i) { + size *= info.output_dims[i]; + } + return size; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + size_t workspace_size = 0; + if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) { + workspace_size = calculateOutputSize(info) * sizeof(float); + } + + *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::averagepool::cpu diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h new file mode 100644 index 000000000..8388f80ff --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CPU_H__ +#define __AVERAGEPOOL_CPU_H__ + +#include "../averagepool.h" + +DESCRIPTOR(cpu) + +#endif // __AVERAGEPOOL_CPU_H__ diff --git a/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh new file mode 100644 index 000000000..7c9d0f438 --- /dev/null +++ b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh @@ -0,0 +1,185 @@ +#ifndef __AVERAGEPOOL_KERNEL_H__ +#define __AVERAGEPOOL_KERNEL_H__ + +#include + +// 1D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool1d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_length, + int output_length, int kernel_size, int stride, int padding) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) { + return; + } + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_length + channel_idx * input_length; + T *output_ptr = output + batch_idx * channels * output_length + channel_idx * output_length; + + // 计算池化窗口的起始位置 + int window_start = output_idx * stride - padding; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int k = 0; k < kernel_size; ++k) { + int input_pos = window_start + k; + + if (input_pos >= 0 && input_pos < input_length) { + // 
有效的输入位置,转换为单精度进行累加 + sum += static_cast(input_ptr[input_pos]); + valid_count++; + } else if (input_pos >= -padding && input_pos < input_length + padding) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 2D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool2d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_height, + int input_width, int output_height, int output_width, int kernel_h, + int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为2D坐标 + int out_h = output_idx / output_width; + int out_w = output_idx % output_width; + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width; + T *output_ptr = output + batch_idx * channels * output_height * output_width + channel_idx * output_height * output_width; + + // 计算池化窗口的起始位置 + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = input_h * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 3D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool3d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_depth, + int input_height, int input_width, int output_depth, int output_height, + int output_width, int kernel_d, int kernel_h, int kernel_w, int stride_d, + int stride_h, int stride_w, int pad_d, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_depth * output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为3D坐标 + int out_d = output_idx / (output_height * output_width); + int remaining = output_idx % (output_height * output_width); + int out_h = remaining / output_width; + int out_w = remaining % output_width; + + // 计算输入和输出的偏移 + int input_spatial_size = input_depth * input_height * input_width; + int output_spatial_size = output_depth * output_height * output_width; + + const T *input_ptr = input + batch_idx * channels * input_spatial_size + channel_idx * 
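// Worked example of the linear-to-3D index decomposition just above: with
// output_depth = 2, output_height = 3, output_width = 4 and output_idx = 17,
//   out_d = 17 / (3 * 4) = 1, remaining = 17 % 12 = 5,
//   out_h = 5 / 4 = 1, out_w = 5 % 4 = 1,
// i.e. the thread handles output element (d, h, w) = (1, 1, 1).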
input_spatial_size; + T *output_ptr = output + batch_idx * channels * output_spatial_size + channel_idx * output_spatial_size; + + // 计算池化窗口的起始位置 + int window_start_d = out_d * stride_d - pad_d; + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kd = 0; kd < kernel_d; ++kd) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_d = window_start_d + kd; + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = (input_d * input_height + input_h) * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +#endif // __AVERAGEPOOL_KERNEL_H__ diff --git a/src/infiniop/ops/averagepool/info.h b/src/infiniop/ops/averagepool/info.h new file mode 100644 index 000000000..871e827a7 --- /dev/null +++ b/src/infiniop/ops/averagepool/info.h @@ -0,0 +1,136 @@ +#ifndef __AVERAGEPOOL_INFO_H__ +#define __AVERAGEPOOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include +#include + +namespace op::averagepool { + +inline utils::Result calculatePoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + size_t padded_input_size = input_size + 2 * padding; + + if (padded_input_size < kernel_size) { + return utils::Result(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t output_size; + if (ceil_mode) { + // 等效于整数的上取整 + output_size = (padded_input_size - kernel_size + stride - 1) / stride + 1; + } else { + // 等效于整数的下取整 + output_size = (padded_input_size - kernel_size) / stride + 1; + } + + return utils::Result(output_size); +} + +// 检查是否存在隐式填充 +inline bool hasImplicitPadding( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding, + bool ceil_mode) { + + if (!ceil_mode) { + return false; + } + return ((input_size + 2 * padding) - kernel_size) % stride != 0; +} + +class AvgPoolInfo { + AvgPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + bool has_implicit_padding = false; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + AvgPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != 
output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // 空间维度 + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // 初始化隐式填充标志 + info.has_implicit_padding = false; + + // 获取并校验空间维度 + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + + auto output_size_result = calculatePoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size_result); + + size_t expected_size = output_size_result.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + + // 检查当前维度是否存在隐式填充 + if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i], + info.strides[i], info.pads[i], info.ceil_mode)) { + info.has_implicit_padding = true; + } + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::averagepool + +#endif // __AVERAGEPOOL_INFO_H__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.h b/src/infiniop/ops/averagepool/metax/averagepool_metax.h new file mode 100644 index 000000000..eef332b5f --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_METAX_H__ +#define __AVERAGEPOOL_METAX_H__ + +#include "../averagepool.h" + +DESCRIPTOR(metax) + +#endif // __AVERAGEPOOL_METAX_CUH__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.maca b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca new file mode 100644 index 000000000..ee3c4bd9c --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca @@ -0,0 +1,332 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "averagepool_metax.h" +#include "../cuda/averagepool_kernel.cuh" +#include + +infiniStatus_t launch_avgpool_pytorch_kernel( + const op::averagepool::AvgPoolInfo& info, + const void* input, void* output, + infiniDtype_t data_type, hcStream_t stream) { + + int batch_size = static_cast(info.batch); + int channels = static_cast(info.channels); + + if (info.ndim == 1) { + // 1D平均池化 + int input_length = static_cast(info.input_dims[0]); + int output_length = static_cast(info.output_dims[0]); + int kernel_size = static_cast(info.kernel_sizes[0]); + int stride = static_cast(info.strides[0]); + int padding = static_cast(info.pads[0]); + + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (output_length + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_F16: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_BF16: + avgpool1d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, 
input_length, output_length, + kernel_size, stride, padding); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 2) { + // 2D平均池化 + int input_height = static_cast(info.input_dims[0]); + int input_width = static_cast(info.input_dims[1]); + int output_height = static_cast(info.output_dims[0]); + int output_width = static_cast(info.output_dims[1]); + int kernel_h = static_cast(info.kernel_sizes[0]); + int kernel_w = static_cast(info.kernel_sizes[1]); + int stride_h = static_cast(info.strides[0]); + int stride_w = static_cast(info.strides[1]); + int pad_h = static_cast(info.pads[0]); + int pad_w = static_cast(info.pads[1]); + + int total_output_elements = output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool2d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 3) { + // 3D平均池化 + int input_depth = static_cast(info.input_dims[0]); + int input_height = static_cast(info.input_dims[1]); + int input_width = static_cast(info.input_dims[2]); + int output_depth = static_cast(info.output_dims[0]); + int output_height = static_cast(info.output_dims[1]); + int output_width = static_cast(info.output_dims[2]); + int kernel_d = static_cast(info.kernel_sizes[0]); + int kernel_h = static_cast(info.kernel_sizes[1]); + int kernel_w = static_cast(info.kernel_sizes[2]); + int stride_d = static_cast(info.strides[0]); + int stride_h = static_cast(info.strides[1]); + int stride_w = static_cast(info.strides[2]); + int pad_d = static_cast(info.pads[0]); + int pad_h = static_cast(info.pads[1]); + int pad_w = static_cast(info.pads[2]); + + int total_output_elements = output_depth * output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool3d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, 
channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else { + return INFINI_STATUS_BAD_PARAM; + } + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims = {static_cast(info.batch), static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + while (input_dims.size() < 5) input_dims.push_back(1); + while (output_dims.size() < 5) output_dims.push_back(1); + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(input_desc, hcdnn_data_type, + input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(output_desc, hcdnn_data_type, + output_dims.size(), output_dims.data(), output_strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + while (kernel_size.size() < 3) kernel_size.push_back(1); + while (strides.size() < 3) strides.push_back(1); + while (pads.size() < 3) pads.push_back(0); + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor(pooling_desc, HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + HCDNN_NOT_PROPAGATE_NAN, kernel_size.size(), + kernel_size.data(), pads.data(), strides.data())); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + workspace_size = 0; + return 
INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { +#ifdef ENABLE_HCDNN_API + if (_info.has_implicit_padding) { + // 使用自定义kernel实现PyTorch兼容的逻辑 + return launch_avgpool_pytorch_kernel(_info, input, output, _dtype, (hcStream_t)stream); + } else { + const float alpha = 1.0f, beta = 0.0f; + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::metax diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool.cu b/src/infiniop/ops/averagepool/nvidia/averagepool.cu new file mode 100644 index 000000000..6f276aac8 --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool.cu @@ -0,0 +1,220 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "averagepool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, 
cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, kernel_vec.size(), kernel_vec.data(), + pad_vec.data(), stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Average pooling typically doesn't need a workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + 
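+        // Null out the moved-from object's cuDNN handles so its destructor's
+        // CLEANUP_CUDNN_DESCRIPTORS pass sees nullptr and does not destroy
+        // descriptors now owned by this instance.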
other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::nvidia diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh new file mode 100644 index 000000000..ef19aa1dc --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CUDA_CUH__ +#define __AVERAGEPOOL_CUDA_CUH__ + +#include "../averagepool.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_CUDA_CUH__ diff --git a/src/infiniop/ops/averagepool/operator.cc b/src/infiniop/ops/averagepool/operator.cc new file mode 100644 index 000000000..c664504d5 --- /dev/null +++ b/src/infiniop/ops/averagepool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/averagepool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/averagepool_metax.h" +#endif +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return 
op::averagepool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize( + infiniopAvgPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopAvgPool( + infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/batch_norm/batch_norm.h b/src/infiniop/ops/batch_norm/batch_norm.h new file mode 100644 index 000000000..b550a752b --- /dev/null +++ b/src/infiniop/ops/batch_norm/batch_norm.h @@ -0,0 +1,56 @@ +#ifndef __BATCH_NORM_H__ +#define __BATCH_NORM_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::batch_norm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BatchNormInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + BatchNormInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t 
workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t running_mean_desc, \ + infiniopTensorDescriptor_t running_var_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + float momentum, \ + float eps \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + void * running_mean, \ + void * running_var, \ + const void * input, \ + const void * weight, \ + const void * bias, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc new file mode 100644 index 000000000..69c563c3a --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc @@ -0,0 +1,129 @@ +#include "batch_norm_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::batch_norm::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps + ); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + size_t WorkSpaceSize = 0; + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias +) { + +#pragma omp parallel for + for(size_t c = 0; c < info.channel_size; c++) + { + float sum_sq = 0., sum=0.; + for(size_t b = 0; b < info.batch_size; b++) + { + sum += op::common_cpu::reduce_op::sum( + input + (b * info.channel_size + c) * info.dim_size, + info.dim_size, + 1 + ); + sum_sq += op::common_cpu::reduce_op::sumSquared( + input + (b * info.channel_size + c) * info.dim_size, + info.dim_size, + 1 + ); + } + float batch_and_dim_size = (info.batch_size * info.dim_size); + float E = sum / batch_and_dim_size; + float var_biased = sum_sq / batch_and_dim_size - E * E; + float var_unbiased = var_biased * batch_and_dim_size / (batch_and_dim_size - 1.0); + + auto running_mean_ptr = running_mean + c * info.running_mean_stride; + auto running_var_ptr = running_var + c * info.running_var_stride; + *running_mean_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_mean_ptr) + info.momentum * E); + *running_var_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_var_ptr) + info.momentum * var_unbiased); + + for(size_t b = 0; b < info.batch_size; b++) + { + for(size_t d = 0; d < info.dim_size; d++) + { + auto input_ptr = input + ((b * 
info.channel_size + c) * info.dim_size) + d; + auto output_ptr = output + ((b * info.channel_size + c) * info.dim_size) + d;; + auto weight_ptr = weight + c * info.weight_stride; + auto bias_ptr = bias + c * info.bias_stride; + *output_ptr = utils::cast( + (utils::cast(*input_ptr) - E) / std::sqrt(var_biased + info.eps) * utils::cast(*weight_ptr) + utils::cast(*bias_ptr) + ); + } + } + } + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BATCH_NORM(TDATA) \ + CHECK_STATUS(calculate_batch_norm(_info, \ +(TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias)) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_BATCH_NORM(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_BATCH_NORM(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_BATCH_NORM(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h new file mode 100644 index 000000000..ac38987ef --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BATCH_NORM_CPU_H__ +#define __BATCH_NORM_CPU_H__ + +#include "../batch_norm.h" + +DESCRIPTOR(cpu) + + +#endif // __BATCH_NORM_CPU_H__ diff --git a/src/infiniop/ops/batch_norm/cuda/kernel.cuh b/src/infiniop/ops/batch_norm/cuda/kernel.cuh new file mode 100644 index 000000000..952d7ef79 --- /dev/null +++ b/src/infiniop/ops/batch_norm/cuda/kernel.cuh @@ -0,0 +1,64 @@ +#ifndef __BATCH_NORM_KERNEL_CUH__ +#define __BATCH_NORM_KERNEL_CUH__ + +#include +#include "../../../reduce/cuda/reduce.cuh" + +template +__device__ void batchNormKernel( + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + float momentum, + float eps +) { + auto output_ptr = output + dim_size * blockIdx.x; + auto input_ptr = input + dim_size * blockIdx.x; + + auto running_mean_ptr = running_mean + running_mean_stride * blockIdx.x; + auto running_var_ptr = running_var + running_var_stride * blockIdx.x; + auto weight_ptr = weight + weight_stride * blockIdx.x; + auto bias_ptr = bias + bias_stride * blockIdx.x; + + Tcompute sum_squared = 0., sum = 0.; + for(size_t b = 0; b < batch_size; b++) + { + sum += op::common_cuda::reduce_op::sum( + input_ptr + b * (channel_size * dim_size), dim_size + ); + sum_squared += op::common_cuda::reduce_op::sumSquared( + input_ptr + b * (channel_size * dim_size), dim_size + ); + } + + __shared__ Tcompute E, var_biased; + if (threadIdx.x == 0) { + E = sum / Tcompute(batch_size * dim_size); + var_biased = sum_squared / Tcompute(batch_size * dim_size) - E * E; + Tcompute var_unbiased = var_biased * Tcompute(batch_size * dim_size) / Tcompute(batch_size * dim_size - 1); + *running_mean_ptr = Tcompute(1 - momentum) * Tcompute(*running_mean_ptr) + Tcompute(momentum) * E; + *running_var_ptr = Tcompute(1 - momentum) * Tcompute(*running_var_ptr) + Tcompute(momentum) * var_unbiased; + } 
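+    // Thread 0 has written the batch statistics (E, var_biased) into shared memory
+    // and updated the running mean/var; the barrier below publishes them to every
+    // thread in the block before the normalization loop.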
+ __syncthreads(); + + for (size_t n = threadIdx.x; n < batch_size * dim_size; n += BLOCK_SIZE) + { + size_t b = n / dim_size, d = n % dim_size; + *(output_ptr + b * channel_size * dim_size + d) = ( + Tcompute(*(input_ptr + b * channel_size * dim_size + d)) - E + ) / sqrtf(float(var_biased + Tcompute(eps))) * Tcompute(*weight_ptr) + Tcompute(*bias_ptr); + } +} + +#endif // __BATCH_NORM_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/info.h b/src/infiniop/ops/batch_norm/info.h new file mode 100644 index 000000000..f78359077 --- /dev/null +++ b/src/infiniop/ops/batch_norm/info.h @@ -0,0 +1,73 @@ +#ifndef __BATCH_NORM_INFO_H__ +#define __BATCH_NORM_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::batch_norm { + +class BatchNormInfo { +private: + BatchNormInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t batch_size, channel_size, dim_size; + + ptrdiff_t running_mean_stride; + ptrdiff_t running_var_stride; + ptrdiff_t weight_stride; + ptrdiff_t bias_stride; + float momentum; + float eps; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createBatchNormInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == 3, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + CHECK_SAME_SHAPE(output_desc->shape(), input_desc->shape()); + size_t batch_size = output_desc->dim(0), + channel_size = output_desc->dim(1), + dim_size = output_desc->dim(2); + CHECK_SAME_SHAPE( + running_mean_desc->shape(), running_var_desc->shape(), + weight_desc->shape(), bias_desc->shape() + ); + CHECK_OR_RETURN( + running_mean_desc->ndim() == 1 && running_mean_desc->dim(0) == channel_size, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(BatchNormInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + batch_size, channel_size, dim_size, + running_mean_desc->stride(0), + running_var_desc->stride(0), + weight_desc->stride(0), + bias_desc->stride(0), + momentum, + eps +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __BATCH_NORM_INFO_H__ diff --git a/src/infiniop/ops/batch_norm/metax/batch_norm_metax.h b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.h new file mode 100644 index 000000000..0c2d8c800 --- /dev/null +++ b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.h @@ -0,0 +1,8 @@ +#ifndef __BATCH_NORM_METAX_H__ +#define __BATCH_NORM_METAX_H__ + +#include "../batch_norm.h" + +DESCRIPTOR(metax) + +#endif // __BATCH_NORM_METAX_H__ diff --git a/src/infiniop/ops/batch_norm/metax/batch_norm_metax.maca b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.maca new file mode 100644 index 000000000..0fa8e22a2 --- /dev/null +++ b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.maca @@ -0,0 +1,181 @@ +#include 
"../../../devices/metax/metax_common.h" +#include "batch_norm_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::batch_norm::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + + float momentum, + float eps +) { + batchNormKernel( + output, + running_mean, + running_var, + input, + weight, + bias, + + batch_size, + channel_size, + dim_size, + + running_mean_stride, + running_var_stride, + weight_stride, + bias_stride, + + momentum, + eps + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + hcStream_t stream +) { + launchKernel<<>>( + output, + running_mean, + running_var, + input, + weight, + bias, + + info.batch_size, + info.channel_size, + info.dim_size, + + info.running_mean_stride, + info.running_var_stride, + info.weight_stride, + info.bias_stride, + info.momentum, + info.eps + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps + ); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_BATCH_NORM(BLOCK_SIZE, TDATA) \ + calculate_batch_norm(_info, (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, 
(const TDATA *)bias, stream) + #define CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + + #undef CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK + #undef CALCULATE_BATCH_NORM + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::metax diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu new file mode 100644 index 000000000..b79fed65f --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu @@ -0,0 +1,186 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "batch_norm_nvidia.cuh" + +#include +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" + + +#include "../info.h" + +namespace op::batch_norm::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + + float momentum, + float eps +) { + + batchNormKernel( + output, + running_mean, + running_var, + input, + weight, + bias, + + batch_size, + channel_size, + dim_size, + + running_mean_stride, + running_var_stride, + weight_stride, + bias_stride, + + momentum, + eps + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + cudaStream_t stream +) { + launchKernel<<>>( + output, + running_mean, + running_var, + input, + weight, + bias, + + info.batch_size, + info.channel_size, + info.dim_size, + + info.running_mean_stride, + info.running_var_stride, + info.weight_stride, + info.bias_stride, + info.momentum, + info.eps + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t 
input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps + ); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_BATCH_NORM(BLOCK_SIZE, TDATA) \ + calculate_batch_norm(_info, (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream) + #define CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::nvidia diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh new file mode 100644 index 000000000..33c93f2b4 --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh @@ -0,0 +1,10 @@ +#ifndef __BATCH_NORM_NVIDIA_API_H__ +#define __BATCH_NORM_NVIDIA_API_H__ + +// #ifdef ENABLE_NINETOOTHED +#include "../batch_norm.h" +DESCRIPTOR(nvidia) + +// #endif + +#endif // __BATCH_NORM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/batch_norm/operator.cc b/src/infiniop/ops/batch_norm/operator.cc new file mode 100644 index 000000000..81418b2d2 --- /dev/null +++ b/src/infiniop/ops/batch_norm/operator.cc @@ -0,0 +1,168 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/batch_norm.h" + +#ifdef ENABLE_CPU_API +#include "cpu/batch_norm_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/batch_norm_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/batch_norm_metax.h" +#endif + 
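The dispatcher below routes the public batch-norm C API to the per-device descriptors. For reference, a minimal host-side sketch of that API follows; it assumes the tensor descriptors have already been created elsewhere with the shapes validated by BatchNormInfo (3-D input/output of shape (N, C, D), 1-D per-channel running-mean/var/weight/bias of length C), and the helper name and momentum/eps values are illustrative placeholders, not defaults defined by this patch.

// Illustrative sketch only: tensor-descriptor creation is assumed done elsewhere,
// and momentum/eps are placeholder values.
static infiniStatus_t runBatchNormOnce(infiniopHandle_t handle,
                                       infiniopTensorDescriptor_t y_desc,
                                       infiniopTensorDescriptor_t mean_desc,
                                       infiniopTensorDescriptor_t var_desc,
                                       infiniopTensorDescriptor_t x_desc,
                                       infiniopTensorDescriptor_t w_desc,
                                       infiniopTensorDescriptor_t b_desc,
                                       void *y, void *running_mean, void *running_var,
                                       const void *x, const void *w, const void *b,
                                       void *workspace, size_t workspace_capacity,
                                       void *stream) {
    infiniopBatchNormDescriptor_t desc = nullptr;
    // Create the operator descriptor (dtype/shape checks happen here).
    infiniStatus_t status = infiniopCreateBatchNormDescriptor(
        handle, &desc, y_desc, mean_desc, var_desc, x_desc, w_desc, b_desc,
        /*momentum=*/0.1f, /*eps=*/1e-5f);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }
    // Query the workspace requirement, then launch on the given stream.
    size_t workspace_size = 0;
    status = infiniopGetBatchNormWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS && workspace_size <= workspace_capacity) {
        status = infiniopBatchNorm(desc, workspace, workspace_size,
                                   y, running_mean, running_var, x, w, b, stream);
    }
    infiniopDestroyBatchNormDescriptor(desc);
    return status;
}

The same create, get-workspace, run, destroy sequence applies to the other operators added in this patch (for example AvgPool and CrossEntropyLoss), each dispatched through an operator.cc of the same shape.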
+__C infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::batch_norm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + running_mean_desc, \ + running_var_desc, \ + input_desc, \ + weight_desc, \ + bias_desc, \ + momentum, \ + eps \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopBatchNorm( + infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + running_mean, \ + running_var, \ + input, \ + weight, \ + bias, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc new file mode 100644 index 000000000..af97c1d09 --- /dev/null 
+++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc @@ -0,0 +1,321 @@ +#include "cross_entropy_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cross_entropy_loss::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + std::vector logits_shape; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const std::vector &shape) + : handle(handle_ptr), logits_shape(shape) { + // 计算workspace大小:需要存储per-sample loss + size_t N = logits_shape[0]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + workspace_size = N * inner_size * sizeof(float); + } + + void cross_entropy_f16_as_float(float *workspace, float *loss_result, + const fp16_t *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + // 转换F16 logits为float + size_t total_logits_size = N * C * inner_size; + std::vector float_logits(total_logits_size); + for (size_t i = 0; i < total_logits_size; ++i) { + float_logits[i] = utils::cast(logits[i]); + } + + // 使用float精度计算 + cross_entropy_cpu_float(workspace, loss_result, float_logits.data(), target); + } + + // 通用的float版本交叉熵计算 + void cross_entropy_cpu_float(float *workspace, float *loss_result, + const float *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + max_logit = std::max(max_logit, logits[logit_idx]); + } + + // 计算exp的和(减去最大值保证数值稳定) + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + sum_exp += std::exp(logits[logit_idx] - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit = logits[target_logit_idx]; + + // 计算交叉熵损失:log_softmax[target] = logit[target] - log(sum_exp) - max_logit + // 所以 -log_softmax[target] = log(sum_exp) + max_logit - logit[target] + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失(忽略ignore_index的样本) + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + *loss_result = valid_count > 0 ? 
static_cast(total_loss / valid_count) : 0.0f; + } + + // 通用模板版本(用于F32和BF16) + template + void cross_entropy_cpu_generic(float *workspace, T *loss_result, + const T *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + max_logit = std::max(max_logit, logit_val); + } + + // 计算exp的和 + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + sum_exp += std::exp(logit_val - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit; + if constexpr (std::is_same::value) { + target_logit = utils::cast(logits[target_logit_idx]); + } else { + target_logit = logits[target_logit_idx]; + } + + // 计算交叉熵损失 + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失 + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + float mean_loss = valid_count > 0 ? 
static_cast(total_loss / valid_count) : 0.0f; + + // 转换回输出类型 + if constexpr (std::is_same::value) { + *loss_result = utils::cast(mean_loss); + } else { + *loss_result = static_cast(mean_loss); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + logits_shape(std::move(other.logits_shape)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, const std::vector &shape) { + Opaque opaque(handle_ptr, shape); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, const void *target, + infiniDtype_t dtype) const { + if (!workspace || !loss || !logits || !target) { + return INFINI_STATUS_BAD_PARAM; + } + + if (workspace_size < this->workspace_size) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + float *workspace_ptr = static_cast(workspace); + const int64_t *target_ptr = static_cast(target); + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *logits_ptr = static_cast(logits); + float *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *logits_ptr = static_cast(logits); + fp16_t *loss_ptr = static_cast(loss); + + // F16特殊处理:使用float计算 + float temp_loss; + cross_entropy_f16_as_float(workspace_ptr, &temp_loss, logits_ptr, target_ptr); + *loss_ptr = utils::cast(temp_loss); + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *logits_ptr = static_cast(logits); + bf16_t *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + + size_t get_workspace_size() const { + return workspace_size; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig_shape = logits_desc->shape(); + std::vector logits_shape; + + if (orig_shape.size() == 1) { + logits_shape = {1, orig_shape[0]}; + } else { + logits_shape = orig_shape; + } + + if (logits_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque_result = Opaque::create(handle, logits_shape); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->get_workspace_size(), opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { + return _opaque->calculate(workspace, workspace_size, loss, logits, target, _dtype); +} + +} // namespace op::cross_entropy_loss::cpu diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h new file mode 100644 index 000000000..8afec63d0 --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CPU_H__ +#define __CROSS_ENTROPY_LOSS_CPU_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(cpu) + +#endif // __CROSS_ENTROPY_LOSS_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h new file mode 100644 index 000000000..dad108d78 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h @@ -0,0 +1,48 @@ +#ifndef __CROSS_ENTROPY_LOSS_H__ +#define __CROSS_ENTROPY_LOSS_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cross_entropy_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t loss_desc, \ + infiniopTensorDescriptor_t logits_desc, \ + infiniopTensorDescriptor_t target_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *loss, \ + const void *logits, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __CROSS_ENTROPY_LOSS_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh new file mode 100644 index 000000000..5279011ef --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh @@ -0,0 +1,67 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +#include +#include +#include + +__device__ __forceinline__ float to_float(float val) { return val; } + +__device__ __forceinline__ float to_float(half val) { + return __half2float(val); +} + +__device__ __forceinline__ float to_float(__hpcc_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void cross_entropy_loss_kernel(T_out *loss, const T_in *logits, + const int64_t *target, int N, int C, + long long inner_size, + int64_t ignore_index) { + + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long total = (long long)N * inner_size; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inner = (int)(idx % inner_size); + + int64_t t = target[idx]; + + if (t == ignore_index) { + loss[idx] = (T_out)0.0f; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0.0f; + return; + } + + const long long base_offset = ((long long)n * C * inner_size) + inner; + + // 1. 找到 logits 中的最大值 + float max_val = -HUGE_VALF; // 使用浮点数的最大负值 + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + max_val = fmaxf(max_val, to_float(logits[offset])); + } + + // 2. 计算 sum(exp(x - max_val)) + float sum_exp = 0.0f; + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + sum_exp += expf(to_float(logits[offset]) - max_val); + } + + // 3. 
计算最终 loss + long long target_offset = base_offset + (long long)t * inner_size; + float logit_tgt = to_float(logits[target_offset]); + + loss[idx] = (T_out)(logf(sum_exp) + max_val - logit_tgt); +} + +#endif // __CROSS_ENTROPY_KERNEL_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/info.h b/src/infiniop/ops/cross_entropy_loss/info.h new file mode 100644 index 000000000..5278bf912 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/info.h @@ -0,0 +1,36 @@ +#ifndef __CROSS_ENTROPY_LOSS_INFO_H__ +#define __CROSS_ENTROPY_LOSS_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::cross_entropy_loss { + +class CrossEntropyInfo { +public: + CrossEntropyInfo() = default; + size_t batch = 0; + size_t num_classes = 0; + infiniDtype_t dtype; + + static utils::Result create( + infiniopTensorDescriptor_t loss, + infiniopTensorDescriptor_t logits, + infiniopTensorDescriptor_t target) { + + if (logits->ndim() != 2 || loss->ndim() != 1 || target->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CrossEntropyInfo info; + info.batch = logits->dim(0); + info.num_classes = logits->dim(1); + info.dtype = logits->dtype(); + return utils::Result(std::move(info)); + } +}; + +} // namespace op::cross_entropy_loss + +#endif // __CROSS_ENTROPY_LOSS_INFO_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h new file mode 100644 index 000000000..382d555e0 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_METAX_H__ +#define __CROSS_ENTROPY_METAX_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(metax) + +#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca new file mode 100644 index 000000000..94f611e7a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca @@ -0,0 +1,145 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "cross_entropy_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cross_entropy_loss::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + if (dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto opaque = new Opaque(handle->internal()); + const auto &orig_shape = logits_desc->shape(); + + if (orig_shape.size() == 1) { + opaque->logits_shape = {1, orig_shape[0]}; + } else { + opaque->logits_shape = orig_shape; + } + + if (opaque->logits_shape.size() < 2) return INFINI_STATUS_BAD_TENSOR_SHAPE; + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) inner *= (long long)s[i]; + + size_t 
workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *loss, + const void *logits, const void *target, void *stream_) const { + + const auto &shape = _opaque->logits_shape; + int N = (int)shape[0]; + int C = (int)shape[1]; + long long inner_size = 1; + for (size_t i = 2; i < shape.size(); ++i) + inner_size *= shape[i]; + + long long total = (long long)N * inner_size; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) return INFINI_STATUS_INTERNAL_ERROR; + float* per_sample_loss = reinterpret_cast(workspace); + + const int64_t *typed_target = reinterpret_cast(target); + const int64_t ignore_index = -100; + hcStream_t stream = (hcStream_t)stream_; + + dim3 blockSize(256); + dim3 gridSize((total + blockSize.x - 1) / blockSize.x); + + if (_dtype == INFINI_DTYPE_F32) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const float*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const half*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cross_entropy_loss_kernel<__hpcc_bfloat16, float> + <<>>( + per_sample_loss, (const __hpcc_bfloat16*)logits, typed_target, + N, C, inner_size, ignore_index); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (hcGetLastError() != hcSuccess) return INFINI_STATUS_INTERNAL_ERROR; + + std::vector h_loss((size_t)total); + std::vector h_target((size_t)total); + if (hcMemcpyAsync(h_loss.data(), per_sample_loss, need_ws, hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcMemcpyAsync(h_target.data(), typed_target, (size_t)total * sizeof(int64_t), hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + double acc = 0.0; + long long count = 0; + for (long long i = 0; i < total; ++i) { + if (h_target[i] != ignore_index) { + acc += (double)h_loss[i]; + count++; + } + } + double mean = (count > 0) ? 
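// Reduction strategy shared with the CPU and NVIDIA backends: the kernel above
// writes one float per (n, inner) sample into the workspace; the mean over
// samples whose target != ignore_index is then taken on the host and written
// back to `loss` as a single element of the input dtype.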
(acc / (double)count) : 0.0; + + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (hcMemcpyAsync(loss, &v, sizeof(float), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(half), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_BF16) { + __hpcc_bfloat16 v = __float2bfloat16_rn((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(__hpcc_bfloat16), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy_loss::metax diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu new file mode 100644 index 000000000..3d795a67a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "cross_entropy_loss_nvidia.cuh" + +namespace op::cross_entropy_loss::nvidia { +namespace cuda { + +__device__ __forceinline__ float to_float(float v) { return v; } +__device__ __forceinline__ float to_float(double v) { return (float)v; } +__device__ __forceinline__ float to_float(half v) { return __half2float(v); } +__device__ __forceinline__ float to_float(__nv_bfloat16 v) { + return __bfloat162float(v); +} + +template +__global__ void +softmaxCrossEntropy_per_sample(T_out *__restrict__ loss, + const T_in *__restrict__ logits, + const int64_t *__restrict__ target, int N, int C, + long long inner_size, int64_t ignore_index) { + long long total = (long long)N * inner_size; + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inr = (int)(idx % inner_size); + + int64_t t = target[(long long)n * inner_size + inr]; + if (ignore_index != LLONG_MIN && t == ignore_index) { + loss[idx] = (T_out)0; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0; + return; + } + + const long long base = ((long long)n * C * inner_size) + inr; + + // 数值稳定 LSE:lse = log(sum exp(x - m)) + m + float m = -CUDART_INF_F; + for (int c = 0; c < C; ++c) { + m = fmaxf(m, to_float(logits[base + (long long)c * inner_size])); + } + + float sum_exp = 0.f; + for (int c = 0; c < C; ++c) { + sum_exp += expf(to_float(logits[base + (long long)c * inner_size]) - m); + } + + float lse = logf(sum_exp) + m; + float logit_t = to_float(logits[base + (long long)(int)t * inner_size]); + loss[idx] = (T_out)(lse - logit_t); +} + +} // namespace cuda + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr p) : internal(p) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + auto handle = reinterpret_cast(handle_); + auto 
dtype = logits_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig = logits_desc->shape(); + auto opaque = new Opaque(handle->internal()); + + if (orig.size() == 1) { + opaque->logits_shape = {1, orig[0]}; + } else { + opaque->logits_shape = orig; + } + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + + size_t workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + const auto &s = _opaque->logits_shape; + int N = (int)s[0]; + int C = (int)s[1]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + long long total = (long long)N * inner; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) { + return INFINI_STATUS_INTERNAL_ERROR; + } + float *per_sample = reinterpret_cast(workspace); + + const int64_t *tgt_i64 = reinterpret_cast(target); + const int64_t ignore_index = -100; + + // 1) 写 per-sample loss -> workspace(float) + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + cudaStream_t st = (cudaStream_t)stream; + + if (_dtype == INFINI_DTYPE_F32) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const float *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const half *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cuda::softmaxCrossEntropy_per_sample<__nv_bfloat16, float> + <<>>(per_sample, (const __nv_bfloat16 *)logits, + tgt_i64, N, C, inner, ignore_index); + } + { + auto err = cudaGetLastError(); + if (err != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + + // 2) host 侧 mean(仅统计 target != ignore_index) + std::vector h_loss((size_t)total); + std::vector h_tgt((size_t)total); + if (cudaMemcpyAsync(h_loss.data(), per_sample, need_ws, + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaMemcpyAsync(h_tgt.data(), tgt_i64, (size_t)total * sizeof(int64_t), + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + double acc = 0.0; + long long cnt = 0; + for (long long i = 0; i < total; ++i) { + if (h_tgt[i] != ignore_index) { + acc += (double)h_loss[i]; + ++cnt; + } + } + double mean = (cnt > 0) ? 
(acc / (double)cnt) : 0.0; + + // 3) 把标量 mean 写回 device 的 loss 指针(按输入 dtype 写 1 个元素) + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (cudaMemcpyAsync(loss, &v, sizeof(float), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(half), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_BF16) { + __nv_bfloat16 v = __float2bfloat16((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(__nv_bfloat16), cudaMemcpyHostToDevice, + st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} +} // namespace op::cross_entropy_loss::nvidia diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh new file mode 100644 index 000000000..843fc943d --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CUDA_CUH__ +#define __CROSS_ENTROPY_LOSS_CUDA_CUH__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(nvidia) + +#endif // __CROSS_ENTROPY_LOSS_CUDA_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/operator.cc b/src/infiniop/ops/cross_entropy_loss/operator.cc new file mode 100644 index 000000000..e9a47558f --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy_loss.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cross_entropy_loss_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast< \ + op::cross_entropy_loss::NAMESPACE::Descriptor **>(desc_ptr), \ + loss_desc, logits_desc, target_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize( + infiniopCrossEntropyLossDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
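// End-to-end call sequence for the public C API declared in
// include/infiniop/ops/cross_entropy_loss.h (sketch; the handle, stream, device
// buffers and the tensor descriptors for loss, logits and target are assumed to
// already exist; CHECK and device_malloc stand in for the caller's error
// handling and allocation):
//
//   infiniopCrossEntropyLossDescriptor_t ce_desc;
//   CHECK(infiniopCreateCrossEntropyLossDescriptor(handle, &ce_desc,
//                                                  loss_desc, logits_desc, target_desc));
//
//   size_t ws_size = 0;
//   CHECK(infiniopGetCrossEntropyLossWorkspaceSize(ce_desc, &ws_size));
//   void *workspace = device_malloc(ws_size);   // one float per sample
//
//   // logits: (N, C[, d1, ...]) in F16/BF16/F32, target: int64 class indices,
//   // loss: a single element; targets equal to -100 are excluded from the mean.
//   CHECK(infiniopCrossEntropyLoss(ce_desc, workspace, ws_size,
//                                  d_loss, d_logits, d_target, stream));
//
//   CHECK(infiniopDestroyCrossEntropyLossDescriptor(ce_desc));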
GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopCrossEntropyLoss( + infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, loss, logits, target, \ + stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor( + infiniopCrossEntropyLossDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..6b5edef36 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,54 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file 
mode 100644 index 000000000..0373b766f --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,19 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..cefbf0141 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __h2div(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hdiv(a, b); + } else if constexpr (std::is_same_v) { + return __fdiv_rn(a, b); + } else { + return a / b; + } + } +} DivOp; +} // namespace op::div::cuda + +#endif // __DIV_CUDA_H__ diff --git a/src/infiniop/ops/div/metax/div_metax.h b/src/infiniop/ops/div/metax/div_metax.h new file mode 100644 index 000000000..1e56a7d44 --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.h @@ -0,0 +1,8 @@ +#ifndef __DIV_METAX_API_H__ +#define __DIV_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(div, metax) + +#endif // __DIV_METAX_API_H__ diff --git a/src/infiniop/ops/div/metax/div_metax.maca b/src/infiniop/ops/div/metax/div_metax.maca new file mode 100644 index 000000000..a8ecd8643 --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.maca @@ -0,0 +1,62 @@ +#include "div_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::div::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, 
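// Call sequence for the public div API that routes into this dtype switch
// (sketch; descriptors for c, a, b — all the same shape, one of
// BF16/F16/F32/F64 — and the device buffers are assumed to exist, and
// device_malloc is a hypothetical allocator):
//
//   infiniopDivDescriptor_t div_desc;
//   infiniopCreateDivDescriptor(handle, &div_desc, c_desc, a_desc, b_desc);
//
//   size_t ws = 0;
//   infiniopGetDivWorkspaceSize(div_desc, &ws);
//   void *workspace = device_malloc(ws);
//
//   infiniopDiv(div_desc, workspace, ws, d_c, d_a, d_b, stream);   // c = a / b, elementwise
//   infiniopDestroyDivDescriptor(div_desc);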
cuda::DivOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::metax diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu new file mode 100644 index 000000000..4cb64af63 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "div_nvidia.cuh" + +namespace op::div::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::DivOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..162156887 --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc new file mode 100644 index 000000000..66555cdd5 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc @@ -0,0 +1,82 @@ +#include "equal_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = c_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL); + CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size 
----------------------- + + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +) const { + std::vector contiguous_strides(_info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < _info.ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = _info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + size_t elem_size = infiniSizeOf(_info.dtype); + auto c_ptr = reinterpret_cast(c); + *c_ptr = true; + #pragma omp parallel for + for(size_t i = 0; i < total_size; i ++) { + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + size_t rem = i; + for(int d = _info.ndim - 1; d >= 0; d --) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * _info.a_strides[d]; + b_ptr += dim_index * _info.b_strides[d]; + } + if (memcmp(a_ptr, b_ptr, elem_size) != 0) { + *c_ptr = false; + } + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h new file mode 100644 index 000000000..a09c63d9b --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.h @@ -0,0 +1,9 @@ +#ifndef __EQUAL_CPU_H__ +#define __EQUAL_CPU_H__ + +#include "../equal.h" + +DESCRIPTOR(cpu) + + +#endif // __EQUAL_CPU_H__ diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh new file mode 100644 index 000000000..193c94333 --- /dev/null +++ b/src/infiniop/ops/equal/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __EQUAL_KERNEL_CUH__ +#define __EQUAL_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void equalKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + if (threadIdx.x == 0) + { + *c = true; + } + __syncthreads(); + for(size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { + auto a_ptr = a; + auto b_ptr = b; + size_t rem = i; + for(int d = ndim - 1; d >= 0; d --) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * a_strides[d]; + b_ptr += dim_index * b_strides[d]; + } + if ((*a_ptr != *b_ptr) && (*c == true)) { + *c = false; + } + + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __EQUAL_KERNEL_CUH__ diff --git a/src/infiniop/ops/equal/equal.h b/src/infiniop/ops/equal/equal.h new file mode 100644 index 000000000..12cc0ba16 --- /dev/null +++ b/src/infiniop/ops/equal/equal.h @@ -0,0 +1,48 @@ +#ifndef __EQUAL_H__ +#define __EQUAL_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::equal::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + EqualInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + EqualInfo info, \ + 
size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * c, \ + const void * a, \ + const void * b, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/equal/info.h b/src/infiniop/ops/equal/info.h new file mode 100644 index 000000000..5dd2c0a54 --- /dev/null +++ b/src/infiniop/ops/equal/info.h @@ -0,0 +1,46 @@ +#ifndef __EQUAL_INFO_H__ +#define __EQUAL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::equal { + +class EqualInfo { +private: + EqualInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + size_t ndim; + infiniDtype_t dtype; + std::vector a_shape; + std::vector a_strides; + std::vector b_strides; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createEqualInfo( + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(c_desc->ndim() == 1 && c_desc->dim(0) == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_SAME_SHAPE(a_desc->shape(), b_desc->shape()); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(EqualInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + a_desc->ndim(), + a_desc->dtype(), + a_desc->shape(), + a_desc->strides(), + b_desc->strides() +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __EQUAL_INFO_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h new file mode 100644 index 000000000..0ebb67bb3 --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __EQUAL_METAX_H__ +#define __EQUAL_METAX_H__ + +#include "../equal.h" + +DESCRIPTOR(metax) + +#endif // __EQUAL_METAX_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca new file mode 100644 index 000000000..c8f4dda7d --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.maca @@ -0,0 +1,162 @@ +#include "../../../devices/metax/metax_common.h" +#include "equal_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::equal::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + equalKernel( + c, + a, + b, + ndim, + total_size, + contiguous_strides, + a_strides, + 
b_strides + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_equal( + const EqualInfo &info, + bool * c, + const Tdata * a, + const Tdata * b, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + for(size_t d = 0; d < ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * a_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * b_strides_cuda = a_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + launchKernel<<<1, BLOCK_SIZE, 0, stream>>>( + c, + a, + b, + info.ndim, + total_size, + contiguous_strides_cuda, + a_strides_cuda, + b_strides_cuda + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = a_desc->dtype(); + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3;; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_EQUAL(TDATA) \ + calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + switch (_info.dtype) { + case INFINI_DTYPE_U8: + return CALCULATE_EQUAL(uint8_t); + case INFINI_DTYPE_U16: + return CALCULATE_EQUAL(uint16_t); + case INFINI_DTYPE_U32: + return CALCULATE_EQUAL(uint32_t); + case INFINI_DTYPE_U64: + return CALCULATE_EQUAL(uint64_t); + case INFINI_DTYPE_I8: + return CALCULATE_EQUAL(int8_t); + case INFINI_DTYPE_I16: + return CALCULATE_EQUAL(int16_t); + case INFINI_DTYPE_I32: + return CALCULATE_EQUAL(int32_t); + case INFINI_DTYPE_I64: + return CALCULATE_EQUAL(int64_t); + case INFINI_DTYPE_F16: + return CALCULATE_EQUAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_EQUAL(float); + case INFINI_DTYPE_BF16: + 
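// Both the CPU path and equalKernel walk all elements through the same
// flat-index decomposition: contiguous_strides[d] is the product of the a-dims
// preceding dimension d (so dimension 0 varies fastest in the flattened index),
// and each flat index is peeled into per-dimension coordinates that are then
// re-applied with the real, possibly non-contiguous strides of a and b.
// Host-side reference of that mapping (sketch):
//
//   #include <cstddef>
//   #include <vector>
//
//   ptrdiff_t strided_offset(size_t flat, const std::vector<ptrdiff_t> &contig,
//                            const std::vector<ptrdiff_t> &strides) {
//       ptrdiff_t off = 0;
//       for (int d = (int)contig.size() - 1; d >= 0; --d) {
//           size_t coord = flat / contig[d];
//           flat %= contig[d];
//           off += (ptrdiff_t)coord * strides[d];
//       }
//       return off;
//   }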
return CALCULATE_EQUAL(cuda_bfloat16); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_EQUAL +} +} // namespace op::equal::metax diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..d1bfab8f0 --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,163 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "equal_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::equal::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + equalKernel( + c, + a, + b, + ndim, + total_size, + contiguous_strides, + a_strides, + b_strides + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_equal( + const EqualInfo &info, + bool * c, + const Tdata * a, + const Tdata * b, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + for(size_t d = 0; d < ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * a_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * b_strides_cuda = a_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<<1, BLOCK_SIZE, 0, stream>>>( + c, + a, + b, + info.ndim, + total_size, + contiguous_strides_cuda, + a_strides_cuda, + b_strides_cuda + ); + + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = a_desc->dtype(); + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3; +// ---------------------- end: check data type and calculate workspace size ----------------------- + *desc_ptr = new Descriptor( + 
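// The ndim * 3 * sizeof(ptrdiff_t) workspace reserved here is partitioned in
// calculate_equal as [contiguous_strides | a_strides | b_strides] and copied to
// the device before the kernel launch.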
dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream_ +) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_EQUAL(TDATA) \ + calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + switch (_info.dtype) { + case INFINI_DTYPE_U8: + return CALCULATE_EQUAL(uint8_t); + case INFINI_DTYPE_U16: + return CALCULATE_EQUAL(uint16_t); + case INFINI_DTYPE_U32: + return CALCULATE_EQUAL(uint32_t); + case INFINI_DTYPE_U64: + return CALCULATE_EQUAL(uint64_t); + case INFINI_DTYPE_I8: + return CALCULATE_EQUAL(int8_t); + case INFINI_DTYPE_I16: + return CALCULATE_EQUAL(int16_t); + case INFINI_DTYPE_I32: + return CALCULATE_EQUAL(int32_t); + case INFINI_DTYPE_I64: + return CALCULATE_EQUAL(int64_t); + case INFINI_DTYPE_F16: + return CALCULATE_EQUAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_EQUAL(float); + case INFINI_DTYPE_BF16: + return CALCULATE_EQUAL(cuda_bfloat16); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_EQUAL +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..11760c91d --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __EQUAL_NVIDIA_API_H__ +#define __EQUAL_NVIDIA_API_H__ +#include "../equal.h" + +DESCRIPTOR(nvidia) + +#endif // __EQUAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..81607fef8 --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,152 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif + +__C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + a_desc, \ + b_desc \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, 
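// Semantics of the public API dispatched here: `a` and `b` must share shape and
// dtype, `c` is a one-element BOOL tensor, and after infiniopEqual it holds
// true iff every element of `a` equals the corresponding element of `b`
// (bitwise memcmp on CPU, operator!= on the GPU backends). Call sketch
// (descriptors and device buffers assumed to exist; names illustrative):
//
//   infiniopEqualDescriptor_t eq_desc;
//   infiniopCreateEqualDescriptor(handle, &eq_desc, c_desc, a_desc, b_desc);
//
//   size_t ws = 0;
//   infiniopGetEqualWorkspaceSize(eq_desc, &ws);   // 3 * ndim * sizeof(ptrdiff_t) on GPU, 0 on CPU
//
//   infiniopEqual(eq_desc, workspace, ws, d_equal_flag, d_a, d_b, stream);
//   infiniopDestroyEqualDescriptor(eq_desc);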
nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + c, \ + a, \ + b, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..58a6d0f2d --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..867c7afa5 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,21 @@ +#ifndef 
__EXP_CPU_H__ +#define __EXP_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::exp(input); + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..12446f31a --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __EXP_CUDA_H__ +#define __EXP_CUDA_H__ + +#include +#include +#include + +namespace op::exp::cuda { +typedef struct ExpOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h new file mode 100644 index 000000000..fb10faf9b --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.h @@ -0,0 +1,8 @@ +#ifndef __EXP_METAX_API_H__ +#define __EXP_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(exp, metax) + +#endif // __EXP_METAX_API_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca new file mode 100644 index 000000000..c71703c6d --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.maca @@ -0,0 +1,60 @@ +#include "exp_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::exp::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return 
_device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..3bdf2eb45 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..ee1dc6768 --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" 
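// Illustrative host-side usage of the exp C API wired up in this file (a sketch only;
// the handle, tensor descriptors, device buffers, workspace and stream are assumed to
// be prepared by the caller, and CHECK_STATUS is assumed to be in scope as it is in
// other files of this patch; the function name is hypothetical):
static inline infiniStatus_t exampleExpCall(infiniopHandle_t handle,
                                            infiniopTensorDescriptor_t y_desc,
                                            infiniopTensorDescriptor_t x_desc,
                                            void *y, const void *x,
                                            void *workspace, size_t workspace_size,
                                            void *stream) {
    infiniopExpDescriptor_t desc;
    CHECK_STATUS(infiniopCreateExpDescriptor(handle, &desc, y_desc, x_desc));
    size_t required = 0;
    CHECK_STATUS(infiniopGetExpWorkspaceSize(desc, &required));
    if (required > workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    CHECK_STATUS(infiniopExp(desc, workspace, required, y, x, stream));
    return infiniopDestroyExpDescriptor(desc);
}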
+#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.cc b/src/infiniop/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 000000000..cc1d5b740 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,106 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::gather::cpu { + +infiniStatus_t calculate_gather( + const GatherInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + 
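// The loop below builds contiguous strides over the output shape with dimension 0
// fastest (stride[0] = 1, stride[d] = stride[d-1] * shape[d-1]); the flat element
// index of the parallel loop is then decomposed into per-dimension coordinates by
// repeated division from the last dimension down. Illustrative example (not from the
// patch): for an output shape of (2, 3, 4) the strides are (1, 2, 6) and total_size
// is 24; flat index 17 decomposes as 17 / 6 = 2 (d = 2), 5 / 2 = 2 (d = 1),
// 1 / 1 = 1 (d = 0), i.e. coordinates (1, 2, 2), and indeed 1*1 + 2*2 + 2*6 = 17.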
for(size_t d = 0; d < info.ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + int gather_dim = info.dim; + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for(size_t i = 0; i < total_size; i++) + { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = i; + for (int d = info.ndim - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + if(d != gather_dim) + input_ptr += dim_index * element_size * info.input_strides[d]; + } + int64_t gather_number = *index_ptr; + input_ptr += gather_number * element_size * info.input_strides[gather_dim]; + // *output_ptr = *input_ptr; + memcpy( + output_ptr, + input_ptr, + element_size + ); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_gather(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} + diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.h b/src/infiniop/ops/gather/cpu/gather_cpu.h new file mode 100644 index 000000000..bc74fd669 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GATHER_CPU_H__ +#define __GATHER_CPU_H__ + +#include "../gather.h" + +DESCRIPTOR(cpu) + + +#endif // __GATHER_CPU_H__ diff --git a/src/infiniop/ops/gather/cuda/kernel.cuh b/src/infiniop/ops/gather/cuda/kernel.cuh new file mode 100644 index 000000000..e49ca0fd2 --- /dev/null +++ b/src/infiniop/ops/gather/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __GATHER_KERNEL_CUH__ +#define __GATHER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void gatherKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; d--) { + if (d == gather_dim) + 
continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_gather_size; c ++) { + int64_t gather_number = *(index_ptr + c * index_strides[gather_dim]); + *(output_ptr + c * output_strides[gather_dim]) = \ + *(input_ptr + gather_number * input_strides[gather_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __GATHER_KERNEL_CUH__ diff --git a/src/infiniop/ops/gather/gather.h b/src/infiniop/ops/gather/gather.h new file mode 100644 index 000000000..70991f5db --- /dev/null +++ b/src/infiniop/ops/gather/gather.h @@ -0,0 +1,49 @@ +#ifndef __GATHER_H__ +#define __GATHER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::gather::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GatherInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + GatherInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/gather/info.h b/src/infiniop/ops/gather/info.h new file mode 100644 index 000000000..aa7a54a16 --- /dev/null +++ b/src/infiniop/ops/gather/info.h @@ -0,0 +1,58 @@ +#ifndef __GATHER_INFO_H__ +#define __GATHER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::gather { + +class GatherInfo { +private: + GatherInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + size_t input_dim_size; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createGatherInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE(output_desc->shape(), index_desc->shape()); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++) { + if (d != dim) + CHECK_OR_RETURN(input_desc->dim(d) == output_desc->dim(d), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_OR_RETURN(ndim > dim, INFINI_STATUS_BAD_PARAM); +// -------------------------- end: check tensor shape and input validity 
-------------------------- + return utils::Result(GatherInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->dim(dim), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __GATHER_INFO_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.h b/src/infiniop/ops/gather/metax/gather_metax.h new file mode 100644 index 000000000..bf0a25a36 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.h @@ -0,0 +1,8 @@ +#ifndef __GATHER_METAX_H__ +#define __GATHER_METAX_H__ + +#include "../gather.h" + +DESCRIPTOR(metax) + +#endif // __GATHER_METAX_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.maca b/src/infiniop/ops/gather/metax/gather_metax.maca new file mode 100644 index 000000000..7254078c3 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "gather_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; 
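// Two hedged notes on the launch above: if the kernel runs with more than one thread
// per block, gatherKernel's inner loop (`c ++` starting at threadIdx.x) makes the
// threads of a block redundantly re-cover each other's elements; a block-strided form
// such as
//     for (size_t c = threadIdx.x; c < index_gather_size; c += blockDim.x)
// is presumably what the BLOCK_SIZE template parameter is meant to support. Also, if
// hcMemcpyAsync does not stage pageable host buffers before returning the way CUDA's
// cudaMemcpyAsync does, freeing `contiguous_strides` here could race with the pending
// host-to-device copies; synchronizing the stream before the delete[] would be the
// conservative choice.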
+ return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_GATHER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_GATHER_WITH_METAX_BLOCK + #undef CALCULATE_GATHER +} +} // namespace op::gather::metax diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cu b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu new file mode 100644 index 000000000..94741a7d5 --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu @@ -0,0 +1,189 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" 
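// Semantics implemented by this operator (matching the CPU path above): output and
// index share a shape, and along the gather dimension
//     output[i0, ..., c, ...] = input[i0, ..., index[i0, ..., c, ...], ...]
// Illustrative example (not taken from the patch): with dim = 1,
//     input = [[10, 11, 12],      index = [[2, 0],
//              [20, 21, 22]]               [1, 1]]
// the output is [[12, 10],
//                [21, 21]].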
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "gather_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata><<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); 
+ const GatherInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_GATHER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_GATHER_WITH_BLOCK_SIZE + #undef CALCULATE_GATHER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gather::nvidia diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh new file mode 100644 index 000000000..46d42fa0c --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GATHER_NVIDIA_API_H__ +#define __GATHER_NVIDIA_API_H__ +#include "../gather.h" + +DESCRIPTOR(nvidia) + +#endif // __GATHER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gather/operator.cc b/src/infiniop/ops/gather/operator.cc new file mode 100644 index 000000000..c748f811c --- /dev/null +++ b/src/infiniop/ops/gather/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gather.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gather_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gather_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gather_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim 
+) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gather::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + 
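// Reference points implied by hardswish f(x) = x * clamp(x + 3, 0, 6) / 6, as computed
// by the HardswishOp functor declared in hardswish_cpu.h below (handy as a quick
// sanity check): f(x) = 0 for x <= -3, f(x) = x for x >= 3, and f(1) = 1 * 4 / 6 ≈ 0.6667.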
+ CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..e137be8a0 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..d5b369bce --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 index 000000000..16b131aa9 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_METAX_API_H__ +#define __HARDSWISH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..e53b94357 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..9e279c2ef --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..e8ba19fc1 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc new file mode 100644 index 000000000..f45ddef7a --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc @@ -0,0 +1,108 @@ +#include "index_copy_inplace_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../../rearrange/cpu/rearrange_cpu.h" +#include "../info.h" + +namespace op::index_copy_inplace::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = 
reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); +// ---------------------- end: check data type and calculate workspace size ----------------------- + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + size_t size_of_dtype = infiniSizeOf(_info.dtype); + auto index_ptr = reinterpret_cast(index); + + + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + size_of_dtype * _info.total_input_size; + + + reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream); + memset(workspace_out, 0, _info.total_output_size * size_of_dtype); + size_t copy_unit_size = _info.meta_strides[_info.dim] * size_of_dtype; + #pragma omp parallel for + for (size_t dst_index = 0; dst_index < _info.output_shape[_info.dim]; dst_index++) { + size_t src_index = _info.index_shape[0] - 1; + while (true) + { + if (*(index_ptr + src_index * _info.index_strides[0]) == int64_t(dst_index)) { + std::memcpy( + workspace_out + size_of_dtype * dst_index * _info.meta_strides[_info.dim], + workspace_in + size_of_dtype * src_index * _info.meta_strides[_info.dim], + copy_unit_size + ); + break; + } + else if (src_index == 0) + break; + src_index --; + } + } + reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream); + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h new file mode 100644 index 000000000..384197013 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h @@ -0,0 +1,9 @@ +#ifndef __INDEX_COPY_INPLACE_CPU_H__ +#define __INDEX_COPY_INPLACE_CPU_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(cpu) + + +#endif // __INDEX_COPY_INPLACE_CPU_H__ diff --git a/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h new file mode 100644 index 000000000..f99fd3116 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h @@ -0,0 +1,55 @@ +#ifndef __INDEX_COPY_INPLACE_H__ 
+#define __INDEX_COPY_INPLACE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define INDEX_COPY_INPLACE_DESCRIPTOR(NAMESPACE) \ + namespace op::index_copy_inplace::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + IndexCopyInplaceInfo _info; \ + size_t _workspace_size; \ + void *_rearrange_desc_in; \ + void *_rearrange_desc_out; \ + Descriptor( \ + infiniDtype_t dtype, \ + IndexCopyInplaceInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id, \ + void *rearrange_desc_in, \ + void *rearrange_desc_out \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _rearrange_desc_in(rearrange_desc_in), \ + _rearrange_desc_out(rearrange_desc_out) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/info.h b/src/infiniop/ops/index_copy_inplace/info.h new file mode 100644 index 000000000..99d6a3a1e --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/info.h @@ -0,0 +1,76 @@ +#ifndef __INDEX_COPY_INPLACE_INFO_H__ +#define __INDEX_COPY_INPLACE_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::index_copy_inplace { + +class IndexCopyInplaceInfo { +private: + IndexCopyInplaceInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t total_input_size; + size_t total_output_size; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + std::vector meta_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createIndexCopyInplaceInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + std::vector meta_strides(input_desc->ndim()); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + size_t total_input_size = 1; + size_t total_output_size = 1; + for (size_t d = 0; d < input_desc->ndim(); d++){ + total_input_size *= input_desc->dim(d); + total_output_size *= output_desc->dim(d); + if (d == dim) { + continue; + } + else { + meta_strides[d] = last_dim * last_stride; + last_dim = input_desc->dim(d); + last_stride = meta_strides[d]; + } + } + meta_strides[dim] = last_dim * last_stride; +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(IndexCopyInplaceInfo{ +// 
------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + total_input_size, + total_output_size, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + meta_strides, + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __INDEX_COPY_INPLACE_INFO_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h new file mode 100644 index 000000000..65673c88c --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_INPLACE_METAX_H__ +#define __INDEX_COPY_INPLACE_METAX_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(metax) + +#endif // __INDEX_COPY_INPLACE_METAX_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca new file mode 100644 index 000000000..e540b6201 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca @@ -0,0 +1,147 @@ +#include "../../../devices/metax/metax_common.h" +#include "index_copy_inplace_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../../rearrange/metax/rearrange_metax.h" +#include "../info.h" + +namespace op::index_copy_inplace::metax { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + hcStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_METAX(hcMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + hcMemcpyDeviceToHost, + stream + )); + hcStreamSynchronize(stream); + CHECK_METAX(hcMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + hcMemcpyDeviceToDevice, + stream + )); + hcStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + 
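// As in the CPU implementation above, the two rearrange descriptors move data between
// the strided tensors and a shared contiguous "meta" layout in which `dim` is the
// outermost (largest-stride) dimension: input is rearranged into the workspace, whole
// slices along `dim` are then copied to the positions named by `index`
// (e.g. with dim = 0 and index = [2, 0], input slice 0 goes to output slice 2 and
// input slice 1 to output slice 0), and the result is rearranged back into `output`.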
InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + hcMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + + hcDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + hcDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; + +} +} // namespace op::index_copy_inplace::metax diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu new file mode 100644 index 000000000..ebf5907e0 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu @@ -0,0 +1,140 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "index_copy_inplace_nvidia.cuh" +#include "../../rearrange/nvidia/rearrange_nvidia.cuh" +#include "../info.h" + +namespace op::index_copy_inplace::nvidia { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + cudaStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_CUDA(cudaMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + cudaMemcpyDeviceToHost, + stream + )); + cudaStreamSynchronize(stream); + CHECK_CUDA(cudaMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + cudaMemcpyDeviceToDevice, + stream + )); + cudaStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque 
{ + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + cudaMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + cudaDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + cudaDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::nvidia diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh new file mode 100644 index 000000000..04c3c86f7 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#define __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#include 
"../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(nvidia) + +#endif // __INDEX_COPY_INPLACE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/index_copy_inplace/operator.cc b/src/infiniop/ops/index_copy_inplace/operator.cc new file mode 100644 index 000000000..e8886b65f --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/index_copy_inplace.h" + +#ifdef ENABLE_CPU_API +#include "cpu/index_copy_inplace_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/index_copy_inplace_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/index_copy_inplace_metax.h" +#endif + +__C infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::index_copy_inplace::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopIndexCopyInplace( + infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, 
nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc new file mode 100644 index 000000000..508dcecc6 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc @@ -0,0 +1,284 @@ +#include "interpolate_nearest_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include + +namespace op::interpolate_nearest::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + InterpolateNearestInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const InterpolateNearestInfo &interpolate_info) + : handle(handle_ptr), info(interpolate_info) { + workspace_size = 0; + } + + template + size_t compute_input_index_1d(size_t idx) const { + size_t temp = idx; + + // 1D插值:3D张量 (N, C, W) + size_t w = temp % info.output_size[0]; + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; + } + + // 计算2D插值的输入索引 + template + size_t compute_input_index_2d(size_t idx) const { + size_t temp = idx; + + // 2D插值:4D张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width在索引1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; + } + + // 计算3D插值的输入索引 + template + size_t compute_input_index_3d(size_t idx) const { + size_t temp = idx; + + // 3D插值:5D张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width在索引2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height在索引1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = std::min(static_cast(std::floor(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = 
std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; + } + + // 计算输出索引 + template + size_t compute_output_index(size_t idx) const { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } + } + + // 计算总元素数 + size_t calculate_total_elements() const { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; + } + + // 主要的插值计算函数 + template + void interpolate_nearest_cpu(T *output, const T *input) const { + size_t total_elements = calculate_total_elements(); + +#pragma omp parallel for schedule(static) + for (ptrdiff_t idx = 0; idx < static_cast(total_elements); ++idx) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx); + break; + default: + continue; + } + + size_t output_idx = compute_output_index(idx); + output[output_idx] = input[input_idx]; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const InterpolateNearestInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16 && data_type != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = 
static_cast<float *>(output);
+            const float *typed_input = static_cast<const float *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        case INFINI_DTYPE_F16: {
+            fp16_t *typed_output = static_cast<fp16_t *>(output);
+            const fp16_t *typed_input = static_cast<const fp16_t *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        case INFINI_DTYPE_BF16: {
+            bf16_t *typed_output = static_cast<bf16_t *>(output);
+            const bf16_t *typed_input = static_cast<const bf16_t *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        case INFINI_DTYPE_I8: {
+            int8_t *typed_output = static_cast<int8_t *>(output);
+            const int8_t *typed_input = static_cast<const int8_t *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle_,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t output_desc,
+                                  infiniopTensorDescriptor_t input_desc) {
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = output_desc->dtype();
+
+    // Check that the data type is supported
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_I8);
+
+    InterpolateNearestInfo info;
+    CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc));
+
+    auto opaque_result = Opaque::create(handle, info, dtype);
+    CHECK_RESULT(opaque_result);
+    auto opaque = new Opaque(opaque_result.take());
+
+    *desc_ptr = new Descriptor(dtype, info, opaque->workspace_size, opaque,
+                               handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *output, const void *input,
+                                     void *stream) const {
+    return _opaque->calculate(workspace, workspace_size, output, input, _dtype);
+}
+
+} // namespace op::interpolate_nearest::cpu
diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h
new file mode 100644
index 000000000..78dd3ff97
--- /dev/null
+++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __INTERPOLATE_NEAREST_CPU_H__
+#define __INTERPOLATE_NEAREST_CPU_H__
+
+#include "../interpolate_nearest.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __INTERPOLATE_NEAREST_CPU_H__
diff --git a/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh
new file mode 100644
index 000000000..60c798792
--- /dev/null
+++ b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh
@@ -0,0 +1,168 @@
+#ifndef INTERPOLATE_NEAREST_KERNEL_CUH
+#define INTERPOLATE_NEAREST_KERNEL_CUH
+
+#include "../info.h"
+#include
+
+template <typename T>
+__device__ inline size_t
+compute_input_index_1d(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+
+    // 1D interpolation: 3D tensor (N, C, W)
+    size_t w = temp % info.output_size[0]; // width at index 0
+    temp /= info.output_size[0];
+    size_t c = temp % info.channels;
+    size_t b = temp / info.channels;
+
+    float inv_scale = static_cast<float>(info.input_size[0]) / static_cast<float>(info.output_size[0]);
+    size_t input_w = min(static_cast<size_t>(floorf(static_cast<float>(w) * inv_scale)),
+                         info.input_size[0] - 1);
+
+    return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2];
+}
+
+template <typename T>
+__device__ inline size_t
+compute_input_index_2d(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+
+    // 2D interpolation: 4D tensor (N, C, H, W)
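+    // The decomposition below treats the flat output index as (b, c, h, w)
+    // with w varying fastest, then maps each output coordinate to an input
+    // coordinate via floor(out_coord * in_size / out_size), clamped to
+    // in_size - 1 (nearest-neighbor, "floor" rounding). For example, with an
+    // input width of 2 and an output width of 5, inv_scale_w = 0.4 and output
+    // w = 0..4 reads input w = 0, 0, 0, 1, 1.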
+    size_t w = temp % info.output_size[1]; // width at index 1
+    temp /= info.output_size[1];
+    size_t h = temp % info.output_size[0]; // height at index 0
+    temp /= info.output_size[0];
+    size_t c = temp % info.channels;
+    size_t b = temp / info.channels;
+
+    float inv_scale_h = static_cast<float>(info.input_size[0]) / static_cast<float>(info.output_size[0]);
+    float inv_scale_w = static_cast<float>(info.input_size[1]) / static_cast<float>(info.output_size[1]);
+
+    size_t input_h = min(static_cast<size_t>(floorf(static_cast<float>(h) * inv_scale_h)),
+                         info.input_size[0] - 1);
+    size_t input_w = min(static_cast<size_t>(floorf(static_cast<float>(w) * inv_scale_w)),
+                         info.input_size[1] - 1);
+
+    return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3];
+}
+
+template <typename T>
+__device__ inline size_t
+compute_input_index_3d(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+
+    // 3D interpolation: 5D tensor (N, C, D, H, W)
+    size_t w = temp % info.output_size[2]; // width at index 2
+    temp /= info.output_size[2];
+    size_t h = temp % info.output_size[1]; // height at index 1
+    temp /= info.output_size[1];
+    size_t d = temp % info.output_size[0]; // depth at index 0
+    temp /= info.output_size[0];
+    size_t c = temp % info.channels;
+    size_t b = temp / info.channels;
+
+    float inv_scale_d = static_cast<float>(info.input_size[0]) / static_cast<float>(info.output_size[0]);
+    float inv_scale_h = static_cast<float>(info.input_size[1]) / static_cast<float>(info.output_size[1]);
+    float inv_scale_w = static_cast<float>(info.input_size[2]) / static_cast<float>(info.output_size[2]);
+
+    size_t input_d = min(static_cast<size_t>(floorf(static_cast<float>(d) * inv_scale_d)),
+                         info.input_size[0] - 1);
+    size_t input_h = min(static_cast<size_t>(floorf(static_cast<float>(h) * inv_scale_h)),
+                         info.input_size[1] - 1);
+    size_t input_w = min(static_cast<size_t>(floorf(static_cast<float>(w) * inv_scale_w)),
+                         info.input_size[2] - 1);
+
+    return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4];
+}
+
+template <typename T>
+__device__ inline size_t
+compute_output_index(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+    size_t w, h, d, c, b;
+
+    switch (info.dim) {
+    case INTERPOLATE_1D: {
+        // 3D tensor (N, C, W)
+        w = temp % info.output_size[0];
+        temp /= info.output_size[0];
+        c = temp % info.channels;
+        b = temp / info.channels;
+        return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2];
+    }
+
+    case INTERPOLATE_2D: {
+        // 4D tensor (N, C, H, W)
+        w = temp % info.output_size[1];
+        temp /= info.output_size[1];
+        h = temp % info.output_size[0];
+        temp /= info.output_size[0];
+        c = temp % info.channels;
+        b = temp / info.channels;
+        return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3];
+    }
+
+    case INTERPOLATE_3D: {
+        // 5D tensor (N, C, D, H, W)
+        w = temp % info.output_size[2];
+        temp /= info.output_size[2];
+        h = temp % info.output_size[1];
+        temp /= info.output_size[1];
+        d = temp % info.output_size[0];
+        temp /= info.output_size[0];
+        c = temp % info.channels;
+        b = temp / info.channels;
+        return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4];
+    }
+
+    default:
+        return 0;
+    }
+}
+
+__host__ __device__ inline size_t
+calculate_total_elements(const InterpolateNearestInfo &info) {
+    size_t total = info.batch_size * info.channels;
+    switch (info.dim) {
+    case INTERPOLATE_1D:
+        total *= info.output_size[0]; // width
+        break;
+    case 
INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; +} + +template +__global__ void interpolate_nearest_kernel(T *output, const T *input, + InterpolateNearestInfo info) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_elements = calculate_total_elements(info); + + if (idx < total_elements) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx, info); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx, info); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx, info); + break; + default: + return; + } + + size_t output_idx = compute_output_index(idx, info); + output[output_idx] = input[input_idx]; + } +} + +#endif // INTERPOLATE_NEAREST_KERNEL_CUH diff --git a/src/infiniop/ops/interpolate_nearest/info.h b/src/infiniop/ops/interpolate_nearest/info.h new file mode 100644 index 000000000..162d6eb02 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/info.h @@ -0,0 +1,118 @@ +#ifndef __INTERPOLATE_NEAREST_INFO_H__ +#define __INTERPOLATE_NEAREST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +enum InterpolateDim { + INTERPOLATE_1D = 1, // 3D 张量 (N, C, W) + INTERPOLATE_2D = 2, // 4D 张量 (N, C, H, W) + INTERPOLATE_3D = 3 // 5D 张量 (N, C, D, H, W) +}; + +struct InterpolateNearestInfo { + size_t batch_size; + size_t channels; + + // 输入和输出的空间维度大小 + size_t input_size[3]; // [depth/height/width] 根据维度使用不同数量 + size_t output_size[3]; // [depth/height/width] 根据维度使用不同数量 + + InterpolateDim dim; // 插值维度:1D, 2D, 3D + infiniDtype_t dtype; + + // 张量步长(最多支持 5D 张量) + size_t input_stride[5]; + size_t output_stride[5]; + + static infiniStatus_t create( + InterpolateNearestInfo *info, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + // 检查数据类型 + if (input_desc->dtype() != output_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto input_shape = input_desc->shape(); + auto output_shape = output_desc->shape(); + auto input_stride = input_desc->strides(); + auto output_stride = output_desc->strides(); + + // 根据张量维度确定插值类型 + if (input_desc->ndim() == 3 && output_desc->ndim() == 3) { + // 1D 插值:3D 张量 (N, C, W) + info->dim = INTERPOLATE_1D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // width + info->output_size[0] = output_shape[2]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 3; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 4 && output_desc->ndim() == 4) { + // 2D 插值:4D 张量 (N, C, H, W) + info->dim = INTERPOLATE_2D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // height + info->input_size[1] = input_shape[3]; // width + info->output_size[0] = output_shape[2]; // height + info->output_size[1] = output_shape[3]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 4; ++i) { + info->input_stride[i] = 
input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 5 && output_desc->ndim() == 5) { + // 3D 插值:5D 张量 (N, C, D, H, W) + info->dim = INTERPOLATE_3D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // depth + info->input_size[1] = input_shape[3]; // height + info->input_size[2] = input_shape[4]; // width + info->output_size[0] = output_shape[2]; // depth + info->output_size[1] = output_shape[3]; // height + info->output_size[2] = output_shape[4]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 5; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info->dtype = input_desc->dtype(); + return INFINI_STATUS_SUCCESS; + } +}; + +#endif // __INTERPOLATE_NEAREST_INFO_H__ diff --git a/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h new file mode 100644 index 000000000..73499c2ff --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h @@ -0,0 +1,51 @@ +#ifndef __INTERPOLATE_NEAREST_H__ +#define __INTERPOLATE_NEAREST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::interpolate_nearest::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + InterpolateNearestInfo _info; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + InterpolateNearestInfo info, \ + size_t workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _dtype(dtype), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __INTERPOLATE_NEAREST_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h new file mode 100644 index 000000000..1619dbf2f --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_METAX_H__ +#define __INTERPOLATE_NEAREST_METAX_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(metax) + +#endif // __INTERPOLATE_NEAREST_METAX_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca new file mode 100644 index 000000000..5cf0e5e66 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca @@ -0,0 +1,86 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "interpolate_nearest_metax.h" +#include +#include + +#include "../cuda/kernel.cuh" + +namespace 
op::interpolate_nearest::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + Opaque(std::shared_ptr internal_) : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && + dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +inline void launch_interpolate_nearest_kernel(T *output, const T *input, InterpolateNearestInfo info, int grid_size, int block_size, hcStream_t stream) { + interpolate_nearest_kernel<<>>(output, input, info); +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + int total_elements = calculate_total_elements(_info); + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_F16: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__hpcc_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel<__hpcc_bfloat16>(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::metax diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu new file mode 100644 index 000000000..a7b63c6f4 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "interpolate_nearest_nvidia.cuh" +#include +#include +#include + +namespace op::interpolate_nearest::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle 
= reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // Check supported data types + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + size_t total_elements = calculate_total_elements(_info); + + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = reinterpret_cast(output); + const float *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_F16: { + half *typed_output = reinterpret_cast(output); + const half *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<<>>( + typed_output, typed_input, _info); + } break; + + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__nv_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<__nv_bfloat16> + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::nvidia diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh new file mode 100644 index 000000000..aab5f7882 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __INTERPOLATE_NEAREST_NVIDIA_CUH__ +#define __INTERPOLATE_NEAREST_NVIDIA_CUH__ + +#include "../../../devices/nvidia/nvidia_handle.h" +#include "../interpolate_nearest.h" + +DESCRIPTOR(nvidia) + +#endif // __INTERPOLATE_NEAREST_NVIDIA_CUH__ diff --git a/src/infiniop/ops/interpolate_nearest/operator.cc b/src/infiniop/ops/interpolate_nearest/operator.cc new file mode 100644 index 000000000..0a0f99ee1 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/interpolate_nearest.h" + +#ifdef ENABLE_CPU_API +#include "cpu/interpolate_nearest_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/interpolate_nearest_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/interpolate_nearest_metax.h" +#endif + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor( + infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::interpolate_nearest::NAMESPACE::Descriptor::create( \ + 
handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize( + infiniopInterpolateNearestDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopInterpolateNearest( + infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor( + infiniopInterpolateNearestDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.cc b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.cc new file mode 100644 index 000000000..8833cb999 --- /dev/null +++ b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.cc @@ -0,0 +1,126 @@ +#include "layer_norm_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::layer_norm::cpu { + +template +infiniStatus_t calculate_layer_norm( + const LayerNormInfo &info, + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + #pragma omp parallel for + for(size_t b = 0; b < info.input_shape[0] * info.input_shape[1]; b ++) + { + size_t b0 = b / info.input_shape[1], b1 = b % info.input_shape[1]; + auto output_ptr = output + b0 * info.output_strides[0] + b1 * 
info.output_strides[1]; + auto input_ptr = input + b0 * info.input_strides[0] + b1 * info.input_strides[1]; + auto standard_ptr = input_standardization + b0 * info.input_standardization_strides[0] + b1 * info.input_standardization_strides[1]; + auto std_ptr = input_std_deviation + b0 * info.input_std_deviation_strides[0] + b1 * info.input_std_deviation_strides[1]; + float mean = op::common_cpu::reduce_op::sum( + input_ptr, + info.normalized_size, + info.input_strides[2] + ) / info.input_shape[2]; + float sum_sq = op::common_cpu::reduce_op::sumSquared( + input_ptr, + info.normalized_size, + info.input_strides[2] + ); + float var = sum_sq / (info.normalized_size) - mean * mean; + float std_deviation = std::sqrt(var + info.eps); + *std_ptr = utils::cast(std_deviation); + + for(size_t d = 0; d < info.normalized_size; d ++) + { + float x_standard = (utils::cast(*(input_ptr + d * info.input_strides[2])) - mean) / std_deviation; + *(standard_ptr + d * info.input_standardization_strides[2]) = utils::cast(x_standard); + *(output_ptr + d * info.output_strides[2]) = utils::cast( + x_standard * utils::cast(*(weight + d * info.weight_strides[0])) + \ + (info.bias_exist ? utils::cast(*(bias + d * info.bias_strides[0])) : float(0)) + ); + } + + + +} +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = LayerNormInfo::createLayerNormInfo( + output_desc, + input_standardization_desc, + input_std_deviation_desc, + input_desc, + weight_desc, + bias_desc, + eps + ); + CHECK_RESULT(result); + const LayerNormInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_LAYER_NORM(TDATA) \ + CHECK_STATUS(calculate_layer_norm(_info, \ +(TDATA *)output, (TDATA *)input_standardization, (TDATA *)input_std_deviation, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias)) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream +) const { + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_LAYER_NORM(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_LAYER_NORM(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_LAYER_NORM(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.h b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.h new file 
mode 100644 index 000000000..710373a4b --- /dev/null +++ b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LAYER_NORM_CPU_H__ +#define __LAYER_NORM_CPU_H__ + +#include "../layer_norm.h" + +DESCRIPTOR(cpu) + + +#endif // __LAYER_NORM_CPU_H__ diff --git a/src/infiniop/ops/layer_norm/cuda/kernel.cuh b/src/infiniop/ops/layer_norm/cuda/kernel.cuh new file mode 100644 index 000000000..6a2ba964a --- /dev/null +++ b/src/infiniop/ops/layer_norm/cuda/kernel.cuh @@ -0,0 +1,49 @@ +#ifndef __LAYER_NORM_KERNEL_CUH__ +#define __LAYER_NORM_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void layerNormKernel( + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + float eps, + size_t normalized_size, + const ptrdiff_t* output_strides, + const ptrdiff_t* input_standardization_strides, + const ptrdiff_t* input_std_deviation_strides, + const ptrdiff_t* input_strides, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + bool bias_exist +) { + size_t b0 = blockIdx.x, b1 = blockIdx.y; + + auto output_ptr = output + b0 * output_strides[0] + b1 * output_strides[1]; + auto input_ptr = input + b0 * input_strides[0] + b1 * input_strides[1]; + auto standard_ptr = input_standardization + b0 * input_standardization_strides[0] + b1 * input_standardization_strides[1]; + auto std_ptr = input_std_deviation + b0 * input_std_deviation_strides[0] + b1 * input_std_deviation_strides[1]; + Tcompute mean = op::common_cuda::reduce_op::sum( + input_ptr, + normalized_size + ) / normalized_size; + Tcompute sum_squared = op::common_cuda::reduce_op::sumSquared( + input_ptr, + normalized_size + ); + + Tcompute var = sum_squared / normalized_size - mean * mean; + Tcompute std_deviation = sqrtf(var + Tcompute(eps)); + *std_ptr = std_deviation; + + for (size_t d = 0; d < normalized_size; d ++) { + Tcompute x_standard = (Tcompute(input_ptr[d]) - mean) / std_deviation; + standard_ptr[d] = x_standard; + output_ptr[d] = x_standard * Tcompute(*(weight + d * weight_stride)) + (bias_exist ? 
Tcompute(*(bias + d * bias_stride)) : Tcompute(0)); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __LAYER_NORM_KERNEL_CUH__ diff --git a/src/infiniop/ops/layer_norm/info.h b/src/infiniop/ops/layer_norm/info.h new file mode 100644 index 000000000..9b6c01692 --- /dev/null +++ b/src/infiniop/ops/layer_norm/info.h @@ -0,0 +1,84 @@ +#ifndef __LAYER_NORM_INFO_H__ +#define __LAYER_NORM_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::layer_norm { + +class LayerNormInfo { +private: + LayerNormInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector input_shape; + size_t normalized_size; + std::vector output_strides; + std::vector input_standardization_strides; + std::vector input_std_deviation_strides; + std::vector input_strides; + std::vector weight_strides; + std::vector bias_strides; + float eps; + bool bias_exist; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createLayerNormInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE( + output_desc->shape(), input_desc->shape(), input_standardization_desc->shape() + ); + size_t batch_size = input_desc->dim(0), + channel_size = input_desc->dim(1), + feature_size = input_desc->dim(2); + + bool bias_exist = bias_desc != nullptr; + CHECK_OR_RETURN( + (!bias_exist) || (bias_desc->ndim() == 1 && bias_desc->dim(0) == feature_size), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + CHECK_OR_RETURN( + (weight_desc->ndim() == 1) && (weight_desc->dim(0) == feature_size), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + CHECK_OR_RETURN( + input_std_deviation_desc->ndim() == 2 && \ + input_std_deviation_desc->dim(0) == batch_size && \ + input_std_deviation_desc->dim(1) == channel_size, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(LayerNormInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + input_desc->ndim(), + input_desc->shape(), + input_desc->dim(input_desc->ndim() - 1), + output_desc->strides(), + input_standardization_desc->strides(), + input_std_deviation_desc->strides(), + input_desc->strides(), + weight_desc->strides(), + bias_exist ? 
bias_desc->strides() : std::vector(), + eps, + bias_exist +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __LAYER_NORM_INFO_H__ diff --git a/src/infiniop/ops/layer_norm/layer_norm.h b/src/infiniop/ops/layer_norm/layer_norm.h new file mode 100644 index 000000000..8b847a80d --- /dev/null +++ b/src/infiniop/ops/layer_norm/layer_norm.h @@ -0,0 +1,55 @@ +#ifndef __LAYER_NORM_H__ +#define __LAYER_NORM_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::layer_norm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + LayerNormInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + LayerNormInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_standardization_desc, \ + infiniopTensorDescriptor_t input_std_deviation_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + float eps \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + void * input_standardization, \ + void * input_std_deviation, \ + const void * input, \ + const void * weight, \ + const void * bias, \ + void *stream \ + ) const; \ + }; \ + } + +#endif diff --git a/src/infiniop/ops/layer_norm/metax/layer_norm_metax.h b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.h new file mode 100644 index 000000000..6634031a3 --- /dev/null +++ b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.h @@ -0,0 +1,8 @@ +#ifndef __LAYER_NORM_METAX_H__ +#define __LAYER_NORM_METAX_H__ + +#include "../layer_norm.h" + +DESCRIPTOR(metax) + +#endif // __LAYER_NORM_METAX_H__ diff --git a/src/infiniop/ops/layer_norm/metax/layer_norm_metax.maca b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.maca new file mode 100644 index 000000000..ee6080999 --- /dev/null +++ b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.maca @@ -0,0 +1,182 @@ +#include "../../../devices/metax/metax_common.h" +#include "layer_norm_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::layer_norm::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + float eps, + size_t normalized_size, + const ptrdiff_t* output_strides, + const ptrdiff_t* input_standardization_strides, + const ptrdiff_t* input_std_deviation_strides, + const ptrdiff_t* input_strides, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + bool bias_exist +) { + layerNormKernel( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + eps, + normalized_size, + output_strides, + input_standardization_strides, + input_std_deviation_strides, + 
input_strides, + weight_stride, + bias_stride, + bias_exist + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_layer_norm( + const LayerNormInfo &info, + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + hcStream_t stream, + void *workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * input_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * input_standardization_strides_cuda = output_strides_cuda + ndim; + ptrdiff_t * input_std_deviation_strides_cuda = input_standardization_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_standardization_strides_cuda, info.input_standardization_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_std_deviation_strides_cuda, info.input_std_deviation_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), hcMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata, float><<>>( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + info.eps, + info.normalized_size, + output_strides_cuda, + input_standardization_strides_cuda, + input_std_deviation_strides_cuda, + input_strides_cuda, + info.weight_strides[0], + info.bias_exist ? info.bias_strides[0] : 0, + info.bias_exist + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = LayerNormInfo::createLayerNormInfo( + output_desc, + input_standardization_desc, + input_std_deviation_desc, + input_desc, + weight_desc, + bias_desc, + eps + ); + CHECK_RESULT(result); + const LayerNormInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) + 
return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_LAYER_NORM(BLOCK_SIZE, TDATA) \ + calculate_layer_norm(_info, (TDATA *)output, (TDATA *)input_standardization, (TDATA *)input_std_deviation, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream, workspace) + #define CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK + #undef CALCULATE_LAYER_NORM +} +} // namespace op::layer_norm::metax diff --git a/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu new file mode 100644 index 000000000..5d512423d --- /dev/null +++ b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu @@ -0,0 +1,184 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "layer_norm_nvidia.cuh" +#include "../info.h" + +namespace op::layer_norm::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + float eps, + size_t normalized_size, + const ptrdiff_t* output_strides, + const ptrdiff_t* input_standardization_strides, + const ptrdiff_t* input_std_deviation_strides, + const ptrdiff_t* input_strides, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + bool bias_exist +) { + layerNormKernel( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + eps, + normalized_size, + output_strides, + input_standardization_strides, + input_std_deviation_strides, + input_strides, + weight_stride, + bias_stride, + bias_exist + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_layer_norm( + const LayerNormInfo &info, + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + cudaStream_t stream, + void *workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * input_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * input_standardization_strides_cuda = output_strides_cuda + ndim; + ptrdiff_t * input_std_deviation_strides_cuda = input_standardization_strides_cuda + ndim; + + 
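// Aside (not part of the patch): the pointer arithmetic above carves the caller-provided
// workspace into four back-to-back ptrdiff_t arrays -- input, output, input_standardization
// and input_std_deviation strides -- so this layout spans 4 * ndim ptrdiff_t values. A hedged
// host-side sanity check, using a hypothetical `required` local, could look like:
//
//   const size_t required = 4 * info.ndim * sizeof(ptrdiff_t);
//   if (workspace_size < required) {
//       return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
//   }
//
// The async copies below stage the host-side stride vectors into that device buffer on the
// same stream that later runs the kernel, so stream ordering alone guarantees the strides are
// resident before the launch.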
CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_standardization_strides_cuda, info.input_standardization_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_std_deviation_strides_cuda, info.input_std_deviation_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata, float><<>>( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + info.eps, + info.normalized_size, + output_strides_cuda, + input_standardization_strides_cuda, + input_std_deviation_strides_cuda, + input_strides_cuda, + info.weight_strides[0], + info.bias_exist ? info.bias_strides[0] : 0, + info.bias_exist + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = output_desc->ndim() * sizeof(size_t) * 5; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = LayerNormInfo::createLayerNormInfo( + output_desc, + input_standardization_desc, + input_std_deviation_desc, + input_desc, + weight_desc, + bias_desc, + eps + ); + CHECK_RESULT(result); + const LayerNormInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_LAYER_NORM(BLOCK_SIZE, TDATA) \ + calculate_layer_norm(_info, (TDATA *)output, (TDATA *)input_standardization, (TDATA *)input_std_deviation, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream, workspace) + #define CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + 
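// Aside (not part of the patch): the two macros above only pair a compile-time block size with
// a concrete element type. On a device reporting 1024 threads per block and an F16 descriptor,
// CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) presumably expands to roughly:
//
//   return calculate_layer_norm<CUDA_BLOCK_SIZE_1024, half>(
//       _info, (half *)output, (half *)input_standardization, (half *)input_std_deviation,
//       (const half *)input, (const half *)weight, (const half *)bias, stream, workspace);
//
// so the dispatch below picks exactly one fully-typed instantiation per (block size, dtype)
// pair; unsupported block sizes fall through to DEVICE_ARCHITECTURE_NOT_SUPPORTED.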
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::layer_norm::nvidia diff --git a/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cuh b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cuh new file mode 100644 index 000000000..bb770d19a --- /dev/null +++ b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __LAYER_NORM_NVIDIA_API_H__ +#define __LAYER_NORM_NVIDIA_API_H__ +#include "../layer_norm.h" + +DESCRIPTOR(nvidia) + +#endif // __LAYER_NORM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/layer_norm/operator.cc b/src/infiniop/ops/layer_norm/operator.cc new file mode 100644 index 000000000..846c9ce7f --- /dev/null +++ b/src/infiniop/ops/layer_norm/operator.cc @@ -0,0 +1,166 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/layer_norm.h" + +#ifdef ENABLE_CPU_API +#include "cpu/layer_norm_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/layer_norm_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/layer_norm_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLayerNormDescriptor( + infiniopHandle_t handle, + infiniopLayerNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::layer_norm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_standardization_desc, \ + input_std_deviation_desc, \ + input_desc, \ + weight_desc, \ + bias_desc, \ + eps \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLayerNorm( + infiniopLayerNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const 
void * input, + const void * weight, + const void * bias, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input_standardization, \ + input_std_deviation, \ + input, \ + weight, \ + bias, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..c81f0a539 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_and_cpu.h" +#include "infinicore.h" + +namespace op::logical_and::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return 
_device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..701960bd5 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, cpu) + +namespace op::logical_and::cpu { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a && f_b; + } else { + return a && b; + } + } +} LogicalAndOp; +} // namespace op::logical_and::cpu + +#endif // __LOGICAL_AND_CPU_H__ diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..0b763d951 --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +namespace op::logical_and::cuda { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cuda + +#endif // __LOGICAL_AND_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.h b/src/infiniop/ops/logical_and/metax/logical_and_metax.h new file mode 100644 index 000000000..696697322 --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_METAX_API_H__ +#define __LOGICAL_AND_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, metax) + +#endif // __LOGICAL_AND_METAX_API_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.maca b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca new file mode 100644 index 000000000..68e3f0ddb --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca @@ -0,0 +1,73 @@ +#include "logical_and_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_and::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX 
elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::metax diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..7f0680a57 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_nvidia.cuh" + +namespace op::logical_and::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return 
_device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..9d68754bf --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif // __LOGICAL_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..ccc168a90 --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_and.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_and_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_and_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalAndDescriptor( + infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_and::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalAnd( + infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..1324c98f1 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_or_cpu.h" +#include "infinicore.h" + +namespace op::logical_or::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, 
inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..7c26c4d37 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, cpu) + +namespace op::logical_or::cpu { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a || f_b; + } else { + return a || b; + } + } +} LogicalOrOp; +} // namespace op::logical_or::cpu + +#endif // __LOGICAL_OR_CPU_H__ diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..3c705428e --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +namespace op::logical_or::cuda { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.h b/src/infiniop/ops/logical_or/metax/logical_or_metax.h new file mode 100644 index 000000000..e530d9ed5 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_METAX_API_H__ +#define __LOGICAL_OR_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, metax) + +#endif // __LOGICAL_OR_METAX_API_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.maca b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca new file mode 100644 index 000000000..17d1f8ed0 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca @@ -0,0 +1,73 @@ +#include "logical_or_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_or::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + 
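// Aside (not part of the patch): only a_desc's dtype is inspected below; b is assumed to share
// it, and the LogicalOrOp functor rejects mismatched element types at run time. A hedged usage
// sketch of the public C API wired up in operator.cc (handle, descriptors and device buffers
// assumed to already exist):
//
//   infiniopLogicalOrDescriptor_t desc;
//   infiniopCreateLogicalOrDescriptor(handle, &desc, c_desc, a_desc, b_desc);
//   size_t ws_size = 0;
//   infiniopGetLogicalOrWorkspaceSize(desc, &ws_size);
//   // ... allocate ws_size bytes of device memory as `workspace` ...
//   infiniopLogicalOr(desc, workspace, ws_size, c, a, b, stream);
//   infiniopDestroyLogicalOrDescriptor(desc);
//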
const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::metax diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..151079f07 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_nvidia.cuh" + +namespace op::logical_or::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise 
descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..a70bd8da7 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif // __LOGICAL_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..463d7fc1b --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_or.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_or_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_or_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalOrDescriptor( + infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_or::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalOr( + infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc new file mode 100644 index 000000000..5c729e7e8 --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc @@ -0,0 +1,322 @@ +#include "maxpool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::maxpool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + // CPU实现通常不需要额外的工作空间 + workspace_size = 0; + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return 
utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 1D最大池化 + template + void maxpool_1d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + + output[output_offset + ow] = max_val; + } + } + } + } + + // 2D最大池化 + template + void maxpool_2d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + + output[output_offset + oh * output_width + ow] = max_val; + } + } + } + } + } + + // 3D最大池化 + template + void maxpool_3d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t 
kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kd * input_height * input_width + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + } + + output[output_offset + od * output_height * output_width + oh * output_width + ow] = max_val; + } + } + } + } + } + } + + // 主要的最大池化计算函数 + template + void maxpool_cpu(T *output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_1d(output, input); + break; + case 2: + maxpool_2d(output, input); + break; + case 3: + maxpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + 
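// Aside (not part of the patch): the bf16 case reuses the same templated maxpool_cpu kernel as
// the f32/f16 cases; element comparisons funnel through is_greater(), which for the
// half-precision types presumably widens to float first, i.e. something along the lines of
//
//   utils::cast<float>(a) > utils::cast<float>(b)
//
// so bf16/f16 inputs are compared with ordinary float semantics and no half-precision
// arithmetic is required on the CPU path.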
maxpool_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::maxpool::cpu diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h new file mode 100644 index 000000000..f3ecd349d --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CPU_H__ +#define __MAX_POOL_CPU_H__ + +#include "../maxpool.h" + +DESCRIPTOR(cpu) + +#endif // __MAX_POOL_CPU_H__ diff --git a/src/infiniop/ops/maxpool/info.h b/src/infiniop/ops/maxpool/info.h new file mode 100644 index 000000000..ff56fe28c --- /dev/null +++ b/src/infiniop/ops/maxpool/info.h @@ -0,0 +1,113 @@ +#ifndef __MAX_POOL_INFO_H__ +#define __MAX_POOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::maxpool { + +inline utils::Result calculateMaxPoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + // 理论最大输出数 + size_t max_output = 0; + if (ceil_mode) { + max_output = (input_size + 2 * padding - kernel_size + stride - 1) / stride + 1; + } else { + max_output = (input_size + 2 * padding - kernel_size) / stride + 1; + } + + size_t valid_output = 0; + for (size_t i = 0; i < max_output; ++i) { + int64_t start = static_cast(i) * stride - padding; + int64_t end = start + kernel_size; + // 判断区间 [start, end) 和 [0, input_size) 是否有交集 + int64_t real_start = std::max(start, int64_t(0)); + int64_t real_end = std::min(end, int64_t(input_size)); + if (real_end > real_start) { + ++valid_output; + } + } + return utils::Result(valid_output); +} + +class MaxPoolInfo { + MaxPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolInfo info; + + if (input_desc->ndim() < 3 || 
input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Get spatial dimensions + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + auto output_size = calculateMaxPoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size); + size_t expected_size = output_size.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::maxpool + +#endif // __MAX_POOL_INFO_H__ diff --git a/src/infiniop/ops/maxpool/maxpool.h b/src/infiniop/ops/maxpool/maxpool.h new file mode 100644 index 000000000..5ee7703c5 --- /dev/null +++ b/src/infiniop/ops/maxpool/maxpool.h @@ -0,0 +1,53 @@ +#ifndef __MAX_POOL_H__ +#define __MAX_POOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAX_POOL_H__ diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.cc b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc new file mode 100644 index 000000000..b70286abd --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc @@ -0,0 +1,217 @@ +#include "maxpool_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, 
hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // Create the input/output tensor descriptors + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + // Build the input/output dimensions (NCHW layout) + std::vector input_dims = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + + // Pad with a dummy dimension for 1D pooling + if (info.ndim == 1) { + input_dims.push_back(1); + output_dims.push_back(1); + } + + // Compute the input/output tensor strides + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + // Set the tensor descriptors (NCHW layout) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + output_desc, hcdnn_data_type, output_dims.size(), output_dims.data(), + output_strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // Build the pooling parameters + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + + // Pad with a dummy dimension for 1D pooling + if (info.ndim == 1) { + kernel_size.push_back(1); + strides.push_back(1); + pads.push_back(0); + } + + // Set the max pooling descriptor (deterministic mode) + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_desc, HCDNN_POOLING_MAX_DETERMINISTIC, // deterministic max pooling + HCDNN_NOT_PROPAGATE_NAN, // do not propagate NaN + kernel_size.size(), + kernel_size.data(), + pads.data(), + strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically needs no workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), output_desc(other.output_desc), pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, 
infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + + // Run the max pooling forward pass + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::metax diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.h b/src/infiniop/ops/maxpool/metax/maxpool_metax.h new file mode 100644 index 000000000..5051358de --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_METAX_H__ +#define __MAX_POOL_METAX_H__ + +#include "../maxpool.h" + +DESCRIPTOR(metax) + +#endif // __MAX_POOL_METAX_H__ diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu new file mode 100644 index 000000000..8b94a29c1 --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu @@ -0,0 +1,240 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "maxpool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : 
internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically doesn't need workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef 
ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::nvidia diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh new file mode 100644 index 000000000..539ad5a1a --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CUDA_CUH__ +#define __MAX_POOL_CUDA_CUH__ + +#include "../maxpool.h" + +DESCRIPTOR(nvidia) + +#endif // __MAX_POOL_CUDA_CUH__ diff --git a/src/infiniop/ops/maxpool/operator.cc b/src/infiniop/ops/maxpool/operator.cc new file mode 100644 index 000000000..aedfc0585 --- /dev/null +++ b/src/infiniop/ops/maxpool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/maxpool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/maxpool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/maxpool_metax.h" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) 
{ + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::maxpool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize( + infiniopMaxPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopMaxPool( + infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc new file mode 100644 index 000000000..55a340226 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_max_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_max::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t reduce_max(const ReduceMaxInfo 
*info, T *y, const T *x) { + const size_t cols = info->shape[3]; // 规约维度 + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + const ptrdiff_t batch_size = static_cast(info->shape[0]); + const ptrdiff_t channels = static_cast(info->shape[1]); + const ptrdiff_t rows = static_cast(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(max); + } else { + *output_ptr = max; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(max); + } else { + *output_ptr = max; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_max(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_max(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_max(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_max::cpu diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h new file mode 100644 index 000000000..bf6be9b74 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MAX_CPU_H__ +#define __REDUCE_MAX_CPU_H__ +#include "../reduce_max.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h new file mode 100644 index 000000000..99bb5be19 --- /dev/null +++ b/src/infiniop/ops/reduce_max/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MAX_INFO_H__ +#define __REDUCE_MAX_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::reduce_max { + +class ReduceMaxInfo { + ReduceMaxInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector shape; + std::vector y_strides; 
+ std::vector x_strides; + + static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector shape = {1, 1, 1, 1}; + std::vector y_strides = {0, 0, 0, 0}; + std::vector x_strides = {0, 0, 0, 0}; + return utils::Result(ReduceMaxInfo{ + dtype, shape, y_strides, x_strides}); + } else { + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result(ReduceMaxInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_max + +#endif // __REDUCE_MAX_INFO_H__ diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h new file mode 100644 index 000000000..735bc8da4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_METAX_H__ +#define __REDUCE_MAX_METAX_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca new file mode 100644 index 000000000..ab120fea5 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_max_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, 
ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax + <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::metax diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh new file mode 100644 index 000000000..fec2f9341 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh @@ -0,0 +1,21 @@ +#ifndef __REDUCE_MAX_KERNEL_CUH__ +#define __REDUCE_MAX_KERNEL_CUH__ + +template +__device__ void ReduceMaxKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Find the max of each updated row and store in shared memory + Tcompute max_0 = op::common_cuda::reduce_op::max(x, width, x_stride_w); + if (threadIdx.x == 0) { + *y = max_0; + } +} + +#endif // __REDUCE_MAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu new file mode 100644 index 000000000..f64f596f4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_max_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template 
+INFINIOP_CUDA_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::nvidia 
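For reference, the ReduceMax entry points added in this patch follow the same create/get-workspace/run/destroy sequence as the other descriptor-based operators. The sketch below is illustrative only and not part of the patch: it assumes a valid infiniopHandle_t, tensor descriptors for a 4-D input and its reduced output, device buffers, and a stream have already been created elsewhere, and the helper name reduce_max_last_dim is hypothetical.

#include "infiniop.h"

// Hypothetical helper: reduce x along its last axis (dim = 3) into y.
// Handle, descriptors, device buffers, and stream are assumed to be set up by the caller.
infiniStatus_t reduce_max_last_dim(infiniopHandle_t handle,
                                   infiniopTensorDescriptor_t y_desc,
                                   infiniopTensorDescriptor_t x_desc,
                                   void *y, const void *x, void *stream) {
    infiniopReduceMaxDescriptor_t desc = nullptr;
    // ReduceMaxInfo::create canonicalizes both tensors to 4-D and swaps the
    // reduced axis to the innermost position, so any valid dim works here.
    infiniStatus_t status = infiniopCreateReduceMaxDescriptor(handle, &desc, y_desc, x_desc, 3);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetReduceMaxWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS) {
        // All backends in this patch report a zero-sized workspace, so no buffer is allocated.
        status = infiniopReduceMax(desc, nullptr, workspace_size, y, x, stream);
    }

    infiniopDestroyReduceMaxDescriptor(desc);
    return status;
}

The descriptor can be reused for repeated launches on tensors of the same shape; the opaque backend state is released only by infiniopDestroyReduceMaxDescriptor.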
diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh new file mode 100644 index 000000000..388738c27 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_NVIDIA_H__ +#define __REDUCE_MAX_NVIDIA_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc new file mode 100644 index 000000000..ad7b33393 --- /dev/null +++ b/src/infiniop/ops/reduce_max/operator.cc @@ -0,0 +1,181 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_max_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_max_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); + // // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); + // } + // #endif + } + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, output, input, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); + // // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h new file mode 100644 index 000000000..c6edd7365 --- /dev/null +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_max::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMaxInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMaxInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( 
\ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MAX_H diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc new file mode 100644 index 000000000..7853afdbb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_mean_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_mean::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) { + const size_t cols = info->shape[3]; // 规约维度 + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + const ptrdiff_t batch_size = static_cast(info->shape[0]); + const ptrdiff_t channels = static_cast(info->shape[1]); + const ptrdiff_t rows = static_cast(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(mean); + } else { + *output_ptr = mean; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(mean); + } else { + *output_ptr = mean; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_mean(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_mean(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == 
INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_mean(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_mean::cpu diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h new file mode 100644 index 000000000..f67601c62 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MEAN_CPU_H__ +#define __REDUCE_MEAN_CPU_H__ +#include "../reduce_mean.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h new file mode 100644 index 000000000..6c11e07d3 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MEAN_INFO_H__ +#define __REDUCE_MEAN_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::reduce_mean { + +class ReduceMeanInfo { + ReduceMeanInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector shape; + std::vector y_strides; + std::vector x_strides; + + static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector shape = {1, 1, 1, 1}; + std::vector y_strides = {0, 0, 0, 0}; + std::vector x_strides = {0, 0, 0, 0}; + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } else { + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_mean + +#endif // __REDUCE_MEAN_INFO_H__ diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h new file mode 100644 index 000000000..a105724d6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_METAX_H__ +#define __REDUCE_MEAN_METAX_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca new file mode 100644 index 000000000..0eaf1c9cb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_mean_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, 
size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean + <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::metax diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh new file mode 100644 index 000000000..e70748605 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __REDUCE_MEAN_KERNEL_CUH__ +#define __REDUCE_MEAN_KERNEL_CUH__ + +template +__device__ void ReduceMeanKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, 
size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Find the sum of each updated row and store in shared memory + Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w); + if (threadIdx.x == 0) { + // mean_ = sum_0/width; + *y = sum_0 / width; + } + // __syncthreads(); + + // [Elementwise] Divide each element by the sum and store in shared memory + // *y = mean_; +} + +#endif // __REDUCE_MEAN_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu new file mode 100644 index 000000000..bfc26e23d --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_mean_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template +INFINIOP_CUDA_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, 
+ void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::nvidia diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh new file mode 100644 index 000000000..be16b4491 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_NVIDIA_H__ +#define __REDUCE_MEAN_NVIDIA_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc new file mode 100644 index 000000000..7c88e4e4d --- /dev/null +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -0,0 +1,182 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_mean.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_mean_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_mean_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_mean_metax.h" +#endif +// #ifdef ENABLE_ASCEND_API +// #include "ascend/reduce_mean_ascend.h" +// #endif + +__C infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_mean::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMeanDescriptor((BangHandle_t)handle, (ReduceMeanBangDescriptor_t *)desc_ptr, y_desc); + // // return cnnlCreateReduceMeanDescriptor((BangHandle_t) handle, (ReduceMeanCnnlDescriptor_t *) desc_ptr, y_desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return 
musaCreateReduceMeanDescriptor((MusaHandle_t)handle, (ReduceMeanMusaDescriptor_t *)desc_ptr, y_desc); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMeanWorkspaceSize((ReduceMeanBangDescriptor_t)desc, size); + // // return cnnlGetReduceMeanWorkspaceSize((ReduceMeanCnnlDescriptor_t) desc, size); + // } + + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMeanWorkspaceSize((ReduceMeanMusaDescriptor_t)desc, size); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMean((ReduceMeanBangDescriptor_t)desc, workspace, workspace_size, data, stream); + // // return cnnlReduceMean((ReduceMeanCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMean((ReduceMeanMusaDescriptor_t)desc, workspace, workspace_size, data, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMeanDescriptor((ReduceMeanBangDescriptor_t)desc); + // // return cnnlDestroyReduceMeanDescriptor((ReduceMeanCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMeanDescriptor((ReduceMeanMusaDescriptor_t)desc); + // 
#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h new file mode 100644 index 000000000..bf2e2dda0 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/reduce_mean.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MEAN_H +#define REDUCE_MEAN_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_mean::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMeanInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMeanInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MEAN_H diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc new file mode 100644 index 000000000..c47a38ef9 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -0,0 +1,108 @@ +#include "scatter_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::scatter::cpu { + +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < info.ndim; d ++) + { + if (d == info.dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + int scatter_dim = int(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for (size_t n = 0; n < batch_size; n ++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = n; + for(int d = info.ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + input_ptr += dim_index * element_size * info.input_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + } + for (size_t c = 0; c < info.index_shape[scatter_dim]; c ++) { + int64_t scatter_number = *(index_ptr + c * info.index_strides[scatter_dim]); + memcpy( + output_ptr + scatter_number * element_size * info.output_strides[scatter_dim], + input_ptr + c * element_size * info.input_strides[scatter_dim], + element_size + ); + } + } + +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_scatter(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h new file mode 100644 index 000000000..bf2fcb7a1 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SCATTER_CPU_H__ +#define __SCATTER_CPU_H__ + +#include "../scatter.h" + +DESCRIPTOR(cpu) + + +#endif // __SCATTER_CPU_H__ diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh new file mode 100644 index 000000000..6c4de6ce5 --- /dev/null +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __SCATTER_KERNEL_CUH__ +#define __SCATTER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void scatterKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for(int d = ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_scatter_size; c += BLOCK_SIZE) { + int64_t scatter_number = *(index_ptr + c * index_strides[scatter_dim]); + *(output_ptr + scatter_number * output_strides[scatter_dim]) = \ + *(input_ptr + c * input_strides[scatter_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __SCATTER_KERNEL_CUH__ diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h new file mode 100644 index 000000000..9f21e435c --- /dev/null +++ b/src/infiniop/ops/scatter/info.h @@ -0,0 +1,67 @@ +#ifndef __SCATTER_INFO_H__ +#define __SCATTER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::scatter { + +class ScatterInfo { +private: + ScatterInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + std::vector 
input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createScatterInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == output_desc->ndim() && output_desc->ndim() == index_desc->ndim(), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++){ + if(d != dim) { + CHECK_OR_RETURN( + index_desc->dim(d) <= input_desc->dim(d) && index_desc->dim(d) <= output_desc->dim(d), + INFINI_STATUS_BAD_TENSOR_SHAPE; + ); + } + } + CHECK_OR_RETURN(index_desc->dim(dim) <= input_desc->dim(dim), INFINI_STATUS_BAD_TENSOR_SHAPE); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(ScatterInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __SCATTER_INFO_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h new file mode 100644 index 000000000..d5ce0ef16 --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_METAX_H__ +#define __SCATTER_METAX_H__ + +#include "../scatter.h" + +DESCRIPTOR(metax) + +#endif // __SCATTER_METAX_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca new file mode 100644 index 000000000..1c742f60d --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "scatter_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d 
++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_SCATTER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case 
INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_SCATTER_WITH_METAX_BLOCK + #undef CALCULATE_SCATTER +} +} // namespace op::scatter::metax diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu new file mode 100644 index 000000000..7d6e1a1a1 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -0,0 +1,190 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "scatter_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + 
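// The workspace holds four back-to-back ndim-length ptrdiff_t arrays -- contiguous,
+    // input, output and index strides -- each staged host-to-device with an async copy
+    // on the caller's stream before the kernel launch.
+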
CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_SCATTER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return 
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_SCATTER_WITH_BLOCK_SIZE + #undef CALCULATE_SCATTER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::scatter::nvidia diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh new file mode 100644 index 000000000..a199edb6e --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __SCATTER_NVIDIA_API_H__ +#define __SCATTER_NVIDIA_API_H__ +#include "../scatter.h" + +DESCRIPTOR(nvidia) + +#endif // __SCATTER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc new file mode 100644 index 000000000..7c7de71b5 --- /dev/null +++ b/src/infiniop/ops/scatter/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scatter.h" + +#ifdef ENABLE_CPU_API +#include "cpu/scatter_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/scatter_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/scatter_metax.h" +#endif + +__C infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopScatter( + infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t 
+infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h new file mode 100644 index 000000000..7d2deefc1 --- /dev/null +++ b/src/infiniop/ops/scatter/scatter.h @@ -0,0 +1,49 @@ +#ifndef __SCATTER_H__ +#define __SCATTER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + ScatterInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..88ba6cdd6 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,52 @@ +#include "sin_cpu.h" + +namespace op::sin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + 
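// F16, F32, F64 and BF16 all route through the same elementwise SinOp path above;
+        // any other dtype falls through to the bad-dtype status below.
+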
default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::cpu diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..80e406f98 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,21 @@ +#ifndef __SIN_CPU_H__ +#define __SIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(sin, cpu) + +namespace op::sin::cpu { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::sin(input); + } +} SinOp; +} // namespace op::sin::cpu + +#endif // __SIN_CPU_H__ diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..30641366c --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __SIN_CUDA_H__ +#define __SIN_CUDA_H__ + +#include +#include +#include + +namespace op::sin::cuda { +typedef struct SinOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(sinf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__sinf(inputf)); + } else if constexpr (std::is_same_v) { + return sinf(input); + } else if constexpr (std::is_same_v) { + return std::sin(input); + } else { + return std::sin(input); + } + } +} SinOp; +} // namespace op::sin::cuda + +#endif // __SIN_CUDA_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h new file mode 100644 index 000000000..5b272d4d9 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIN_METAX_API_H__ +#define __SIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sin, metax) + +#endif // __SIN_METAX_API_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca new file mode 100644 index 000000000..5ea69e139 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.maca @@ -0,0 +1,60 @@ +#include "sin_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return 
INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..6fbf952bc --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..31f5b48ef --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_CUDA_API_H__ +#define __SIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_CUDA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..978561a04 --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 
+1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/softplus/operator.cc b/src/infiniop/ops/softplus/operator.cc index 2548f7d34..96c71b6f3 100644 --- a/src/infiniop/ops/softplus/operator.cc +++ b/src/infiniop/ops/softplus/operator.cc @@ -8,9 +8,9 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) #include "nvidia/softplus_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/softplus_metax.h" -#endif +// 
#ifdef ENABLE_METAX_API +// #include "metax/softplus_metax.h" +// #endif __C infiniStatus_t infiniopCreateSoftplusDescriptor( infiniopHandle_t handle, diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..23a92ed65 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..73fd7c1b6 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,21 @@ +#ifndef __TANH_CPU_H__ +#define __TANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace op::tanh::cpu { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::tanh(input); + } +} TanhOp; +} // namespace op::tanh::cpu + +#endif // __TANH_CPU_H__ diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh new file mode 100644 index 000000000..62979a20e --- /dev/null +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -0,0 +1,46 @@ +#ifndef __TANH_CUDA_H__ +#define __TANH_CUDA_H__ + +#include +#include +#include + +namespace op::tanh::cuda { +typedef struct TanhOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = tanh_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = tanh_f32_func(f0); + float r1 = tanh_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else 
if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = tanh_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return tanh_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::tanh(input); + } else { + return std::tanh(input); + } + } +} TanhOp; +} // namespace op::tanh::cuda + +#endif // __TANH_CUDA_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h new file mode 100644 index 000000000..8432a7f0d --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANH_METAX_API_H__ +#define __TANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanh, metax) + +#endif // __TANH_METAX_API_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca new file mode 100644 index 000000000..0a01554c4 --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca @@ -0,0 +1,60 @@ +#include "tanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::tanh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..a2c36551c --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); 
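+    // Tanh is a unary elementwise op: a single input descriptor is expected, and its shape
+    // must match the output shape checked just below, with the same dtype set as the CPU backend.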
+ const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..cb37b2528 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANH_CUDA_API_H__ +#define __TANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) + +#endif // __TANH_CUDA_API_H__ diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc new file mode 100644 index 000000000..d34d97df6 --- /dev/null +++ b/src/infiniop/ops/tanh/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanh_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTanhDescriptor( + infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanh( + infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..de7e86e3e --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,84 @@ +#include "where_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, 
inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..3d86cb4f7 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,19 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? a : b; + } +} WhereOp; +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh new file mode 100644 index 000000000..8eb5c762b --- /dev/null +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __WHERE_CUDA_H__ +#define __WHERE_CUDA_H__ + +namespace op::where::cuda { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? 
a : b; + } +} WhereOp; +} // namespace op::where::cuda + +#endif // __WHERE_CUDA_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h new file mode 100644 index 000000000..43bb1a945 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.h @@ -0,0 +1,8 @@ +#ifndef __WHERE_METAX_API_H__ +#define __WHERE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(where, metax) + +#endif // __WHERE_METAX_API_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca new file mode 100644 index 000000000..b648cfbcc --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -0,0 +1,92 @@ +#include "where_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::where::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + 
return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::metax diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu new file mode 100644 index 000000000..860089bd2 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu @@ -0,0 +1,91 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "where_nvidia.cuh" + +namespace op::where::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return 
_device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::nvidia diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh new file mode 100644 index 000000000..c168364a8 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __WHERE_CUDA_API_H__ +#define __WHERE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(where, nvidia) + +#endif // __WHERE_CUDA_API_H__ diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..d69b1d4e1 --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,148 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/where_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_metax.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t condition_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc, \ + condition_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b, condition}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index a1d2c2501..cce47d459 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -61,6 +61,40 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); } +// Sum(x) on non-contiguous data of length count +template +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tcompute s = 0; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + s += Tcompute(data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Sum(s); +} + +// Max(x) on non-contiguous data of length count +template +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tdata max_ = data_ptr[0]; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + max_ = cub::Max()(max_, data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); +} + } // namespace op::common_cuda::reduce_op #endif diff --git a/src/utils/check.h b/src/utils/check.h index 7f4a2bdd9..76823aa40 100644 --- a/src/utils/check.h +++ b/src/utils/check.h @@ -59,4 +59,22 @@ #define CHECK_SAME_STRIDES(FIRST, ...) 
CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__) +#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ + do { \ + if (INPUT_SHAPE.empty()) { \ + if (!EXPECTED_SHAPE.empty()) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + break; \ + } \ + if (DIM >= INPUT_SHAPE.size()) { \ + return INFINI_STATUS_BAD_PARAM; \ + } \ + std::vector reduced_shape = INPUT_SHAPE; \ + reduced_shape[DIM] = 1; \ + if (reduced_shape != EXPECTED_SHAPE) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + } while (0) + #endif // INFINIUTILS_CHECK_H diff --git a/test/infiniop/averagepool.py b/test/infiniop/averagepool.py new file mode 100644 index 000000000..55d5c37cf --- /dev/null +++ b/test/infiniop/averagepool.py @@ -0,0 +1,239 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import Tuple +import math +from torch.nn import functional as F + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool(input_tensor, kernel_size, stride, padding, ceil_mode, output_tensor): + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + result = F.avg_pool1d( + input_tensor.to(torch.float32), kernel_size[0], stride[0], padding[0], 
ceil_mode=ceil_mode + ) + elif ndim == 2: + result = F.avg_pool2d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + result = F.avg_pool3d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + # 将计算结果转换回原始数据类型 + output_tensor.copy_(result.to(output_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + return ( + math.ceil((input_size + 2 * p - k) / s + 1) + if ceil_mode + else math.floor((input_size + 2 * p - k) / s + 1) + ) + + batch, channel, *spatial = input_shape + output_spatial = [ + calc_output_size(spatial[i], kernel_size[i], stride[i], padding[i], ceil_mode) + for i in range(len(spatial)) + ] + return (batch, channel) + tuple(output_spatial) + + +def tuple_to_void_p(py_tuple: Tuple): + arr = (ctypes.c_uint64 * len(py_tuple))(*py_tuple) + return ctypes.cast(arr, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + output_tensor = TestTensor(output_shape, None, dt=tensor_dtype, device=device) + + print( + f"Testing AvgPool on {InfiniDeviceNames[device]} with input_shape: {input_shape}, kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, output_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_averagepool(): + check_error( + LIBINFINIOP.infiniopAvgPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Mismatch for shape {input_shape}, kernel {kernel_size}" + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lib_averagepool, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + 
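    # Sanity check of the output-size arithmetic used by infer_output_shape
    # above (a minimal sketch, assuming that helper): for an axis of length L
    # with kernel k, stride s and padding p the output length is
    # floor((L + 2*p - k) / s + 1), or the ceil of the same expression when
    # ceil_mode is True; e.g. L=5, k=2, s=2, p=0 gives 2 and 3 respectively.
    assert infer_output_shape((1, 1, 5), (2,), (2,), (0,), False) == (1, 1, 2)
    assert infer_output_shape((1, 1, 5), (2,), (2,), (0,), True) == (1, 1, 3)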
DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/batch_norm.py b/test/infiniop/batch_norm.py new file mode 100644 index 000000000..a7b46858f --- /dev/null +++ b/test/infiniop/batch_norm.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, momentum, eps + ((13, 4, 5,), 0.1, 1e-5), + ((2, 3, 4), 0.1, 1e-4), + ((15, 16, 17,), 0.2, 1e-5), + ((50, 60, 70), 0.1, 1e-4), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# No implement for INPLACE + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_batch_norm( + output: torch.Tensor, + running_mean: torch.Tensor, + running_var: torch.Tensor, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + init_running_mean: torch.Tensor, + init_running_var: torch.Tensor, + momentum: float, + eps: float +): + bn = torch.nn.BatchNorm1d( + num_features=input.shape[1], + eps=eps, + momentum=momentum, + dtype=input.dtype, + ) + bn.weight.data = weight + bn.bias.data = bias + bn.running_mean.data = init_running_mean + bn.running_var.data = init_running_var + output.copy_(bn(input).detach()) + running_mean.copy_(bn.running_mean.data) + running_var.copy_(bn.running_var.data) + + +def test( + handle, + device, + shape, momentum, eps, + inplace, + dtype, + sync=None, +): + running_mean = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + running_var = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + input = TestTensor( + shape, + None, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + shape, + None, + dtype, + device + ) + + weight = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + bias = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + + print( + f"Testing BatchNorm on {InfiniDeviceNames[device]} with shape:{shape}, inplace:{inplace}, momentum:{momentum}, eps:{eps}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + + torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), + running_mean.torch_tensor(), running_var.torch_tensor(), + momentum, eps + ) + + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBatchNormDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + 
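            # Semantics exercised by this test (a sketch, assuming PyTorch's
            # training-mode BatchNorm as in torch_batch_norm above):
            #   running_mean = (1 - momentum) * running_mean + momentum * batch_mean
            #   running_var  = (1 - momentum) * running_var  + momentum * unbiased_batch_var
            # while the output itself is normalized with the biased batch
            # variance plus eps.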
running_mean.descriptor, + running_var.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor, + momentum, + eps + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, running_mean, running_var, input, weight, bias]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBatchNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_batch_norm(): + check_error( + LIBINFINIOP.infiniopBatchNorm( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + running_mean.data(), + running_var.data(), + input.data(), + weight.data(), + bias.data(), + None, + ) + ) + + lib_batch_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + debug(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + debug(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), momentum, eps + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_batch_norm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyBatchNormDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my BatchNorm passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss.py b/test/infiniop/cross_entropy_loss.py new file mode 100644 index 000000000..acc5cadc4 --- /dev/null +++ b/test/infiniop/cross_entropy_loss.py @@ -0,0 +1,213 @@ +import torch +import ctypes +from ctypes import c_uint64 +import numpy as np + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + infiniopOperatorDescriptor_t, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + TestWorkspace, + InfiniDeviceEnum, +) +from torch.nn import functional as F + +_TEST_CASES = [ + # Single sample classification + ((10,), 10), + ((200,), 200), + # 2D: (N, C) - batch classification + ((4, 10), 10), + ((8, 5), 5), + ((16, 100), 100), + ((32, 1000), 1000), + ((64, 21), 21), + ((128, 50), 50), + # 3D: (N, C, d1) - sequence classification + ((4, 10, 5), 10), + # 4D: (N, C, d1, d2) - image segmentation + ((2, 8, 8, 8), 8), + # 5D: (N, C, d1, d2, d3) - 3D segmentation + ((3, 10, 10, 20, 30), 10), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, 
"rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_pytorch(logits, target): + return F.cross_entropy(logits.double(), target.long(), reduction="mean") + + +def test( + handle, + device, + input_shape, + num_classes, + tensor_dtype=InfiniDtype.F32, + sync=None, +): + # 根据输入形状确定logits和target的形状 + if len(input_shape) == 1: + # Shape (C,) - single sample classification + logits_shape = (num_classes,) + target_shape = (1,) # 修改:使用 (1,) 而不是标量 + else: + # Shape (N, C, [d1], [d2], ...) + logits_shape = input_shape + target_shape = (input_shape[0],) + input_shape[2:] + + print( + f"Testing CrossEntropyLoss on {InfiniDeviceNames[device]} with logits_shape: {logits_shape}, target_shape: {target_shape}, dtype:{InfiniDtypeNames[tensor_dtype]}" + ) + + # 创建logits张量 + logits = TestTensor(logits_shape, None, dt=tensor_dtype, device=device) + + # 创建target张量 + target_torch = torch.randint( + 0, + num_classes, + target_shape, + dtype=torch.long, + device=logits.torch_tensor().device, + ) + target = TestTensor.from_torch(target_torch, dt=InfiniDtype.I64, device=device) + + # 创建loss张量 + loss = TestTensor((1,), None, dt=tensor_dtype, device=device) + + # 计算PyTorch参考损失 + if len(input_shape) == 1: + # 对于一维logits,target需要是标量 + target_scalar = target.torch_tensor()[0] + pytorch_loss = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + pytorch_loss = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + + # 将参考结果存储到loss张量 + loss.torch_tensor()[0] = pytorch_loss.to(loss.torch_tensor().dtype) + + if sync: + sync() + + # 创建算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossDescriptor( + handle, + ctypes.byref(descriptor), + loss.descriptor, + logits.descriptor, + target.descriptor, + ) + ) + + # 销毁tensor的描述符以防止内核直接使用 + for tensor in [logits, target, loss]: + tensor.destroy_desc() + + # 获取工作空间大小并创建工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + # PyTorch参考实现函数 + def torch_cross_entropy(): + if len(input_shape) == 1: + target_scalar = target.torch_tensor()[0] + result = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + result = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + loss.torch_tensor()[0] = result.to(loss.torch_tensor().dtype) + + # InfiniOP实现函数 + def lib_cross_entropy(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLoss( + descriptor, + workspace.data(), + workspace_size.value, + loss.data(), + logits.data(), + target.data(), + None, + ) + ) + + # 执行InfiniOP算子 + lib_cross_entropy() + + if sync: + sync() + + # 验证结果 + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + actual_loss = loss.actual_tensor()[0] + expected_loss = loss.torch_tensor()[0] + + if DEBUG: + print(f"Expected loss: {expected_loss.item()}") + print(f"Actual loss: {actual_loss.item()}") + if target_shape: + print( + f"Target shape: {target_shape}, first few targets: {target.torch_tensor().flatten()[:5]}" + ) + else: + print(f"Target (scalar): {target.torch_tensor()[0].item()}") + debug(actual_loss, expected_loss, atol=atol, rtol=rtol) + + if not torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol): + print("--- ERROR ANALYSIS 
---") + print(f"Expected: {expected_loss.item()}, Actual: {actual_loss.item()}") + print(f"Difference: {abs(actual_loss - expected_loss).item()}") + print(f"Tolerance: atol={atol}, rtol={rtol}") + + assert torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol) + + # Profile功能 + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCrossEntropyLossDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAll CrossEntropyLoss tests passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..de37404b3 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,183 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device, bias=1e-6) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == 
Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py new file mode 100644 index 000000000..3b78098dd --- /dev/null +++ b/test/infiniop/equal.py @@ -0,0 +1,201 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, a_stride, b_stride + ((13, 4), None, None), + ((13, 4), (13, 1), (13, 1)), + ((13, 4, 4), (16, 4, 1), (16, 4, 1),), + ((16, 5632), None, None), +] + +class Identical(Enum): + EQUAL = auto() + NOT_EQUAL = auto() + + +_IDENTICAL = [ + Identical.EQUAL, # -> result=true + Identical.NOT_EQUAL, # -> result=false +] + +_TEST_CASES = [ + test_case + (identical_item,) + for test_case in _TEST_CASES_ + for identical_item in _IDENTICAL +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False 
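# Reference semantics for the Equal operator under test (a minimal sketch,
# assuming whole-tensor comparison into a single bool, as in torch_equal below):
# torch.equal(a, b) is True only when the shapes and every element match, e.g.
#   torch.equal(torch.tensor([1, 2]), torch.tensor([1, 2]))  -> True
#   torch.equal(torch.tensor([1, 2]), torch.tensor([1, 3]))  -> False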
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_equal(c, a, b): + return torch.tensor(torch.equal(input=a, other=b), dtype=torch.bool) + + +def test( + handle, + device, + input_shape, + a_strides, + b_strides, + identical, + dtype, + sync=None, +): + torch_dtype = { + InfiniDtype.F16: torch.half, + InfiniDtype.F32: torch.float, + InfiniDtype.BF16: torch.bfloat16, + InfiniDtype.I32: torch.int32, + InfiniDtype.I64: torch.int64 + }[dtype] + + print( + f"Testing equal on {InfiniDeviceNames[device]} with input_shape:{input_shape}," + f"a_stride:{a_strides} b_stride:{b_strides} identical:{identical}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + torch_c = torch.tensor([False], dtype=torch.bool) + c = TestTensor( + [1], + torch_c.stride(), + InfiniDtype.BOOL, + device, + "manual", + set_tensor=torch_c + ) + + torch_a = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + if a_strides is not None: + torch_a.as_strided_(input_shape, a_strides) + a = TestTensor( + input_shape, + torch_a.stride(), + dtype, + device, + "manual", + set_tensor=torch_a + ) + if identical == Identical.EQUAL: + torch_b = torch_a.clone() + else: + torch_b = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + if b_strides is not None: + torch_b.as_strided_(input_shape, b_strides) + + b = TestTensor( + input_shape, + torch_b.stride(), + dtype, + device, + "manual", + set_tensor=torch_b + ) + + + c._torch_tensor = torch_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [c, a, b]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_equal(): + check_error( + LIBINFINIOP.infiniopEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_equal() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor().to(torch.uint8), c.torch_tensor().to(torch.uint8), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_equal( + c.torch_tensor(), a.torch_tensor(), b.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my equal passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + 
check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() 
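    # Note on the zero-stride cases in _TEST_CASES_ above (a sketch): a stride
    # of 0 makes every row alias the same storage, so such an output cannot be
    # written element-wise and test() skips it via output.is_broadcast().
    aliased = torch.empty(4).as_strided((3, 4), (0, 1))
    assert aliased.stride() == (0, 1) and aliased.data_ptr() == aliased[1].data_ptr()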
+ + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gather.py b/test/infiniop/gather.py new file mode 100644 index 000000000..b5c8ea93d --- /dev/null +++ b/test/infiniop/gather.py @@ -0,0 +1,160 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + # input_shape, output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None) +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_gather(output, input, dim, index): + torch.gather(input, dim, index, out=output) + +def test( + handle, + device, + input_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing Gather on {InfiniDeviceNames[device]} with input shape:{input_shape}, dim:{dim}, output_shape:{output_shape}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device + ) + torch_index = torch.randint(low=0, high=input_shape[dim], size=output_shape, dtype=torch.int64) + if index_strides: + torch_index = torch_index.as_strided(output_shape, index_strides) + index = TestTensor( + output_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + ) + + torch_gather(output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGatherWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_gather(): + check_error( + LIBINFINIOP.infiniopGather( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_gather() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print("x:", input.torch_tensor()) 
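        # Gather semantics being verified (a sketch of torch.gather): along dim,
        # output[i][j] = input[i][index[i][j]] for dim=1, e.g.
        #   torch.gather(torch.tensor([[1, 2], [3, 4]]), 1, torch.tensor([[1, 0], [0, 0]]))
        #   -> tensor([[2, 1], [3, 3]])
        # so every index value must lie in [0, input.shape[dim]), as generated above.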
+ # print("CALCULATED:\n", output.actual_tensor(), ) + # print("GT\n", output.torch_tensor()) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_gather( + output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gather(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGatherDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my Gather passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + 
LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/index_copy_inplace.py b/test/infiniop/index_copy_inplace.py new file mode 100644 index 000000000..97dbd8266 --- /dev/null +++ b/test/infiniop/index_copy_inplace.py @@ -0,0 +1,180 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_TEST_CASES = [ + # input_shape, output_shape, dim, output_strides, input_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], Inplace.OUT_OF_PLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.INPLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.OUT_OF_PLACE), + ([133, 23, 53], [133, 23, 53], 1, None, None, Inplace.OUT_OF_PLACE), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, Inplace.OUT_OF_PLACE), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_index_copy_inplace(output, input, index, dim): + output.index_copy_(dim, index, input.clone()) + + +def test( + handle, + device, + input_shape, output_shape, dim, output_strides, input_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing index_copy_inplace on {InfiniDeviceNames[device]} with shape:{input_shape}," + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = 
TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + assert output_shape == input_shape + output = input + else: + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + index_list = list(range(output_shape[dim])) + + random.shuffle(index_list) + torch_index = torch.tensor(index_list[:input_shape[dim]], dtype=torch.int64) + index = TestTensor( + [input_shape[dim]], + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_index_copy_inplace(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetIndexCopyInplaceWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_index_copy_inplace(): + check_error( + LIBINFINIOP.infiniopIndexCopyInplace( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_index_copy_inplace() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_index_copy_inplace( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_index_copy_inplace(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my index_copy_inplace passed!\033[0m") diff --git a/test/infiniop/interpolate_nearest.py b/test/infiniop/interpolate_nearest.py new file mode 100644 index 000000000..335bcd7fd --- /dev/null +++ b/test/infiniop/interpolate_nearest.py @@ -0,0 +1,254 @@ +import torch +import ctypes +from ctypes import c_uint64 + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, output_shape, output_stride) +_TEST_CASES = [ + # 
2D test cases - simplified to one line each + ((1, 1, 2, 2), None, (1, 1, 4, 4), None), # Simple contiguous case + ((1, 3, 4, 4), (48, 16, 4, 1), (1, 3, 8, 8), (192, 64, 8, 1)), # 2D upscaling 2x + ((1, 3, 8, 8), (192, 64, 8, 1), (1, 3, 4, 4), (48, 16, 4, 1)), # 2D downscaling 2x + ((2, 4, 2, 2), (16, 4, 2, 1), (2, 4, 6, 6), (144, 36, 6, 1)), # Batch upscaling + ( + (1, 1, 3, 5), + (15, 15, 5, 1), + (1, 1, 9, 10), + (90, 90, 10, 1), + ), # Different aspect ratio + ( + (4, 64, 16, 16), + (16384, 256, 16, 1), + (4, 64, 32, 32), + (65536, 1024, 32, 1), + ), # Large batch + ((1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 7, 7), (49, 49, 7, 1)), # Small to large + ( + (1, 2, 3, 4), + (24, 1, 8, 2), + (1, 2, 6, 8), + (96, 1, 16, 2), + ), # Non-contiguous layout + ((2, 3, 2, 2), (32, 8, 4, 1), (2, 3, 4, 4), (128, 32, 8, 1)), # Padded strides + # 1D test cases + ((1, 3, 8), (24, 8, 1), (1, 3, 16), (48, 16, 1)), # 1D upscaling 2x + ((2, 5, 10), (50, 10, 1), (2, 5, 5), (25, 5, 1)), # 1D downscaling 2x + ((4, 2, 32), (64, 32, 1), (4, 2, 64), (128, 64, 1)), # 1D larger upscaling + # 3D test cases + ( + (1, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (1, 2, 4, 4, 4), + (128, 64, 16, 4, 1), + ), # 3D upscaling 2x + ( + (1, 1, 2, 3, 4), + (24, 24, 12, 4, 1), + (1, 1, 4, 6, 8), + (192, 192, 48, 8, 1), + ), # 3D uniform upscaling + ( + (3, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (3, 2, 3, 3, 3), + (54, 27, 9, 3, 1), + ), # 3D non-uniform scaling +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.I8] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def interpolate_nearest(input_tensor, output_shape, output_tensor): + """ + Perform nearest neighbor interpolation using PyTorch as reference + """ + # Extract spatial dimensions (H, W) + target_size = output_shape[2:] # Skip batch and channel dimensions + + # Use PyTorch's interpolate function with nearest mode + if input_tensor.dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float32,进行插值,再转换回原类型 + original_dtype = input_tensor.dtype + + # 转换为 float32 进行插值 + float_input = input_tensor.float() + result = F.interpolate(float_input, size=target_size, mode="nearest") + + # 转换回原始类型 + result = result.to(original_dtype) + else: + result = F.interpolate(input_tensor, size=target_size, mode="nearest") + + output_tensor.copy_(result) + + +def test( + handle, + device, + input_shape, + input_stride, + output_shape, + output_stride, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input and output tensors + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing InterpolateNearest on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"input_stride: {input_stride}, output_stride: {output_stride}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ) + + if sync is not None: + sync() + + # Create descriptor for our 
interpolate_nearest operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateInterpolateNearestDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetInterpolateNearestWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_interpolate_nearest(): + check_error( + LIBINFINIOP.infiniopInterpolateNearest( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_interpolate_nearest() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for shape {input_shape} -> {output_shape}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_interpolate_nearest(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyInterpolateNearestDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/layer_norm.py b/test/infiniop/layer_norm.py new file mode 100644 index 000000000..aacf07186 --- /dev/null +++ b/test/infiniop/layer_norm.py @@ -0,0 +1,265 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, bias_exist, eps, input_strides, output_strides, weight_strides + ((13, 4, 4), True, 1e-5, [30, 4, 1], [50, 4, 1], [2]), + ((16, 5, 563), True, 1e-4, None, None, None), + ((5, 16, 563), False, 1e-5, None, None, [10]), + ((4, 4, 563), True, 1e-5, None, None, None), + ((40, 40, 56), True, 1e-5, [3600, 56, 1], None, None), + ((40, 40, 56), False, 1e-5, [3600, 56, 1], None, None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# Data types used for testing 
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 5e-2, "rtol": 5e-2}, + InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 5e-2, "rtol": 5e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_layer_norm( + output:torch.Tensor, + input_standardization:torch.Tensor, + input_std_deviation:torch.Tensor, + input:torch.Tensor, + weight, bias, eps, + bias_exist: bool +): + normalized_shape = input.shape[-1:] + ln = torch.nn.LayerNorm( + normalized_shape=normalized_shape, + eps=eps, + dtype=torch.float, + bias=bias_exist, + device=input.device + ) + ln.weight.data = weight.type(torch.float) + if bias_exist: + ln.bias.data = bias.type(torch.float) + input = input.type(torch.float) + mean = input.mean(dim=-1, keepdim=True) + var = input.var(dim=-1, correction=0) + std = torch.sqrt(var + eps) + input_standardization.copy_( + ((input - mean) / std.unsqueeze(2)).type(input_standardization.dtype) + ) + input_std_deviation.copy_(std.type(input_standardization.dtype)) + output.copy_(ln(input).detach().type(output.dtype)) + + + +def test( + handle, + device, + input_shape, + bias_exist, + eps, + input_strides, + output_strides, + weight_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing layer_norm on {InfiniDeviceNames[device]} with input_shape:{input_shape}," + f"bias:{bias_exist},eps:{eps}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input_standardization = TestTensor( + input_shape, + None, + dtype, + device, + ) + + input_std_deviation = TestTensor( + input_shape[:-1], + None, + dtype, + device, + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + if output_strides != input_strides: + return + output = input + else: + output = TestTensor( + input_shape, + output_strides, + dtype, + device, + ) + + + weight = TestTensor( + input_shape[-1:], + weight_strides, + dtype, + device, + ) + + bias = TestTensor( + input_shape[-1:], + None, + dtype, + device, + ) if bias_exist else None + + torch_layer_norm( + output.torch_tensor(), + input_standardization.torch_tensor(), + input_std_deviation.torch_tensor(), + input.torch_tensor(), + weight.torch_tensor(), + bias.torch_tensor() if bias_exist else None, + eps, + bias_exist + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLayerNormDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input_standardization.descriptor, + input_std_deviation.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor if bias_exist else None, + eps, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input_standardization, input_std_deviation, input, weight] + [bias] if bias_exist else []: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLayerNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_layer_norm(): + check_error( + LIBINFINIOP.infiniopLayerNorm( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input_standardization.data(), + input_std_deviation.data(), + input.data(), + weight.data(), + bias.data() if bias_exist else None, + None, + ) + ) + + 
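+    # Sanity note: lib_layer_norm() is checked against torch_layer_norm above, i.e. y = weight * (x - mean(x, -1)) / sqrt(var(x, -1) + eps) (+ bias when bias_exist), with the standardized input and the per-row standard deviation exposed through the two extra output tensors.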
lib_layer_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + debug(input_standardization.actual_tensor(), input_standardization.torch_tensor(), atol=atol, rtol=rtol) + debug(input_std_deviation.actual_tensor(), input_std_deviation.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor(), '\n') + # print('weight:\n', weight.torch_tensor(), '\n') + # print('bias:\n', bias.torch_tensor(), '\n') + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + # print('input_standardization:\n', input_standardization.torch_tensor(), '\n', input_standardization.actual_tensor(), ) + # print('input_std_deviation:\n', input_std_deviation.torch_tensor(), '\n', input_std_deviation.actual_tensor(), ) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(input_standardization.actual_tensor(), input_standardization.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(input_std_deviation.actual_tensor(), input_std_deviation.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_layer_norm( + output.torch_tensor(), input_standardization.torch_tensor(), input_std_deviation.torch_tensor(), input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor() if bias_exist else None, eps, bias_exist + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_layer_norm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLayerNormDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my layer_norm passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 869e4aa86..ae20ccf2e 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float +from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -565,7 +565,6 @@ def dequantize_(lib): infiniopOperatorDescriptor_t, ] - @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 @@ -586,3 +585,670 @@ def softplus_(lib): ] lib.infiniopDestroySoftplusDescriptor.restype = c_int32 lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def cross_entropy_loss_(lib): + lib.infiniopCreateCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # loss_desc + infiniopTensorDescriptor_t, # logits_desc + infiniopTensorDescriptor_t, # target_desc + ] + + lib.infiniopGetCrossEntropyLossWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCrossEntropyLoss.restype = c_int32 + lib.infiniopCrossEntropyLoss.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # loss + c_void_p, # logits + c_void_p, # target + c_void_p, # stream + 
] + + lib.infiniopDestroyCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def avg_pool_(lib): + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_pool_(lib): + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, 
# output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def interpolate_nearest_(lib): + lib.infiniopCreateInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopCreateInterpolateNearestDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + ] + + lib.infiniopGetInterpolateNearestWorkspaceSize.restype = c_int32 + lib.infiniopGetInterpolateNearestWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopInterpolateNearest.restype = c_int32 + lib.infiniopInterpolateNearest.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopDestroyInterpolateNearestDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reduce_max_(lib): + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + 
infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): + lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def equal_(lib): + lib.infiniopCreateEqualDescriptor.restype = c_int32 + lib.infiniopCreateEqualDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopEqual.restype = c_int32 + lib.infiniopEqual.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyEqualDescriptor.restype = c_int32 + lib.infiniopDestroyEqualDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def batch_norm_(lib): + lib.infiniopCreateBatchNormDescriptor.restype = c_int32 + lib.infiniopCreateBatchNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopGetBatchNormWorkspaceSize.restype = c_int32 + lib.infiniopGetBatchNormWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopBatchNorm.restype = c_int32 + lib.infiniopBatchNorm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyBatchNormDescriptor.restype = c_int32 + lib.infiniopDestroyBatchNormDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def scatter_(lib): + lib.infiniopCreateScatterDescriptor.restype = c_int32 + lib.infiniopCreateScatterDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetScatterWorkspaceSize.restype = c_int32 + lib.infiniopGetScatterWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + 
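+    # The execute binding below follows the same convention as the other operators: (descriptor, workspace, workspace_size, output(s), input(s), stream). Illustrative call with the names used in test/infiniop/scatter.py: LIBINFINIOP.infiniopScatter(descriptor, workspace.data(), workspace.size(), output.data(), input.data(), index.data(), None)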
lib.infiniopScatter.restype = c_int32 + lib.infiniopScatter.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyScatterDescriptor.restype = c_int32 + lib.infiniopDestroyScatterDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def gather_(lib): + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetGatherWorkspaceSize.restype = c_int32 + lib.infiniopGetGatherWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def index_copy_inplace_(lib): + lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopCreateIndexCopyInplaceDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetIndexCopyInplaceWorkspaceSize.restype = c_int32 + lib.infiniopGetIndexCopyInplaceWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIndexCopyInplace.restype = c_int32 + lib.infiniopIndexCopyInplace.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopDestroyIndexCopyInplaceDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def layer_norm_(lib): + lib.infiniopCreateLayerNormDescriptor.restype = c_int32 + lib.infiniopCreateLayerNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + lib.infiniopGetLayerNormWorkspaceSize.restype = c_int32 + lib.infiniopGetLayerNormWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLayerNorm.restype = c_int32 + lib.infiniopLayerNorm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLayerNormDescriptor.restype = c_int32 + lib.infiniopDestroyLayerNormDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def logical_or_(lib): + lib.infiniopCreateLogicalOrDescriptor.restype = c_int32 + lib.infiniopCreateLogicalOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalOr.restype = c_int32 + lib.infiniopLogicalOr.argtypes = [ + 
infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalOrDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def logical_and_(lib): + lib.infiniopCreateLogicalAndDescriptor.restype = c_int32 + lib.infiniopCreateLogicalAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalAndWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalAnd.restype = c_int32 + lib.infiniopLogicalAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reduce_mean_(lib): + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 510e3d2fa..cdcfbc515 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,34 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + + is_bool = dt == InfiniDtype.BOOL + if is_bool: + dt = InfiniDtype.F32 + + is_int = ( + dt == InfiniDtype.I8 + or dt == InfiniDtype.I16 + or dt == InfiniDtype.I32 + or dt == InfiniDtype.I64 + ) + + torch_dtype = to_torch_dtype(dt) if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if is_int: + self._torch_tensor = torch.randint( + 0, + 100, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -90,10 +114,36 @@ def __init__( else: raise ValueError("Unsupported mode") + if is_bool: + self._torch_tensor = self._torch_tensor > 0.5 + if scale is not None: - self._torch_tensor *= scale + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 scale,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() * scale).to( + torch_dtype + ) + else: + self._torch_tensor *= scale + if bias is not None: - self._torch_tensor += bias + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + 
torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 bias,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() + bias).to(torch_dtype) + else: + self._torch_tensor += bias if strides is not None: self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) @@ -142,6 +192,8 @@ def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): return torch.float32 elif dt == InfiniDtype.F64: return torch.float64 + elif dt == InfiniDtype.BOOL: + return torch.bool # TODO: These following types may not be supported by older # versions of PyTorch. Use compatability mode to convert them. elif dt == InfiniDtype.U16: diff --git a/test/infiniop/logical_and.py b/test/infiniop/logical_and.py new file mode 100644 index 000000000..b44b83968 --- /dev/null +++ b/test/infiniop/logical_and.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_and(c, a, b): + torch.logical_and(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + 
return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalAnd on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalAndDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalAndWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_and(): + check_error( + LIBINFINIOP.infiniopLogicalAnd( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_and() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_and(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalAndDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/logical_or.py b/test/infiniop/logical_or.py new file mode 100644 index 000000000..7bf991052 --- /dev/null +++ b/test/infiniop/logical_or.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each 
test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_or(c, a, b): + torch.logical_or(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalOr on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalOrDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalOrWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_or(): + check_error( + LIBINFINIOP.infiniopLogicalOr( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_or() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_or(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalOrDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, 
_TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/maxpool.py b/test/infiniop/maxpool.py new file mode 100644 index 000000000..81ddce060 --- /dev/null +++ b/test/infiniop/maxpool.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D max pooling cases + ((1, 3, 8), None, (2,), (2,), (0,), False), + ((2, 4, 16), None, (3,), (2,), (1,), False), + ((3, 2, 77), None, (6,), (4,), (3,), True), + # 2D max pooling cases + ((1, 1, 4, 4), None, (2, 2), (2, 2), (0, 0), False), + ((2, 3, 8, 8), None, (3, 3), (2, 2), (1, 1), False), + ((1, 64, 32, 32), None, (2, 2), (2, 2), (0, 0), False), + ((4, 128, 16, 16), None, (3, 3), (1, 1), (1, 1), False), + # 3D max pooling cases + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + # Cases with ceil_mode=True + ((1, 1, 7, 7), None, (3, 3), (2, 2), (1, 1), True), + ((1, 2, 5), None, (3,), (2,), (0,), True), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool(input_tensor, kernel_size, stride, padding, ceil_mode): + """ + Perform max pooling using PyTorch as reference + """ + ndim = len(input_tensor.shape) - 2 # Spatial dimensions + + if ndim == 1: + result = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + result = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + result = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + return result + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Compute reference result using PyTorch + torch_ref_output = max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ) + + # Use PyTorch输出shape来初始化output_tensor + output_tensor = TestTensor( + torch_ref_output.shape, None, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPool on {InfiniDeviceNames[device]} with " 
+ f"input_shape: {input_shape}, kernel_size: {kernel_size}, " + f"stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + if sync is not None: + sync() + + # Create descriptor for our max pool operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_max_pool(): + check_error( + LIBINFINIOP.infiniopMaxPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_max_pool() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_max_pool(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py new file mode 100644 index 000000000..3b738ec1e --- /dev/null +++ b/test/infiniop/reduce_max.py @@ -0,0 +1,154 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not maxt to be imported from other modules +_TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + ((), (), None, None, 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 
1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + +# x types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +_TEST_CASES = _TEST_CASES_ + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reduce_max(x, dim): + return x.max(dim=dim, keepdim=True)[0] + + +def test( + handle, + device, + y_shape, + x_shape, + y_stride, + x_stride, + dim, + dtype=InfiniDtype.F16, + sync=None, +): + print( + f"Testing Reduce_Max on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}" + f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + ans = reduce_max(x.torch_tensor(), dim) + + y = TestTensor(y_shape, y_stride, dtype, device) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReduceMaxDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ctypes.c_size_t(dim), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x.destroy_desc() + y.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReduceMaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_reduce_max(): + check_error( + LIBINFINIOP.infiniopReduceMax( + descriptor, + workspace.data(), + workspace_size.value, + y.data(), + x.data(), + None, + ) + ) + + lib_reduce_max() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reduce_max(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reduce_max(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReduceMaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py new file mode 100644 index 000000000..dfa5ee7a8 --- /dev/null +++ b/test/infiniop/reduce_mean.py @@ -0,0 +1,154 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== 
+# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + ((), (), None, None, 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + +# x types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +_TEST_CASES = _TEST_CASES_ + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reduce_mean(x, dim): + return x.mean(dim=dim, keepdim=True) + + +def test( + handle, + device, + y_shape, + x_shape, + y_stride, + x_stride, + dim, + dtype=InfiniDtype.F16, + sync=None, +): + print( + f"Testing Reduce_Mean on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}" + f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + ans = reduce_mean(x.torch_tensor(), dim) + + y = TestTensor(y_shape, y_stride, dtype, device) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReduceMeanDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ctypes.c_size_t(dim), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x.destroy_desc() + y.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReduceMeanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_reduce_mean(): + check_error( + LIBINFINIOP.infiniopReduceMean( + descriptor, + workspace.data(), + workspace_size.value, + y.data(), + x.data(), + None, + ) + ) + + lib_reduce_mean() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reduce_mean(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reduce_mean(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReduceMeanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/scatter.py b/test/infiniop/scatter.py new file mode 100644 index 000000000..86ccdcdeb --- /dev/null +++ b/test/infiniop/scatter.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + 
LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + +_TEST_CASES = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (7, 1), (1, 7), None), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (1, 2, 6), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, None, None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, [16, 8, 4, 1], None), +] + + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_scatter(output: torch.Tensor, input, index, dim): + output.scatter_(dim, index, src=input) + + +def test( + handle, + device, + input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing scatter on {InfiniDeviceNames[device]} with input_shape:{input_shape}, index_shape:{index_shape}, output_shape:{output_shape}, dim:{dim}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + + def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + index = torch.empty(index_shape, dtype=torch.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input.shape[0]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 2: + for i in range(input.shape[0]): + for j in range(input.shape[1]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, j, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 1: + for i in range(input.shape[0]): + for j in range(input.shape[2]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :, j] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 4 and scatter_dim == 0: + for i in range(input.shape[1]): + for j in range(input.shape[2]): + for k in range(input.shape[3]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[:, i, j, k] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + return index + + torch_index = get_test_index_tensor(input_shape, index_shape, output_shape, dim).type(torch.int64) + if index_strides: + torch_index = torch_index.as_strided(index_shape, index_strides) + index = TestTensor( + index_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_scatter(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateScatterDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being 
directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetScatterWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_scatter(): + check_error( + LIBINFINIOP.infiniopScatter( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_scatter() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_scatter( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_scatter(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyScatterDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my scatter passed!\033[0m") diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..613257e9c --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(output, input): + output.copy_(torch.sin(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + 
dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..dc6ec46e8 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + 
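+# Each base (shape, stride) case above is paired with every Inplace option in the comprehension below, so the same layout is exercised both out of place and writing the result back into the input tensor.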
+_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(output, input): + output.copy_(torch.tanh(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + tanh(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_tanh(): + check_error( + LIBINFINIOP.infiniopTanh( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..c940d4f05 --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,288 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map, +) +from enum import Enum, auto + +# ====================================================================== +# Configuration 
(Internal Use Only) +# Now each test case tuple is: (shape, a_stride, b_stride, cond_stride, c_stride) +# ====================================================================== +_TEST_CASES_ = [ + ((13, 4), None, None, None, None), + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_COND = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.INPLACE_COND, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TENSOR_DTYPES = _INTEGER_DTYPES + _FLOAT_DTYPES + +_TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + +def where_ref(c, a, b, cond): + cond_bool = cond.torch_tensor().to(torch.bool) + c.torch_tensor().copy_(torch.where(cond_bool, a.torch_tensor(), b.torch_tensor())) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + cond_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + inf_dt = dtype + + if not is_supported_dt(inf_dt): + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on this platform") + return + + try: + if _is_integer_dtype(inf_dt): + a_torch = make_integer_torch_tensor(shape, inf_dt, device) + b_torch = 
make_integer_torch_tensor(shape, inf_dt, device) + a = TestTensor.from_torch(a_torch, inf_dt, device) + b = TestTensor.from_torch(b_torch, inf_dt, device) + else: + a = TestTensor(shape, a_stride, inf_dt, device, mode="random") + b = TestTensor(shape, b_stride, inf_dt, device, mode="random") + except RuntimeError as e: + msg = str(e) + if "not implemented for 'UInt32'" in msg or "not implemented for 'UInt64'" in msg or "check_uniform_bounds" in msg: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} because platform torch can't build random tensor: {e}") + return + else: + raise + + dev = torch_device_map[device] + if _is_integer_dtype(inf_dt): + cond_torch = torch.randint(0, 2, size=shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + else: + cond_bool = (torch.rand(shape, device=dev) > 0.5) + cond_torch = cond_bool.to(dtype=to_torch_dtype(inf_dt, compatability_mode=False)) + + cond = TestTensor.from_torch(cond_torch, inf_dt, device) + + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + elif inplace == Inplace.INPLACE_COND: + if c_stride != cond_stride: + return + c = cond + else: + if _is_integer_dtype(inf_dt): + dev = torch_device_map[device] + c_torch = torch.zeros(shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + c = TestTensor.from_torch(c_torch, inf_dt, device) + else: + c = TestTensor(shape, c_stride, inf_dt, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} " + f"shape:{shape} a_stride:{a_stride} b_stride:{b_stride} cond_stride:{cond_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[inf_dt]} inplace:{inplace}" + ) + + where_ref(c, a, b, cond) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + try: + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + cond.descriptor, + ) + ) + except Exception as e: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on {InfiniDeviceNames[device]}: CreateWhereDescriptor failed: {e}") + return + + for tensor in [a, b, c, cond]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + cond.data(), + None, + ) + ) + + lib_where() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, inf_dt) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: where_ref(c, a, b, cond), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + supported = [dt for dt in _TENSOR_DTYPES if is_supported_dt(dt)] + devices = get_test_devices(args) + + for device in devices: + test_operator(device, 
test, _TEST_CASES, supported) + + print("\033[92mTest passed!\033[0m") + + +if __name__ == "__main__": + main() diff --git a/xmake.lua b/xmake.lua index 67add0d45..fbb5156c3 100644 --- a/xmake.lua +++ b/xmake.lua @@ -110,6 +110,16 @@ if has_config("metax-gpu") then includes("xmake/metax.lua") end +option("hcdnn") + set_default(true) + set_showmenu(true) + set_description("Whether to compile hcdnn for Metax GPU") +option_end() + +if has_config("hcdnn") then + add_defines("ENABLE_HCDNN_API") +end + -- 摩尔线程 option("moore-gpu") set_default(false)
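
Usage sketch for the hcdnn switch above (a minimal sketch, assuming the standard xmake configure flow; only the option name comes from the xmake.lua hunk): the option defaults to on, so ENABLE_HCDNN_API is defined unless it is turned off at configure time, e.g.

    xmake f --hcdnn=n    # configure without the hcdnn backend for Metax GPU
    xmake                # then build as usual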