diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index a7249ec9d..83ea4e894 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -1,9 +1,11 @@
 #pragma once
 
 #include "ops/add.hpp"
+#include "ops/asin.hpp"
 #include "ops/add_rms_norm.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
+#include "ops/fmin.hpp"
 #include "ops/matmul.hpp"
 #include "ops/ones.hpp"
 #include "ops/paged_attention.hpp"
@@ -14,4 +16,4 @@
 #include "ops/rms_norm.hpp"
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
-#include "ops/swiglu.hpp"
+#include "ops/swiglu.hpp"
\ No newline at end of file
diff --git a/include/infinicore/ops/adaptive_avg_pool3d.hpp b/include/infinicore/ops/adaptive_avg_pool3d.hpp
new file mode 100644
index 000000000..ab1dc9253
--- /dev/null
+++ b/include/infinicore/ops/adaptive_avg_pool3d.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class AdaptiveAvgPool3D {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor adaptive_avg_pool3d(Tensor x, std::vector<size_t> output_size);
+void adaptive_avg_pool3d_(Tensor y, Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/addr.hpp b/include/infinicore/ops/addr.hpp
new file mode 100644
index 000000000..92d97fae8
--- /dev/null
+++ b/include/infinicore/ops/addr.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Addr {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float, float);
+    static void execute(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor addr(Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f);
+void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/argwhere.hpp b/include/infinicore/ops/argwhere.hpp
new file mode 100644
index 000000000..f0c629adb
--- /dev/null
+++ b/include/infinicore/ops/argwhere.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Argwhere {
+public:
+    using schema = void (*)(void **, size_t *, Tensor);
+    static void execute(void **, size_t *count, Tensor x);
+    static common::OpDispatcher &dispatcher();
+};
+Tensor argwhere(Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/asin.hpp b/include/infinicore/ops/asin.hpp
new file mode 100644
index 000000000..e8667b037
--- /dev/null
+++ b/include/infinicore/ops/asin.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Asin {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor asin(Tensor input);
+void asin_(Tensor output, Tensor input);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/fmin.hpp b/include/infinicore/ops/fmin.hpp
new file mode 100644
index 000000000..4ea00787f
--- /dev/null
+++ b/include/infinicore/ops/fmin.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Fmin {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+    static void execute(Tensor c, Tensor a, Tensor b);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor fmin(Tensor a, Tensor b);
+void fmin_(Tensor c, Tensor a, Tensor b);
+
+} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..2cd81e9ea 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -2,13 +2,17 @@
 #define __INFINIOP_API_H__
 
 #include "infiniop/handle.h"
+#include "infiniop/ops/adaptive_avg_pool3d.h"
 #include "infiniop/ops/add.h"
+#include "infiniop/ops/addr.h"
 #include "infiniop/ops/add_rms_norm.h"
+#include "infiniop/ops/asin.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
 #include "infiniop/ops/dequantize_awq.h"
+#include "infiniop/ops/fmin.h"
 #include "infiniop/ops/gelu.h"
 #include "infiniop/ops/gemm.h"
 #include "infiniop/ops/layer_norm.h"
diff --git a/include/infiniop/ops/adaptive_avg_pool3d.h b/include/infiniop/ops/adaptive_avg_pool3d.h
new file mode 100644
index 000000000..178a509e1
--- /dev/null
+++ b/include/infiniop/ops/adaptive_avg_pool3d.h
@@ -0,0 +1,30 @@
+#ifndef INFINIOP_ADAPTIVE_AVG_POOL3D_H_
+#define INFINIOP_ADAPTIVE_AVG_POOL3D_H_
+
+#include "../operator_descriptor.h"
+#include <stddef.h>
+
+typedef struct InfiniopDescriptor *infiniopAdaptiveAvgPool3DDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor(
+    infiniopHandle_t handle,
+    infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    size_t *output_size);
+
+__C __export infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize(
+    infiniopAdaptiveAvgPool3DDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopAdaptiveAvgPool3D(
+    infiniopAdaptiveAvgPool3DDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/addr.h b/include/infiniop/ops/addr.h
new file mode 100644
index 000000000..6cae92d97
--- /dev/null
+++ b/include/infiniop/ops/addr.h
@@ -0,0 +1,30 @@
+#ifndef __INFINIOP_ADDR_API_H__
+#define __INFINIOP_ADDR_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAddrDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAddrDescriptor(infiniopHandle_t handle,
+                                                         infiniopAddrDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t out,
+                                                         infiniopTensorDescriptor_t input,
+                                                         infiniopTensorDescriptor_t vec1,
+                                                         infiniopTensorDescriptor_t vec2,
+                                                         float beta,
+                                                         float alpha);
+
+__C __export infiniStatus_t infiniopGetAddrWorkspaceSize(infiniopAddrDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAddr(infiniopAddrDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *out,
+                                         const void *input,
+                                         const void *vec1,
+                                         const void *vec2,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/argwhere.h b/include/infiniop/ops/argwhere.h
new file mode 100644
index 000000000..653cbba94
--- /dev/null
+++ b/include/infiniop/ops/argwhere.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_ARGWHERE_API_H__
+#define __INFINIOP_ARGWHERE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopArgwhereDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateArgwhereDescriptor(
+    infiniopHandle_t handle,
+    infiniopArgwhereDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t input_desc);
+
+__C __export infiniStatus_t infiniopGetArgwhereWorkspaceSize(
+    infiniopArgwhereDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopArgwhere(
+    infiniopArgwhereDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void **output,
+    size_t *count,
+    const void *input,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyArgwhereDescriptor(
+    infiniopArgwhereDescriptor_t desc);
+
+#endif // __INFINIOP_ARGWHERE_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h
new file mode 100644
index 000000000..ee40514d0
--- /dev/null
+++ b/include/infiniop/ops/asin.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ASIN_API_H__
+#define __INFINIOP_ASIN_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle,
+                                                         infiniopAsinDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t output,
+                                                         infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/fmin.h b/include/infiniop/ops/fmin.h
new file mode 100644
index 000000000..274e06a46
--- /dev/null
+++ b/include/infiniop/ops/fmin.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_FMIN_H__
+#define __INFINIOP_FMIN_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopFminDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateFminDescriptor(infiniopHandle_t handle,
+                                                         infiniopFminDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t c,
+                                                         infiniopTensorDescriptor_t a,
+                                                         infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopFmin(infiniopFminDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *c,
+                                         const void *a,
+                                         const void *b,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc);
+
+#endif
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index b7288f3ac..a4ce58c9d 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -40,6 +40,8 @@
     uint8,
 )
 from infinicore.ops.add import add
+from infinicore.ops.addr import addr
+from infinicore.ops.asin import asin
 from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
 from infinicore.ops.attention import attention
 from infinicore.ops.matmul import matmul
@@ -49,8 +51,11 @@
 from infinicore.ops.paged_attention_prefill import paged_attention_prefill
 from infinicore.ops.paged_caching import paged_caching
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.argwhere import argwhere
+from infinicore.ops.fmin import fmin
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -106,8 +111,11 @@
     "uint8",
     # Operations.
     "add",
+    "addr",
     "add_rms_norm",
     "add_rms_norm_",
+    "argwhere",
+    "asin",
     "attention",
     "matmul",
     "mul",
@@ -121,6 +129,7 @@
     "from_list",
     "from_numpy",
     "from_torch",
+    "fmin",
     "paged_caching",
     "paged_attention",
     "paged_attention_prefill",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..61dbab685 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,8 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .adaptive_avg_pool3d import adaptive_avg_pool3d
+
 
 __all__ = [
     "causal_softmax",
@@ -17,4 +19,5 @@
     "embedding",
     "rope",
     "RopeAlgo",
+    "adaptive_avg_pool3d",
 ]
diff --git a/python/infinicore/nn/functional/adaptive_avg_pool3d.py b/python/infinicore/nn/functional/adaptive_avg_pool3d.py
new file mode 100644
index 000000000..9f8a3a4a6
--- /dev/null
+++ b/python/infinicore/nn/functional/adaptive_avg_pool3d.py
@@ -0,0 +1,16 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+from typing import List
+
+
+def adaptive_avg_pool3d(x: Tensor, output_size: List[int] = [1, 1, 1]) -> Tensor:
+    r"""Applies a 3D adaptive average pooling over an input signal composed of several input planes.
+
+    Args:
+        x (Tensor): The input tensor of shape (N, C, D, H, W)
+        output_size (List[int]): The target output size of the form (d, h, w)
+
+    Returns:
+        Tensor: The pooled output tensor
+    """
+    return Tensor(_infinicore.adaptive_avg_pool3d(x._underlying, output_size))
diff --git a/python/infinicore/ops/addr.py b/python/infinicore/ops/addr.py
new file mode 100644
index 000000000..f9a607961
--- /dev/null
+++ b/python/infinicore/ops/addr.py
@@ -0,0 +1,28 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def addr(
+    input: Tensor,
+    vec1: Tensor,
+    vec2: Tensor,
+    beta: float = 1.0,
+    alpha: float = 1.0,
+    out=None,
+) -> Tensor:
+    if out is None:
+        return Tensor(
+            _infinicore.addr(
+                input._underlying, vec1._underlying, vec2._underlying, beta, alpha
+            )
+        )
+
+    _infinicore.addr_(
+        out._underlying,
+        input._underlying,
+        vec1._underlying,
+        vec2._underlying,
+        beta,
+        alpha,
+    )
+    return out
diff --git a/python/infinicore/ops/argwhere.py b/python/infinicore/ops/argwhere.py
new file mode 100644
index 000000000..1e558c898
--- /dev/null
+++ b/python/infinicore/ops/argwhere.py
@@ -0,0 +1,6 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def argwhere(x: Tensor) -> Tensor:
+    return Tensor(_infinicore.argwhere(x._underlying))
diff --git a/python/infinicore/ops/asin.py b/python/infinicore/ops/asin.py
new file mode 100644
index 000000000..fbb230737
--- /dev/null
+++ b/python/infinicore/ops/asin.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def asin(input: Tensor, *, out=None):
+    """Arcsin activation function."""
+    if out is None:
+        return Tensor(_infinicore.asin(input._underlying))
+
+    _infinicore.asin_(out._underlying, input._underlying)
+    return out
diff --git a/python/infinicore/ops/fmin.py b/python/infinicore/ops/fmin.py
new file mode 100644
index 000000000..56e274a10
--- /dev/null
+++ b/python/infinicore/ops/fmin.py
@@ -0,0 +1,10 @@
+from infinicore.lib import _infinicore
+from
infinicore.tensor import Tensor + + +def fmin(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.fmin(input._underlying, other._underlying)) + + _infinicore.fmin_(out._underlying, input._underlying, other._underlying) + return out diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc new file mode 100644 index 000000000..61d2cecfa --- /dev/null +++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc @@ -0,0 +1,36 @@ +#include "infinicore/ops/adaptive_avg_pool3d.hpp" +#include +#include +namespace infinicore::op { + +common::OpDispatcher &AdaptiveAvgPool3D::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void AdaptiveAvgPool3D::execute(Tensor y, Tensor x) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error("No AdaptiveAvgPool3D implementation found for device type: " + std::to_string(static_cast(device_type))); + } + func(y, x); +} + +Tensor adaptive_avg_pool3d(Tensor x, std::vector output_size) { + + // Create output tensor shap + Shape y_shape = x->shape(); + y_shape[2] = output_size[0]; // D dimension + y_shape[3] = output_size[1]; // H dimension + y_shape[4] = output_size[2]; // W dimension + + auto y = Tensor::empty(y_shape, x->dtype(), x->device()); + adaptive_avg_pool3d_(y, x); + return y; +} + +void adaptive_avg_pool3d_(Tensor y, Tensor x) { + AdaptiveAvgPool3D::execute(y, x); +} +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc new file mode 100644 index 000000000..1f4ef3211 --- /dev/null +++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc @@ -0,0 +1,62 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/adaptive_avg_pool3d.hpp" +#include "infinicore/ops/common/cache.hpp" + +namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAdaptiveAvgPool3DDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAdaptiveAvgPool3DDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAdaptiveAvgPool3DDescriptor_t desc = nullptr; + + if (!desc_opt) { + // Convert vector to array for output_size + std::vector output_size_vec = {output->size(2), output->size(3), output->size(4)}; + + INFINICORE_CHECK_ERROR(infiniopCreateAdaptiveAvgPool3DDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + output_size_vec.data())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + // Get workspace size and allocate if needed + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAdaptiveAvgPool3DWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR( + infiniopAdaptiveAvgPool3D( + desc, + workspace->data(), workspace_size, + output->data(), + input->data(), + 
context::getStream())); +} + +static bool registered = []() { + AdaptiveAvgPool3D::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/addr/addr.cc b/src/infinicore/ops/addr/addr.cc new file mode 100644 index 000000000..af7493b6f --- /dev/null +++ b/src/infinicore/ops/addr/addr.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/addr.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Addr::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Addr::execute(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, vec1, vec2); + infinicore::context::setDevice(out->device()); + dispatcher().lookup(out->device().getType())(out, input, vec1, vec2, beta, alpha); +} + +Tensor addr(Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + + size_t n = vec1->shape()[0]; + size_t m = vec2->shape()[0]; + + // Create output tensor + Tensor out = Tensor::empty({n, m}, input->dtype(), input->device()); + addr_(out, input, vec1, vec2, beta, alpha); + return out; +} + +void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + Addr::execute(out, input, vec1, vec2, beta, alpha); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/addr/addr_infiniop.cc b/src/infinicore/ops/addr/addr_infiniop.cc new file mode 100644 index 000000000..a215a94bf --- /dev/null +++ b/src/infinicore/ops/addr/addr_infiniop.cc @@ -0,0 +1,53 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/addr.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::addr_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAddrDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAddrDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + // Hash the inputs including beta and alpha + size_t seed = hash_combine(out, input, vec1, vec2, beta, alpha); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAddrDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAddrDescriptor( + context::getInfiniopHandle(out->device()), &desc, + out->desc(), input->desc(), vec1->desc(), vec2->desc(), beta, alpha)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAddrWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAddr( + desc, workspace->data(), workspace_size, + out->data(), input->data(), vec1->data(), vec2->data(), context::getStream())); +} + +static bool registered = []() { + Addr::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::addr_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/argwhere/argwhere.cc b/src/infinicore/ops/argwhere/argwhere.cc new file mode 100644 index 000000000..a9d45de48 --- /dev/null +++ 
b/src/infinicore/ops/argwhere/argwhere.cc @@ -0,0 +1,33 @@ +#include "infinicore/ops/argwhere.hpp" +#include "../../utils.hpp" +#include "infinicore.h" +#include "infinicore/dtype.hpp" +#include "infinicore/tensor.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Argwhere::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Argwhere::execute(void **y, size_t *count, Tensor x) { + auto device_type = context::getDevice().getType(); + + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error("Argwhere op not implemented for device type " + std::to_string(static_cast(device_type))); + } + func(y, count, x); +} +Tensor argwhere(Tensor x) { + void *y = nullptr; + size_t count = 0; + Argwhere::execute(&y, &count, x); + auto result = Tensor::from_blob(y, Shape{count, x->ndim()}, DataType::I64, Device::cpu()); + result = result->to(x->device()); + return result; +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/argwhere/argwhere_infiniop.cc b/src/infinicore/ops/argwhere/argwhere_infiniop.cc new file mode 100644 index 000000000..bc6c919db --- /dev/null +++ b/src/infinicore/ops/argwhere/argwhere_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/argwhere.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infiniop/ops/argwhere.h" + +namespace infinicore::op::argwhere_impl::infiniop { +thread_local common::OpCache caches( + 100, // capacity + [](infiniopArgwhereDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyArgwhereDescriptor(desc)); + desc = nullptr; + } + }); +void calculate(void **y, size_t *count, Tensor x) { + size_t seed = hash_combine(x); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopArgwhereDescriptor_t desc = nullptr; + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateArgwhereDescriptor( + context::getInfiniopHandle(x->device()), + &desc, + x->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetArgwhereWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopArgwhere( + desc, + workspace->data(), workspace_size, + y, + count, + x->data(), + context::getStream())); +} +static bool registered = []() { + Argwhere::dispatcher().registerAll(&calculate, false); + return true; +}(); +} // namespace infinicore::op::argwhere_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/asin/asin.cc b/src/infinicore/ops/asin/asin.cc new file mode 100644 index 000000000..f3a8efb7c --- /dev/null +++ b/src/infinicore/ops/asin/asin.cc @@ -0,0 +1,36 @@ +#include "infinicore/ops/asin.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Asin::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; +void Asin::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Asin 
implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor asin(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + asin_(output, input); + return output; +} + +void asin_(Tensor output, Tensor input) { + Asin::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/asin/asin_infiniop.cc b/src/infinicore/ops/asin/asin_infiniop.cc new file mode 100644 index 000000000..4dec4c220 --- /dev/null +++ b/src/infinicore/ops/asin/asin_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/asin.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::asin_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAsinDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAsinDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAsinDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAsinDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAsinWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAsin( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Asin::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::asin_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/fmin/fmin.cc b/src/infinicore/ops/fmin/fmin.cc new file mode 100644 index 000000000..54d056c63 --- /dev/null +++ b/src/infinicore/ops/fmin/fmin.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/fmin.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Fmin::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Fmin::execute(Tensor c, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor fmin(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); + fmin_(c, a, b); + return c; +} + +void fmin_(Tensor c, Tensor a, Tensor b) { + Fmin::execute(c, a, b); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/fmin/fmin_infiniop.cc b/src/infinicore/ops/fmin/fmin_infiniop.cc new file mode 100644 index 000000000..c9560479a --- /dev/null +++ b/src/infinicore/ops/fmin/fmin_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/fmin.hpp" +#include + +namespace infinicore::op::fmin_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopFminDescriptor_t &desc) { + if 
(desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyFminDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, b, a); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFminDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateFminDescriptor( + context::getInfiniopHandle(c->device()), &desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetFminWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopFmin( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + Fmin::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::fmin_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 431c3a37b..2563efb00 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -2,11 +2,16 @@ #include +#include "ops/adaptive_avg_pool3d.hpp" #include "ops/add.hpp" +#include "ops/addr.hpp" +#include "ops/argwhere.hpp" +#include "ops/asin.hpp" #include "ops/add_rms_norm.hpp" #include "ops/attention.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" +#include "ops/fmin.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" @@ -25,9 +30,14 @@ namespace infinicore::ops { inline void bind(py::module &m) { bind_add(m); + bind_adaptive_avg_pool3d(m); + bind_argwhere(m); + bind_addr(m); + bind_asin(m); bind_add_rms_norm(m); bind_attention(m); bind_causal_softmax(m); + bind_fmin(m); bind_random_sample(m); bind_linear(m); bind_matmul(m); @@ -42,4 +52,4 @@ inline void bind(py::module &m) { bind_embedding(m); } -} // namespace infinicore::ops +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp new file mode 100644 index 000000000..3b2b8fdc7 --- /dev/null +++ b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "infinicore/ops/adaptive_avg_pool3d.hpp" + +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_adaptive_avg_pool3d(py::module &m) { + m.def("adaptive_avg_pool3d", + &op::adaptive_avg_pool3d, + py::arg("x"), + py::arg("output_size"), + R"doc( Adaptive Average Pooling 3D.)doc"); + + m.def("adaptive_avg_pool3d_", + &op::adaptive_avg_pool3d_, + py::arg("y"), + py::arg("x"), + R"doc(In-place, Adaptive Average Pooling 3D.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/addr.hpp b/src/infinicore/pybind11/ops/addr.hpp new file mode 100644 index 000000000..2777fc513 --- /dev/null +++ b/src/infinicore/pybind11/ops/addr.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "infinicore/ops/addr.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_addr(py::module_ &m) { + m.def( + "addr", + &op::addr, + py::arg("input"), + py::arg("vec1"), + py::arg("vec2"), + py::arg("alpha"), + py::arg("beta"), + R"doc(Addr.)doc"); + 
m.def( + "addr_", + &op::addr_, + py::arg("out"), + py::arg("input"), + py::arg("vec1"), + py::arg("vec2"), + py::arg("beta"), + py::arg("alpha"), + R"doc(Addr.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/argwhere.hpp b/src/infinicore/pybind11/ops/argwhere.hpp new file mode 100644 index 000000000..0e6e4a4af --- /dev/null +++ b/src/infinicore/pybind11/ops/argwhere.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "infinicore/ops/argwhere.hpp" +#include +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_argwhere(py::module &m) { + m.def("argwhere", + &op::argwhere, + py::arg("x"), + R"doc(Argwhere.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/asin.hpp b/src/infinicore/pybind11/ops/asin.hpp new file mode 100644 index 000000000..ebd14adbc --- /dev/null +++ b/src/infinicore/pybind11/ops/asin.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/asin.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_asin(py::module &m) { + m.def("asin", + &op::asin, + py::arg("input"), + R"doc(Arcsin activation function.)doc"); + + m.def("asin_", + &op::asin_, + py::arg("output"), + py::arg("input"), + R"doc(In-place arcsin activation function.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/fmin.hpp b/src/infinicore/pybind11/ops/fmin.hpp new file mode 100644 index 000000000..afdbe4b33 --- /dev/null +++ b/src/infinicore/pybind11/ops/fmin.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/fmin.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_fmin(py::module &m) { + m.def("fmin", + &op::fmin, + py::arg("a"), + py::arg("b"), + R"doc(fmin of two tensors.)doc"); + + m.def("fmin_", + &op::fmin_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place tensor fmin.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h new file mode 100644 index 000000000..feea6e701 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h @@ -0,0 +1,90 @@ +#ifndef INFINIOP_ADAPTIVE_AVG_POOL3D_DESCRIPTOR_H_ +#define INFINIOP_ADAPTIVE_AVG_POOL3D_DESCRIPTOR_H_ +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/adaptive_avg_pool3d.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::adaptive_avg_pool3d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AdaptiveAvgPool3DInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AdaptiveAvgPool3DInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t *output_size); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class AdaptiveAvgPool3DInfo { 
+private: + AdaptiveAvgPool3DInfo() = default; + +public: + infiniDtype_t dtype; + size_t x_d, x_h, x_w; + size_t y_d, y_h, y_w; + size_t N, C; + std::vector x_strides; + std::vector y_strides; + + static utils::Result + create( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + CHECK_OR_RETURN(x_desc != nullptr && output_size != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = x_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + const size_t ndim = x_desc->ndim(); + CHECK_OR_RETURN(ndim == 5, INFINI_STATUS_BAD_TENSOR_SHAPE); + + return utils::Result(AdaptiveAvgPool3DInfo{ + data_type, + x_desc->dim(2), + x_desc->dim(3), + x_desc->dim(4), + output_size[0], + output_size[1], + output_size[2], + x_desc->dim(0), + x_desc->dim(1), + x_desc->strides(), + y_desc->strides()}); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc new file mode 100644 index 000000000..45b53a973 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc @@ -0,0 +1,113 @@ +#include "adaptive_avg_pool3d_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +namespace op::adaptive_avg_pool3d::cpu { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + + auto handle = reinterpret_cast(handle_); + + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAdaptiveAvgPool3D( + const AdaptiveAvgPool3DInfo &info, + Tdata *y, + const Tdata *x) { + std::array y_strides; + y_strides[size_t(4)] = 1; + y_strides[size_t(3)] = info.y_w * y_strides[size_t(4)]; + y_strides[size_t(2)] = info.y_h * y_strides[size_t(3)]; + y_strides[size_t(1)] = info.y_d * y_strides[size_t(2)]; + y_strides[size_t(0)] = info.C * y_strides[size_t(1)]; + { +#pragma omp for collapse(5) + for (ptrdiff_t n = 0; n < ptrdiff_t(info.N); n++) { + for (ptrdiff_t c = 0; c < ptrdiff_t(info.C); c++) { + for (ptrdiff_t od = 0; od < ptrdiff_t(info.y_d); od++) { + for (ptrdiff_t oh = 0; oh < ptrdiff_t(info.y_h); oh++) { + for (ptrdiff_t ow = 0; ow < ptrdiff_t(info.y_w); ow++) { + size_t x_start_d = od * info.x_d / info.y_d; + size_t x_end_d = ((od + 1) * info.x_d + info.y_d - 1) / info.y_d; + size_t x_start_h = oh * info.x_h / info.y_h; + size_t x_end_h = ((oh + 1) * info.x_h + info.y_h - 1) / info.y_h; + size_t x_start_w = ow * info.x_w / info.y_w; + size_t x_end_w = ((ow + 1) * info.x_w + info.y_w - 1) / info.y_w; + size_t count = (x_end_d - x_start_d) * (x_end_h - x_start_h) * (x_end_w - x_start_w); + + // Handle floating point types with casting + if constexpr (std::is_same::value || std::is_same::value) { + float sum = 0.0f; + for (size_t id = x_start_d; id < x_end_d; id++) { + for (size_t ih = x_start_h; ih < x_end_h; ih++) { + for (size_t iw = x_start_w; iw < x_end_w; iw++) { + size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; + sum += utils::cast(x[x_offset]); + } + } + } 
+ size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + y[y_offset] = utils::cast(sum / static_cast(count)); + } else { + Tdata sum = (Tdata)0; + for (size_t id = x_start_d; id < x_end_d; id++) { + for (size_t ih = x_start_h; ih < x_end_h; ih++) { + for (size_t iw = x_start_w; iw < x_end_w; iw++) { + size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; + sum += x[x_offset]; + } + } + } + size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + // For integer types, we might want to handle division differently + y[y_offset] = sum / static_cast(count); + } + } + } + } + } + } + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + case INFINI_DTYPE_F32: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + case INFINI_DTYPE_F64: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + case INFINI_DTYPE_BF16: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::adaptive_avg_pool3d::cpu \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h new file mode 100644 index 000000000..94753979a --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_ADAPTIVE_AVG_POOL3D_CPU_H__ +#define __INFINIOP_ADAPTIVE_AVG_POOL3D_CPU_H__ + +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh new file mode 100644 index 000000000..a348c5e30 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh @@ -0,0 +1,44 @@ +#ifndef __ADAPTIVE_AVG_POOL3D_CUDA_H__ +#define __ADAPTIVE_AVG_POOL3D_CUDA_H__ + +#include + +template +__device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C, + size_t x_d, size_t x_h, size_t x_w, + size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides) { + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = N * C * y_d * y_h * y_w; + size_t n, c, od, oh, ow; + n = index / (C * y_d * y_h * y_w); + c = (index / (y_d * y_h * y_w)) % C; + od = (index / (y_h * y_w)) % y_d; + oh = (index / y_w) % y_h; + ow = index % y_w; + + if (index < total) { + size_t x_d_start = (od * x_d) / y_d; + size_t x_d_end = ((od + 1) * x_d + y_d - 1) / y_d; + size_t x_h_start = (oh * x_h) / y_h; + size_t x_h_end = ((oh + 1) * x_h + y_h - 1) / y_h; + size_t x_w_start = (ow * x_w) / y_w; + size_t x_w_end = ((ow + 1) * x_w + y_w - 1) / y_w; + + T sum = static_cast(0); + size_t count = (x_d_end - x_d_start) * (x_h_end - x_h_start) * (x_w_end - x_w_start); + for (size_t id = x_d_start; id < x_d_end; ++id) { + for (size_t ih = x_h_start; ih < x_h_end; ++ih) { + for (size_t iw = x_w_start; iw < x_w_end; ++iw) { + size_t x_index 
= n * x_strides[0] + c * x_strides[1] + id * x_strides[2] + ih * x_strides[3] + iw * x_strides[4]; + sum += x[x_index]; + } + } + } + size_t y_index = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + y[y_index] = sum / static_cast(static_cast(count)); + } +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h new file mode 100644 index 000000000..9421aef35 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h @@ -0,0 +1,8 @@ +#ifndef INFINIOP_OPS_ADAPTIVE_AVG_POOL3D_METAX_ADAPTIVE_AVG_POOL3D_METAX_H_ +#define INFINIOP_OPS_ADAPTIVE_AVG_POOL3D_METAX_ADAPTIVE_AVG_POOL3D_METAX_H_ + +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(metax) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca new file mode 100644 index 000000000..48794fe17 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca @@ -0,0 +1,88 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cuh" +#include "adaptive_avg_pool3d_metax.h" + +template +INFINIOP_METAX_KERNEL +adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); +} + +namespace op::adaptive_avg_pool3d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides, hcStream_t stream) { + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + // Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + hcMemcpyAsync(x_strides, _info.x_strides.data(), + 
_info.x_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); + hcMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::adaptive_avg_pool3d::metax \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h new file mode 100644 index 000000000..a99c02483 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h @@ -0,0 +1,42 @@ +#ifndef ADAPTIVE_AVG_POOL3D_KERNEL_H_ +#define ADAPTIVE_AVG_POOL3D_KERNEL_H_ + +template +__device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C, + size_t x_d, size_t x_h, size_t x_w, + size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides) { + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = N * C * y_d * y_h * y_w; + size_t n, c, od, oh, ow; + n = index / (C * y_d * y_h * y_w); + c = (index / (y_d * y_h * y_w)) % C; + od = (index / (y_h * y_w)) % y_d; + oh = (index / y_w) % y_h; + ow = index % y_w; + + if (index < total) { + size_t x_d_start = (od * x_d) / y_d; + size_t x_d_end = ((od + 1) * x_d + y_d - 1) / y_d; + size_t x_h_start = (oh * x_h) / y_h; + size_t x_h_end = ((oh + 1) * x_h + y_h - 1) / y_h; + size_t x_w_start = (ow * x_w) / y_w; + size_t x_w_end = ((ow + 1) * x_w + y_w - 1) / y_w; + + T sum = static_cast(0); + size_t count = (x_d_end - x_d_start) * (x_h_end - x_h_start) * (x_w_end - x_w_start); + for (size_t id = x_d_start; id < x_d_end; ++id) { + for (size_t ih = x_h_start; ih < x_h_end; ++ih) { + for (size_t iw = x_w_start; iw < x_w_end; ++iw) { + size_t x_index = n * x_strides[0] + c * x_strides[1] + id * x_strides[2] + ih * x_strides[3] + iw * x_strides[4]; + sum += x[x_index]; + } + } + } + size_t y_index = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + y[y_index] = sum / static_cast(count); + } +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h new file mode 100644 index 000000000..37967bf16 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h @@ -0,0 +1,7 @@ +#ifndef ADAPTIVE_AVG_POOL3D_MOORE_H_ +#define ADAPTIVE_AVG_POOL3D_MOORE_H_ +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(moore) + +#endif // ADAPTIVE_AVG_POOL3D_MOORE_H_ diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu new file mode 100644 index 000000000..9dd2259bf --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu @@ -0,0 +1,89 @@ +#include 
"../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "adaptive_avg_pool3d_moore.h" +#include "adaptive_avg_pool3d_kernel.h" + +template +INFINIOP_MOORE_KERNEL +adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); +} +namespace op::adaptive_avg_pool3d::moore { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides, musaStream_t stream) { + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + musaStream_t stream = (musaStream_t)stream_; + + // Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + musaMemcpyAsync(x_strides, _info.x_strides.data(), + _info.x_strides.size() * sizeof(ptrdiff_t), + musaMemcpyHostToDevice, stream); + musaMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + musaMemcpyHostToDevice, stream); + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if(_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048){ + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + }else{ + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} +} \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu 
b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu new file mode 100644 index 000000000..b522b2da3 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "adaptive_avg_pool3d_nvidia.cuh" +#include + +template +INFINIOP_CUDA_KERNEL +adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); +} + +namespace op::adaptive_avg_pool3d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides, cudaStream_t stream) { + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + + // Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + cudaMemcpyAsync(x_strides, _info.x_strides.data(), + _info.x_strides.size() * sizeof(ptrdiff_t), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + cudaMemcpyHostToDevice, stream); + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else { + 
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::adaptive_avg_pool3d::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh new file mode 100644 index 000000000..973a0a54a --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_AVG_POOL3D_NVIDIA_H__ +#define __ADAPTIVE_AVG_POOL3D_NVIDIA_H__ + +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/operator.cc b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc new file mode 100644 index 000000000..738402862 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc @@ -0,0 +1,216 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/adaptive_avg_pool3d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/adaptive_avg_pool3d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/adaptive_avg_pool3d_nvidia.cuh" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/adaptive_avg_pool3d_ascend.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/adaptive_avg_pool3d_bang.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/adaptive_avg_pool3d_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/adaptive_avg_pool3d_kunlun.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/adaptive_avg_pool3d_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( + infiniopHandle_t handle, + infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + size_t *output_size) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::adaptive_avg_pool3d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + output_size) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( + infiniopAdaptiveAvgPool3DDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + 
GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAdaptiveAvgPool3D( + infiniopAdaptiveAvgPool3DDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream); + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } +#undef CALCULATE + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + } +#undef DELETE + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/addr/addr.h b/src/infiniop/ops/addr/addr.h new file mode 100644 index 000000000..6cf5b20fc --- /dev/null +++ b/src/infiniop/ops/addr/addr.h @@ -0,0 +1,95 @@ +#ifndef INFINIOP_ADDR_DESCRIPTOR_H_ +#define INFINIOP_ADDR_DESCRIPTOR_H_ +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/addr.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::addr::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AddrInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AddrInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), 
\ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t vec1_desc, \ + infiniopTensorDescriptor_t vec2_desc, \ + float beta, \ + float alpha); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *out, \ + const void *input, \ + const void *vec1, \ + const void *vec2, \ + void *stream) const; \ + }; \ + } + +struct AddrInfo { + infiniDtype_t dtype; + size_t vec1_size; + size_t vec2_size; + float beta; + float alpha; + ptrdiff_t input_stride0, input_stride1; + ptrdiff_t output_stride0, output_stride1; + ptrdiff_t vec1_stride; + ptrdiff_t vec2_stride; + + static utils::Result + create(infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta = 1.0f, float alpha = 1.0f) { + CHECK_OR_RETURN(input_desc->ndim() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(vec1_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(vec2_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(input_desc->dim(0) == vec1_desc->dim(0) && input_desc->dim(1) == vec2_desc->dim(0), INFINI_STATUS_BAD_TENSOR_SHAPE); + const infiniDtype_t data_type = input_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + return utils::Result(AddrInfo{ + data_type, + vec1_desc->dim(0), + vec2_desc->dim(0), + beta, + alpha, + input_desc->stride(0), + input_desc->stride(1), + output_desc->stride(0), + output_desc->stride(1), + vec1_desc->stride(0), + vec2_desc->stride(0), + }); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.cc b/src/infiniop/ops/addr/cpu/addr_cpu.cc new file mode 100644 index 000000000..52339164c --- /dev/null +++ b/src/infiniop/ops/addr/cpu/addr_cpu.cc @@ -0,0 +1,93 @@ +#include "addr_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +namespace op::addr::cpu { +Descriptor::~Descriptor() = default; + +// Template function to handle different data types +template +infiniStatus_t addr_impl(Tdata *out, + const Tdata *input, + const Tdata *vec1, + const Tdata *vec2, + const AddrInfo &info, + void *workspace, + size_t workspace_size) { + size_t n = info.vec1_size; + size_t m = info.vec2_size; + float beta = info.beta; + float alpha = info.alpha; + +#pragma omp parallel for collapse(2) + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < m; ++j) { + if constexpr (std::is_same::value || std::is_same::value) { + float a = utils::cast(vec1[i * info.vec1_stride]); + float b = utils::cast(vec2[j * info.vec2_stride]); + float c = utils::cast(input[i * info.input_stride0 + j * info.input_stride1]); + out[i * info.output_stride0 + j * info.output_stride1] = utils::cast(alpha * a * b + beta * c); + } else { + float a = vec1[i * info.vec1_stride], b = vec2[j * info.vec2_stride], c = input[i * info.input_stride0 + j * info.input_stride1]; + out[i * info.output_stride0 + j * info.output_stride1] = utils::cast(alpha * a * b + beta * c); + } + } + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + 
infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, + float alpha) { + + auto handle = reinterpret_cast(handle_); + auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor(info.take(), 0, nullptr, + INFINI_DEVICE_CPU, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *vec1, + const void *vec2, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F32: + return addr_impl(reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(vec1), + reinterpret_cast(vec2), + _info, workspace, workspace_size); + break; + case INFINI_DTYPE_F16: + return addr_impl(reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(vec1), + reinterpret_cast(vec2), + _info, workspace, workspace_size); + case INFINI_DTYPE_BF16: + return addr_impl(reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(vec1), + reinterpret_cast(vec2), + _info, workspace, workspace_size); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::cpu \ No newline at end of file diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.h b/src/infiniop/ops/addr/cpu/addr_cpu.h new file mode 100644 index 000000000..e606c1816 --- /dev/null +++ b/src/infiniop/ops/addr/cpu/addr_cpu.h @@ -0,0 +1,7 @@ +#ifndef INFINIOP_ADDR_CPU_H_ +#define INFINIOP_ADDR_CPU_H_ +#include "../addr.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/cuda/kernel.cu b/src/infiniop/ops/addr/cuda/kernel.cu new file mode 100644 index 000000000..b6af54e05 --- /dev/null +++ b/src/infiniop/ops/addr/cuda/kernel.cu @@ -0,0 +1,38 @@ +#ifndef __ADDR_CUDA_H__ +#define __ADDR_CUDA_H__ +#include +#include + +template +__device__ void addr_kernel(T *out, const T *input, const T *vec1, + const T *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + size_t j = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= n || j >= m) { + return; + } + size_t out_idx = i * out_stride_0 + j * out_stride_1; + size_t in_idx = i * in_stride_0 + j * in_stride_1; + size_t vec1_idx = i * stride1; + size_t vec2_idx = j * stride2; + T in_val = input[in_idx]; + T vec1_val = vec1[vec1_idx]; + T vec2_val = vec2[vec2_idx]; + T out_val; + if constexpr (std::is_same_v) { + out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2half(alpha)), + __hmul(__float2half(beta), in_val)); + } else if constexpr (std::is_same_v) { + out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2bfloat16(alpha)), + __hmul(__float2bfloat16(beta), in_val)); + } else { + out_val = beta * in_val + alpha * vec1_val * vec2_val; + } + + out[out_idx] = out_val; + __syncthreads(); +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/metax/addr_metax.h b/src/infiniop/ops/addr/metax/addr_metax.h new file mode 100644 index 000000000..9a8acdbe2 --- /dev/null +++ b/src/infiniop/ops/addr/metax/addr_metax.h @@ -0,0 +1,8 @@ +#ifndef INFINIOP_ADDR_METAX_H_ +#define INFINIOP_ADDR_METAX_H_ + +#include "../addr.h" + +DESCRIPTOR(metax) + +#endif \ No newline at end of file diff --git 
a/src/infiniop/ops/addr/metax/addr_nvidia.maca b/src/infiniop/ops/addr/metax/addr_nvidia.maca new file mode 100644 index 000000000..443d59cff --- /dev/null +++ b/src/infiniop/ops/addr/metax/addr_nvidia.maca @@ -0,0 +1,108 @@ + +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cu" +#include "addr_metax.h" +#include "infinicore.h" +#include + +template +INFINIOP_METAX_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, + const Tdata *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); +} +namespace op::addr::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, float alpha) { + + auto info = + AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t +launchKernel(infiniDtype_t dtype, void *output, const void *input, + const void *vec1, const void *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, hcStream_t stream) { + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<__hpcc_bfloat16><<>>( + (__hpcc_bfloat16 *)output, (__hpcc_bfloat16 *)input, + (__hpcc_bfloat16 *)vec1, (__hpcc_bfloat16 *)vec2, n, m, beta, alpha, + stride1, stride2, out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *input, + const void *vec1, const void *vec2, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } 
else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::metax \ No newline at end of file diff --git a/src/infiniop/ops/addr/moore/addr_kernel.h b/src/infiniop/ops/addr/moore/addr_kernel.h new file mode 100644 index 000000000..49e83aced --- /dev/null +++ b/src/infiniop/ops/addr/moore/addr_kernel.h @@ -0,0 +1,36 @@ +#ifndef __ADDR_MOORE_H__ +#define __ADDR_MOORE_H__ + +template +__device__ void addr_kernel(T *out, const T *input, const T *vec1, + const T *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + size_t j = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= n || j >= m) { + return; + } + size_t out_idx = i * out_stride_0 + j * out_stride_1; + size_t in_idx = i * in_stride_0 + j * in_stride_1; + size_t vec1_idx = i * stride1; + size_t vec2_idx = j * stride2; + T in_val = input[in_idx]; + T vec1_val = vec1[vec1_idx]; + T vec2_val = vec2[vec2_idx]; + T out_val; + if constexpr (std::is_same_v) { + out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2half(alpha)), + __hmul(__float2half(beta), in_val)); + } else if constexpr (std::is_same_v) { + float a = __bfloat162float(vec1_val), b = __bfloat162float(vec2_val), in = __bfloat162float(in_val); + out_val = __float2bfloat16_rn(a * b * alpha + in * beta); + } else { + out_val = beta * in_val + alpha * vec1_val * vec2_val; + } + + out[out_idx] = out_val; + __syncthreads(); +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/moore/addr_moore.h b/src/infiniop/ops/addr/moore/addr_moore.h new file mode 100644 index 000000000..d32182597 --- /dev/null +++ b/src/infiniop/ops/addr/moore/addr_moore.h @@ -0,0 +1,8 @@ +#ifndef INFINIOP_ADDR_MOORE_H_ +#define INFINIOP_ADDR_MOORE_H_ + +#include "../addr.h" + +DESCRIPTOR(moore) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/moore/addr_moore.mu b/src/infiniop/ops/addr/moore/addr_moore.mu new file mode 100644 index 000000000..95626044b --- /dev/null +++ b/src/infiniop/ops/addr/moore/addr_moore.mu @@ -0,0 +1,108 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "addr_kernel.h" +#include "addr_moore.h" +#include "infinicore.h" +#include + +template +INFINIOP_MOORE_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, + const Tdata *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); +} +namespace op::addr::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + 
infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, float alpha) { + + auto info = + AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(infiniDtype_t dtype, void *output, + const void *input, const void *vec1, + const void *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, + musaStream_t stream) { + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<<>>( + (cuda_bfloat16 *)output, (cuda_bfloat16 *)input, (cuda_bfloat16 *)vec1, + (cuda_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2, + out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *input, + const void *vec1, const void *vec2, + void *stream_) const { + musaStream_t stream = (musaStream_t)stream_; + + if (_opaque->internal->maxThreadsPerBlock() ==MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::moore \ No newline at end of file diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cu b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu new file mode 100644 index 000000000..cbd5279ce --- /dev/null +++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu @@ -0,0 +1,114 @@ + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cu" +#include "addr_nvidia.cuh" +#include "infinicore.h" +#include + +template +INFINIOP_CUDA_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, + const Tdata *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t 
out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); +} +namespace op::addr::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, float alpha) { + + auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(infiniDtype_t dtype, void *output, + const void *input, const void *vec1, + const void *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, + cudaStream_t stream) { + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<__nv_bfloat16><<>>( + (__nv_bfloat16 *)output, (__nv_bfloat16 *)input, (__nv_bfloat16 *)vec1, + (__nv_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2, + out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *input, + const void *vec1, const void *vec2, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + 
_info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh new file mode 100644 index 000000000..9ebe7f2c3 --- /dev/null +++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef INFINIOP_ADDR_NVIDIA_H_ +#define INFINIOP_ADDR_NVIDIA_H_ +#include "../addr.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/operator.cc b/src/infiniop/ops/addr/operator.cc new file mode 100644 index 000000000..6e3724743 --- /dev/null +++ b/src/infiniop/ops/addr/operator.cc @@ -0,0 +1,222 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/addr.h" + +#ifdef ENABLE_CPU_API +#include "cpu/addr_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/addr_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/addr_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/addr_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAddrDescriptor( + infiniopHandle_t handle, + infiniopAddrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t vec1, + infiniopTensorDescriptor_t vec2, + float beta, + float alpha) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::addr::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out, \ + input, \ + vec1, \ + vec2, \ + beta, \ + alpha) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetAddrWorkspaceSize( + infiniopAddrDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef GET 
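+    // addr computes out[i][j] = beta * input[i][j] + alpha * vec1[i] * vec2[j] (see the CPU reference implementation above).
+    // Illustrative end-to-end call sequence for this operator (a sketch only; the handle, tensor
+    // descriptors, device buffers, workspace and stream are assumed to be set up elsewhere):
+    //   infiniopAddrDescriptor_t addr_desc;
+    //   infiniopCreateAddrDescriptor(handle, &addr_desc, out_desc, in_desc, v1_desc, v2_desc, /*beta=*/1.0f, /*alpha=*/1.0f);
+    //   size_t ws_size = 0;
+    //   infiniopGetAddrWorkspaceSize(addr_desc, &ws_size);
+    //   infiniopAddr(addr_desc, workspace, ws_size, out, in, v1, v2, stream);
+    //   infiniopDestroyAddrDescriptor(addr_desc);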
+ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAddr( + infiniopAddrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *vec1, + const void *vec2, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, out, input, vec1, vec2, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CALCULATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef DELETE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/argwhere.h b/src/infiniop/ops/argwhere/argwhere.h new file mode 100644 index 000000000..afb067576 --- /dev/null +++ b/src/infiniop/ops/argwhere/argwhere.h @@ -0,0 +1,75 @@ +#ifndef __INFINIOP_ARGWHERE_H__ +#define __INFINIOP_ARGWHERE_H__ +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infinicore.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::argwhere::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ArgwhereInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + ArgwhereInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t 
workspace_size, \ + void **y, \ + size_t *count, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class ArgwhereInfo { +private: + ArgwhereInfo() = default; + +public: + infiniDtype_t dtype; + std::vector strides; + std::vector shapes; + size_t num_elements; + + static utils::Result + create( + infiniopTensorDescriptor_t x_desc) { + CHECK_OR_RETURN(x_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = x_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F32); + + return utils::Result(ArgwhereInfo{ + data_type, + x_desc->strides(), + x_desc->shape(), + x_desc->numel()}); + } +}; +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc new file mode 100644 index 000000000..6a92047a2 --- /dev/null +++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc @@ -0,0 +1,80 @@ +#include "argwhere_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "infinicore.h" +#include +#include +#include +namespace op::argwhere::cpu { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateArgWhere( + const ArgwhereInfo &info, + void *workspace, + size_t workspace_size, + void **y, + size_t *count, + const void *x) { + + const Tdata *x_data = reinterpret_cast(x); + // int64_t *y_data = reinterpret_cast(y); + std::vector positions; + // #pragma omp parallel for + for (size_t i = 0; i < info.num_elements; i++) { + size_t pos = 0, tem = i; + std::vector position(info.strides.size()); + for (int j = info.strides.size() - 1; j >= 0; j--) { + position[j] = tem % info.shapes[j]; + tem /= info.shapes[j]; + pos += position[j] * info.strides[j]; + } + if (fabs(x_data[pos] - 0.0f) > 1e-5) { + for (auto p : position) { + positions.push_back(p); + } + } + } + + *y = new int64_t[positions.size()]; + memcpy(*y, positions.data(), positions.size() * sizeof(int64_t)); + *count = positions.size() / info.strides.size(); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void **y, + size_t *count, + const void *x, + void *stream) const { + switch (_info.dtype) { + case INFINI_DTYPE_F32: + return calculateArgWhere(_info, workspace, workspace_size, y, count, x); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::cpu \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h new file mode 100644 index 000000000..1e6461e75 --- /dev/null +++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_ARGWHERE_CPU_API_H__ +#define __INFINIOP_ARGWHERE_CPU_API_H__ + +#include "../argwhere.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/cuda/kernel.cuh b/src/infiniop/ops/argwhere/cuda/kernel.cuh new file mode 100644 index 000000000..ff4aae126 --- /dev/null +++ b/src/infiniop/ops/argwhere/cuda/kernel.cuh @@ -0,0 +1,215 @@ +#ifndef ARGWHERE_NVIDIA_KERNEL_H +#define ARGWHERE_NVIDIA_KERNEL_H +#include +#include + +__device__ void 
Index2Pos(size_t index, size_t ndim, const size_t *shapes, + size_t *pos) { + for (int i = ndim - 1; i >= 0; i--) { + pos[i] = index % shapes[i]; + index /= shapes[i]; + } +} +__device__ size_t pos2dest(size_t *pos, size_t ndim, const ptrdiff_t *strides) { + size_t dest = 0; + for (size_t i = 0; i < ndim; i++) { + dest += pos[i] * strides[i]; + } + return dest; +} + +template <typename T> +__global__ void parallel_block_argwhere_kernel(T *data, int64_t *results, + size_t N, const size_t *shapes, const ptrdiff_t *strides, size_t ndim, size_t *count) { + extern __shared__ size_t tmp[]; + size_t pos1[5], pos2[5]; // multi-dimensional indices of the two elements handled by this thread + bool is_zero1 = false, is_zero2 = false; + int tid = threadIdx.x; + int leaf_num = blockDim.x * 2; // equals the length of tmp + + if (tid * 2 < N) { + Index2Pos(tid * 2, ndim, shapes, pos1); + is_zero1 = fabs(data[pos2dest(pos1, ndim, strides)]) <= 1e-5; + tmp[tid * 2] = !is_zero1; + } + if (tid * 2 + 1 < N) { + Index2Pos(tid * 2 + 1, ndim, shapes, pos2); + is_zero2 = fabs(data[pos2dest(pos2, ndim, strides)]) <= 1e-5; + tmp[tid * 2 + 1] = !is_zero2; + } + + __syncthreads(); + + // up-sweep (reduce) phase of the block-level exclusive scan + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp[bi] += tmp[ai]; + } + offset *= 2; + __syncthreads(); + } + + if (tid == 0) { + tmp[leaf_num - 1] = 0; + } + __syncthreads(); + + // down-sweep phase: turn the reduction tree into an exclusive prefix sum + for (int d = 1; d < leaf_num; d *= 2) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + + int v = tmp[ai]; + tmp[ai] = tmp[bi]; + tmp[bi] += v; + } + __syncthreads(); + } + + // write the final results + if (!is_zero1 && tid * 2 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid] * ndim + i] = pos1[i]; + } + } + if (!is_zero2 && tid * 2 + 1 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid + 1] * ndim + i] = pos2[i]; + } + } + if (tid == blockDim.x - 1) { + // printf("blockIdxDim = %d\n", blockDim.x); + *count = tmp[N - 1] + (N == blockDim.x * 2 ? 1 : tmp[N] != tmp[N - 1]); + // printf("finally: count = %d\n", tmp[leaf_num - 1]); + } +} + +template <typename T> +__global__ void parallel_block_scan_kernel(size_t N, int64_t *pre_sum) { + // single block scan + extern __shared__ int64_t tmp[]; + int tid = threadIdx.x; + int leaf_num = blockDim.x * 2; // equals the length of tmp + + tmp[tid * 2] = tid * 2 < N - 1 ? pre_sum[tid * 2] : 0; + tmp[tid * 2 + 1] = tid * 2 + 1 < N - 1 ? 
pre_sum[tid * 2 + 1] : 0; + __syncthreads(); + + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp[bi] += tmp[ai]; + } + offset *= 2; + __syncthreads(); + } + if (tid == 0) { + tmp[leaf_num - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < leaf_num; d <<= 1) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + + int v = tmp[ai]; + tmp[ai] = tmp[bi]; + tmp[bi] += v; + } + __syncthreads(); + } + if (tid * 2 < N) { + pre_sum[tid * 2] = tmp[tid * 2]; + } + if (tid * 2 + 1 < N) { + pre_sum[tid * 2 + 1] = tmp[tid * 2 + 1]; + } +} + +template +__global__ void +parallel_large_argwhere_kernel(const T *data, int64_t *block_sum, + int64_t *results, size_t N, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim) { + // To be implemented for large N + extern __shared__ int64_t tmp_argwhere[]; + int bid = blockIdx.x, tid = threadIdx.x; + size_t pos1[5], pos2[5]; // 两个数的在tensor中的索引 + bool is_zero1 = false, is_zero2 = false; + int block_offset = bid * blockDim.x * 2, leaf_num = blockDim.x * 2; + tmp_argwhere[2 * tid] = tmp_argwhere[2 * tid + 1] = 0; + if (block_offset + tid * 2 < N) { + Index2Pos(block_offset + tid * 2, ndim, shapes, pos1); + is_zero1 = fabs(data[pos2dest(pos1, ndim, strides)]) <= 1e-9; + tmp_argwhere[2 * tid] = !is_zero1; + } + if (block_offset + tid * 2 + 1 < N) { + Index2Pos(block_offset + tid * 2 + 1, ndim, shapes, pos2); + is_zero2 = fabs(data[pos2dest(pos2, ndim, strides)]) <= 1e-9; + tmp_argwhere[2 * tid + 1] = !is_zero2; + } + __syncthreads(); + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp_argwhere[bi] += tmp_argwhere[ai]; + } + offset *= 2; + __syncthreads(); + } + if (tid == 0) { + block_sum[bid] = tmp_argwhere[leaf_num - 1]; + // printf("tmp_argwhere[%d] = %lld\n", leaf_num - 1, + // tmp_argwhere[leaf_num - 1]); + // printf("block_sum[%d] = %lld\n", bid, block_sum[bid]); + tmp_argwhere[leaf_num - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < leaf_num; d <<= 1) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + int v = tmp_argwhere[ai]; + tmp_argwhere[ai] = tmp_argwhere[bi]; + tmp_argwhere[bi] += v; + } + __syncthreads(); + } + + if (!is_zero1 && block_offset + tid * 2 < N) { + for (int i = 0; i < ndim; i++) { + results[(tmp_argwhere[2 * tid] + block_offset) * ndim + i] = pos1[i]; + } + } + if (!is_zero2 && block_offset + tid * 2 + 1 < N) { + for (int i = 0; i < ndim; i++) { + results[(tmp_argwhere[2 * tid + 1] + block_offset) * ndim + i] = pos2[i]; + } + } +} + +__global__ void add_block_offset_kernel(int64_t *results, int64_t *tmp, + int64_t *block_sums, size_t ndim) { + int bid = blockIdx.x, tid = threadIdx.x; + size_t block_offset = block_sums[bid], origin_offset = bid * blockDim.x * 2; + if (2 * tid < block_sums[bid + 1] - block_sums[bid]) { + for (int i = 0; i < ndim; i++) { + results[(block_offset + 2 * tid) * ndim + i] = tmp[(origin_offset + tid * 2) * ndim + i]; + } + } + if (2 * tid + 1 < block_sums[bid + 1] - block_sums[bid]) { + for (int i = 0; i < ndim; i++) { + results[(block_offset + 2 * tid + 1) * ndim + i] = tmp[(origin_offset + tid * 2 + 1) * ndim + i]; + } + } +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.h 
b/src/infiniop/ops/argwhere/metax/argwhere_metax.h new file mode 100644 index 000000000..4e85f870c --- /dev/null +++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.h @@ -0,0 +1,8 @@ +#ifndef ARGWHERE_METAX_H +#define ARGWHERE_METAX_H + +#include "../argwhere.h" + +DESCRIPTOR(metax) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.maca b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca new file mode 100644 index 000000000..7f3c5fb4a --- /dev/null +++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca @@ -0,0 +1,94 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cuh" +#include "argwhere_metax.h" +#include "infinicore.h" +#include +#include + +infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N, + size_t M, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim, + infiniDtype_t dtype, size_t *count) { + + if (dtype == INFINI_DTYPE_F32) { + parallel_block_argwhere_kernel<<<1, M / 2, M * sizeof(size_t)>>>( + (float *)data, results, N, shapes, strides, ndim, count); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +namespace op::argwhere::metax { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() + sizeof(size_t); + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +int nextPowerOfTwo(int x) { + int power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void **y, size_t *count, const void *x, + void *stream) const { + hcStream_t hc_stream = static_cast<hcStream_t>(stream); + size_t ndim = _info.strides.size(); + ptrdiff_t *strides = static_cast<ptrdiff_t *>(workspace); + size_t *shapes = reinterpret_cast<size_t *>(strides + ndim); + int64_t *result = reinterpret_cast<int64_t *>(shapes + ndim); + size_t *count_cuda = reinterpret_cast<size_t *>(result + _info.num_elements * ndim); + + hcMemcpyAsync(shapes, _info.shapes.data(), + _info.shapes.size() * sizeof(size_t), hcMemcpyHostToDevice, + hc_stream); + hcMemcpyAsync(strides, _info.strides.data(), + _info.strides.size() * sizeof(ptrdiff_t), hcMemcpyHostToDevice, + hc_stream); + // hcStreamSynchronize(hc_stream); + size_t M = nextPowerOfTwo(_info.num_elements); + CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides, + ndim, INFINI_DTYPE_F32, count_cuda)); + // read the match count (count_cuda) back from device memory on the same stream + hcMemcpyAsync(count, count_cuda, sizeof(size_t), hcMemcpyDeviceToHost, hc_stream); + hcStreamSynchronize(hc_stream); + + // copy the index results back into a host-allocated buffer + *y = new int64_t[(*count) * ndim]; + // cudaStreamSynchronize(cuda_stream); + + // spdlog::debug("count_cuda:{}", *count_cuda); + // spdlog::debug("N:{}", N); + + hcMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim, + hcMemcpyDeviceToHost, hc_stream); + + // hcStreamSynchronize(hc_stream); + // for (size_t i = 0; i < (*count) * ndim; i++) { + // spdlog::debug("(*y)[{}]:{}", i, static_cast(*y)[i]); + // } + // hcFreeAsync(result, hc_stream); + // hcFreeAsync(count_cuda, 
hc_stream); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::metax \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/moore/argwhere_kernel.h b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h new file mode 100644 index 000000000..dd67c877c --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h @@ -0,0 +1,91 @@ +#ifndef ARGWHERE_MOORE_KERNEL_H +#define ARGWHERE_MOORE_KERNEL_H + +__device__ void Index2Pos(size_t index, size_t ndim, const size_t *shapes, + size_t *pos) { + for (int i = ndim - 1; i >= 0; i--) { + pos[i] = index % shapes[i]; + index /= shapes[i]; + } +} +__device__ size_t pos2dest(size_t *pos, size_t ndim, const size_t *shapes, + const ptrdiff_t *strides) { + size_t dest = 0; + for (size_t i = 0; i < ndim; i++) { + dest += pos[i] * strides[i]; + } + return dest; +} +template +__global__ void parallel_block_argwhere_kernel(T *data, int64_t *results, + size_t N, const size_t *shapes, + const ptrdiff_t *strides, + size_t ndim, size_t *count) { + extern __shared__ size_t tmp[]; + size_t pos1[5], pos2[5]; // 两个数的在tensor中的索引 + bool is_zero1 = false, is_zero2 = false; + int tid = threadIdx.x; + int leaf_num = blockDim.x * 2; + + if (tid * 2 < N) { + Index2Pos(tid * 2, ndim, shapes, pos1); + is_zero1 = fabs(data[pos2dest(pos1, ndim, shapes, strides)]) <= 1e-5; + tmp[tid * 2] = !is_zero1; + } else { + tmp[tid * 2] = 0; + } + + if (tid * 2 + 1 < N) { + Index2Pos(tid * 2 + 1, ndim, shapes, pos2); + is_zero2 = fabs(data[pos2dest(pos2, ndim, shapes, strides)]) <= 1e-5; + tmp[tid * 2 + 1] = !is_zero2; + } else { + tmp[tid * 2 + 1] = 0; + } + + __syncthreads(); + + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp[bi] += tmp[ai]; + } + offset *= 2; + __syncthreads(); + } + + if (tid == 0) { + tmp[leaf_num - 1] = 0; + } + __syncthreads(); + + for (int d = 1; d < leaf_num; d *= 2) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + + int v = tmp[ai]; + tmp[ai] = tmp[bi]; + tmp[bi] += v; + } + __syncthreads(); + } + if (!is_zero1 && tid * 2 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid] * ndim + i] = pos1[i]; + } + } + if (!is_zero2 && tid * 2 + 1 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid + 1] * ndim + i] = pos2[i]; + } + } + if (tid == blockDim.x - 1) { + *count = tmp[N - 1] + (N == blockDim.x * 2 ? 
1 : tmp[N] != tmp[N - 1]); + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.h b/src/infiniop/ops/argwhere/moore/argwhere_moore.h new file mode 100644 index 000000000..63149291a --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.h @@ -0,0 +1,8 @@ +#ifndef ARGWHERE_MOORE_H +#define ARGWHERE_MOORE_H + +#include "../argwhere.h" + +DESCRIPTOR(moore) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu new file mode 100644 index 000000000..4d5007731 --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu @@ -0,0 +1,107 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "argwhere_kernel.h" +#include "argwhere_moore.h" +#include "infinicore.h" +#include + +// template +// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N, +// size_t M, const size_t *shapes, +// const ptrdiff_t *strides, size_t ndim, +// size_t *count) { +// parallel_block_argwhere_kernel<<<1, M / 2, M>>>( +// data, results, N, shapes, strides, ndim, count); +// } + + +infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N, + size_t M, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim, + infiniDtype_t dtype, size_t *count) { + + if (dtype == INFINI_DTYPE_F32) { + parallel_block_argwhere_kernel<<<1, M / 2, M*sizeof(size_t)>>>( + (float *)data, results, N, shapes, strides, ndim, count); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +namespace op::argwhere::moore { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() + + sizeof(size_t); + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +int nextPowerOfTwo(int x) { + int power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void **y, size_t *count, const void *x, + void *stream) const { + musaStream_t moore_stream = static_cast(stream); + size_t ndim = _info.strides.size(); + ptrdiff_t *strides = static_cast(workspace); + size_t *shapes = reinterpret_cast(strides + ndim); + int64_t *result = reinterpret_cast(shapes + ndim); + size_t *count_cuda = + reinterpret_cast(result + _info.num_elements * ndim); + + musaMemcpyAsync(shapes, _info.shapes.data(), + _info.shapes.size() * sizeof(size_t), musaMemcpyHostToDevice, + moore_stream); + musaMemcpyAsync(strides, _info.strides.data(), + _info.strides.size() * sizeof(ptrdiff_t), + musaMemcpyHostToDevice, moore_stream); + // musaStreamSynchronize(moore_stream); + size_t M = nextPowerOfTwo(_info.num_elements); +// musaStreamSynchronize(moore_stream); + CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides, + ndim, INFINI_DTYPE_F32, count_cuda)); + // 从设备内存中读取 count_cuda 的值 + musaMemcpyAsync(count, count_cuda, sizeof(size_t), musaMemcpyDeviceToHost, + moore_stream); + 
musaStreamSynchronize(moore_stream); + + //写回结果 + *y = new int64_t[(*count) * ndim]; + // cudaStreamSynchronize(cuda_stream); + + + musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim, + musaMemcpyDeviceToHost, moore_stream); + + // cudaStreamSynchronize(cuda_stream); + // for (size_t i = 0; i < (*count) * ndim; i++) { + // spdlog::debug("(*y)[{}]:{}", i, static_cast(*y)[i]); + // } + // cudaFreeAsync(result, cuda_stream); + // cudaFreeAsync(count_cuda, cuda_stream); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::moore \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu new file mode 100644 index 000000000..20e46f7bc --- /dev/null +++ b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu @@ -0,0 +1,107 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "argwhere_nvidia.cuh" +#include "infinicore.h" +#include +#include + +int nextPowerOfTwo(int x) { + int power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +infiniStatus_t launchKernel(const void *data, int64_t *tmp, int64_t *results, + int64_t *block_sum, size_t N, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim, + infiniDtype_t dtype, size_t *count, + int maxThreadsPerBlock, cudaStream_t stream) { + + if (dtype == INFINI_DTYPE_F32) { + int block_size = maxThreadsPerBlock * 2; + int num_blocks = (N + block_size - 1) / block_size; + parallel_large_argwhere_kernel + <<>>( + static_cast(data), block_sum, tmp, N, shapes, + strides, ndim); + if (num_blocks > 1) { + // 计算前缀和 + parallel_block_scan_kernel + <<<1, maxThreadsPerBlock, sizeof(int64_t) * block_size>>>( + num_blocks + 1, block_sum); + // 重新整理结果 + add_block_offset_kernel<<>>( + results, tmp, block_sum, ndim); + cudaMemcpyAsync(count, &block_sum[num_blocks], sizeof(size_t), + cudaMemcpyDeviceToHost, stream); + } else { + cudaMemcpyAsync(count, &block_sum[0], sizeof(size_t), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(results, tmp, sizeof(int64_t) * (*count) * ndim, + cudaMemcpyDeviceToDevice, stream); + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +namespace op::argwhere::nvidia { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + Opaque *opaque = new Opaque{ + reinterpret_cast(handle)->internal()}; + size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() * 2 + sizeof(int64_t) * opaque->internal->maxThreadsPerBlock(); + *desc_ptr = new Descriptor(info.take(), workspace_size, opaque, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void **y, size_t *count, const void *x, + void *stream) const { + cudaStream_t cuda_stream = static_cast(stream); + size_t ndim = _info.strides.size(); + ptrdiff_t *strides = static_cast(workspace); + size_t *shapes = reinterpret_cast(strides + ndim); + int64_t *tmp = reinterpret_cast(shapes + ndim); + int64_t *result = reinterpret_cast(tmp + _info.num_elements * ndim); + int64_t *block_sum = reinterpret_cast(result + 
_info.num_elements * ndim); + + cudaMemcpyAsync(shapes, _info.shapes.data(), + _info.shapes.size() * sizeof(size_t), cudaMemcpyHostToDevice, + cuda_stream); + cudaMemcpyAsync(strides, _info.strides.data(), + _info.strides.size() * sizeof(ptrdiff_t), + cudaMemcpyHostToDevice, cuda_stream); + + CHECK_STATUS(launchKernel(x, tmp, result, block_sum, _info.num_elements, + shapes, strides, ndim, INFINI_DTYPE_F32, count, + _opaque->internal->maxThreadsPerBlock(), + cuda_stream)); + // 从设备内存中读取 count_cuda 的值 + + cudaStreamSynchronize(cuda_stream); + + // 写回结果 + *y = new int64_t[(*count) * ndim]; + + cudaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim, + cudaMemcpyDeviceToHost, cuda_stream); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::nvidia diff --git a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh new file mode 100644 index 000000000..fdc1dd8a5 --- /dev/null +++ b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef ARGWHERE_NVIDIA_H +#define ARGWHERE_NVIDIA_H + +#include "../argwhere.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/operator.cc b/src/infiniop/ops/argwhere/operator.cc new file mode 100644 index 000000000..d6c5dfa7f --- /dev/null +++ b/src/infiniop/ops/argwhere/operator.cc @@ -0,0 +1,200 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/argwhere.h" + +#ifdef ENABLE_CPU_API +#include "cpu/argwhere_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/argwhere_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/argwhere_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/argwhere_kunlun.cuh" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "cambricon/argwhere_cambricon.cuh" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/argwhere_moore.h" +#endif + +__C infiniStatus_t infiniopCreateArgwhereDescriptor( + infiniopHandle_t handle, + infiniopArgwhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::argwhere::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetArgwhereWorkspaceSize( + infiniopArgwhereDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif 
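+    // The workspace whose size is reported here is what the GPU backends use in calculate():
+    // it stages the shape/stride arrays, the device-side index buffers, and the match counter
+    // (see the nvidia, metax and moore implementations above).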
+#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopArgwhere( + infiniopArgwhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void **output, + size_t *count, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, count, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyArgwhereDescriptor(infiniopArgwhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc new file mode 100644 index 000000000..c95eb3c24 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -0,0 +1,52 @@ +#include "asin_cpu.h" + +namespace op::asin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + 
void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::cpu \ No newline at end of file diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h new file mode 100644 index 000000000..a07404598 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -0,0 +1,21 @@ +#ifndef __ASIN_CPU_H__ +#define __ASIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(asin, cpu) + +namespace op::asin::cpu { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::asin(input); + } +} AsinOp; +} // namespace op::asin::cpu + +#endif // __ASIN_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh new file mode 100644 index 000000000..2d4160ca6 --- /dev/null +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -0,0 +1,42 @@ +#ifndef __ASIN_CUDA_H__ +#define __ASIN_CUDA_H__ + +namespace op::asin::cuda { +typedef struct AsinOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float asin_f32_func(float x) const { + return asinf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(asin_f32_func(vf.x), asin_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = asin_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = asin_f32_func(f0); + float r1 = asin_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = asin_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return asin_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::asin(input); + } else { + return std::asin(input); + } + } +} AsinOp; +} // namespace op::asin::cuda + +#endif // __ASIN_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asin/metax/asin_metax.h b/src/infiniop/ops/asin/metax/asin_metax.h new file mode 100644 index 000000000..e1805508d --- /dev/null +++ b/src/infiniop/ops/asin/metax/asin_metax.h @@ -0,0 +1,8 @@ +#ifndef __ASIN_METAX_API_H__ +#define __ASIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(asin, metax) + +#endif // __ASIN_METAX_API_H__ diff --git a/src/infiniop/ops/asin/metax/asin_metax.maca b/src/infiniop/ops/asin/metax/asin_metax.maca new file mode 100644 index 000000000..89567ac01 --- /dev/null +++ b/src/infiniop/ops/asin/metax/asin_metax.maca @@ -0,0 +1,61 @@ +#include "../../../elementwise/metax/elementwise_metax.h" +#include "asin_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::asin::metax { + +Descriptor::~Descriptor() 
= default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::metax \ No newline at end of file diff --git a/src/infiniop/ops/asin/moore/asin_moore.h b/src/infiniop/ops/asin/moore/asin_moore.h new file mode 100644 index 000000000..49394ea9d --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore.h @@ -0,0 +1,8 @@ +#ifndef __ASIN_MOORE_API_H__ +#define __ASIN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(asin, moore) + +#endif // __ASIN_MOORE_API_H__ diff --git a/src/infiniop/ops/asin/moore/asin_moore.mu b/src/infiniop/ops/asin/moore/asin_moore.mu new file mode 100644 index 000000000..b5bbd0eb9 --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore.mu @@ -0,0 +1,60 @@ +#include "asin_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "asin_moore_kernel.h" + +namespace op::asin::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::AsinOp, cuda_bfloat16>(_info, workspace, 
output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::AsinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::AsinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::AsinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::moore diff --git a/src/infiniop/ops/asin/moore/asin_moore_kernel.h b/src/infiniop/ops/asin/moore/asin_moore_kernel.h new file mode 100644 index 000000000..0401417f2 --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore_kernel.h @@ -0,0 +1,42 @@ +#ifndef __ASIN_MOORE_KERNEL_H__ +#define __ASIN_MOORE_KERNEL_H__ + +namespace op::asin::moore { +typedef struct AsinOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float asin_f32_func(float x) const { + return asinf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(asin_f32_func(vf.x), asin_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = asin_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = asin_f32_func(f0); + float r1 = asin_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = asin_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return asin_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::asin(input); + } else { + return std::asin(input); + } + } +} AsinOp; +} // namespace op::asin::moore + +#endif // __ASIN_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu new file mode 100644 index 000000000..206c277b1 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asin_nvidia.cuh" + +namespace op::asin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>( + _info, 
workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh new file mode 100644 index 000000000..681879341 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASIN_CUDA_API_H__ +#define __ASIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asin, nvidia) + +#endif // __ASIN_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc new file mode 100644 index 000000000..41d2e7897 --- /dev/null +++ b/src/infiniop/ops/asin/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/asin_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/asin_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAsinDescriptor( + infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsin( + infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, 
workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.cc b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc new file mode 100644 index 000000000..b9afb2eb4 --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc @@ -0,0 +1,58 @@ +#include "fmin_cpu.h" + +namespace op::fmin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmin::cpu diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.h b/src/infiniop/ops/fmin/cpu/fmin_cpu.h new file mode 100644 index 000000000..f0010feea --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.h @@ -0,0 +1,26 @@ +#ifndef __FMIN_CPU_H__ +#define __FMIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(fmin, cpu) + +namespace op::fmin::cpu { 
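(For reference: FminOp below, like AsinOp and the other functors in this patch, follows the elementwise-operator convention: a num_inputs constant plus a templated operator() that produces one output element from one element of each input. The loop below is only an illustration of how such a functor is consumed; it stands in for, and is not, the actual machinery behind ELEMENTWISE_DESCRIPTOR and the elementwise framework.)

    // Illustrative only: applying a binary elementwise functor such as FminOp.
    template <typename T, typename Op>
    void apply_elementwise(const T *a, const T *b, T *c, size_t n, Op op) {
        for (size_t i = 0; i < n; ++i) {
            c[i] = op(a[i], b[i]); // for FminOp: the elementwise minimum of a and b
        }
    }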
+typedef struct FminOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v || std::is_same_v) { + float a_f = utils::cast(a); + float b_f = utils::cast(b); + float result = std::fminf(a_f, b_f); + return utils::cast(result); + } else { + return std::fmin(a, b); + } + } +} FminOp; +} // namespace op::fmin::cpu + +#endif // __FMIN_CPU_H__ diff --git a/src/infiniop/ops/fmin/cuda/kernel.cuh b/src/infiniop/ops/fmin/cuda/kernel.cuh new file mode 100644 index 000000000..fbe385302 --- /dev/null +++ b/src/infiniop/ops/fmin/cuda/kernel.cuh @@ -0,0 +1,33 @@ +#ifndef __FMIN_CUDA_H__ +#define __FMIN_CUDA_H__ + +#include +namespace op::fmin::cuda { +typedef struct FminOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { +#if defined(ENABLE_ILUVATAR_API) + if constexpr (std::is_same_v) { + float a_f = __bfloat162float(a), b_f = __bfloat162float(b); + return __float2bfloat16(fminf(a_f, b_f)); + } else if constexpr (std::is_same_v) { + float a_f = __half2float(a), b_f = __half2float(b); + return __float2half(fminf(a_f, b_f)); + } +#elif defined(ENABLE_NVIDIA_API) + if constexpr (std::is_same_v || std::is_same_v) { + return __hmin(a, b); + } +#endif + if constexpr (std::is_same_v) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } +} FminOp; +} // namespace op::fmin::cuda + +#endif // __ADD_CUDA_H__ diff --git a/src/infiniop/ops/fmin/metax/fmin_metax.h b/src/infiniop/ops/fmin/metax/fmin_metax.h new file mode 100644 index 000000000..9348fa599 --- /dev/null +++ b/src/infiniop/ops/fmin/metax/fmin_metax.h @@ -0,0 +1,8 @@ +#ifndef __FMIN_METAX_API_H__ +#define __FMIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(fmin, metax) + +#endif // __FMIN_METAX_API_H__ diff --git a/src/infiniop/ops/fmin/metax/fmin_metax.maca b/src/infiniop/ops/fmin/metax/fmin_metax.maca new file mode 100644 index 000000000..049b3806c --- /dev/null +++ b/src/infiniop/ops/fmin/metax/fmin_metax.maca @@ -0,0 +1,69 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "fmin_metax.h" + +namespace op::fmin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FminOp, half>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + 
return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FminOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::FminOp, int32_t>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::FminOp, int64_t>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::FminOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmin::metax diff --git a/src/infiniop/ops/fmin/moore/fmin_moore.h b/src/infiniop/ops/fmin/moore/fmin_moore.h new file mode 100644 index 000000000..f73d99e7d --- /dev/null +++ b/src/infiniop/ops/fmin/moore/fmin_moore.h @@ -0,0 +1,8 @@ +#ifndef __FMIN_MOORE_API_H__ +#define __FMIN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(fmin, moore) + +#endif // __FMIN_MOORE_API_H__ diff --git a/src/infiniop/ops/fmin/moore/fmin_moore.mu b/src/infiniop/ops/fmin/moore/fmin_moore.mu new file mode 100644 index 000000000..d19071e06 --- /dev/null +++ b/src/infiniop/ops/fmin/moore/fmin_moore.mu @@ -0,0 +1,66 @@ +#include "fmin_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "fmin_moore_kernel.h" + +namespace op::fmin::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::FminOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::FminOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::FminOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::FminOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, moore::FminOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, moore::FminOp, int64_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace 
op::fmin::moore
diff --git a/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h b/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h
new file mode 100644
index 000000000..cde58f26a
--- /dev/null
+++ b/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h
@@ -0,0 +1,27 @@
+#ifndef __FMIN_MOORE_KERNEL_H__
+#define __FMIN_MOORE_KERNEL_H__
+
+namespace op::fmin::moore {
+typedef struct FminOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        if constexpr (std::is_same_v) {
+            return __hmin2(a, b);
+        } else if constexpr (std::is_same_v) {
+            return __hmin(a, b);
+        } else if constexpr (std::is_same_v) {
+            float a_f = __bfloat162float(a);
+            float b_f = __bfloat162float(b);
+            return fminf(a_f, b_f);
+        } else if constexpr (std::is_same_v) {
+            return fminf(a, b);
+        } else {
+            return a < b ? a : b;
+        }
+    }
+} FminOp;
+} // namespace op::fmin::moore
+
+#endif // __FMIN_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu
new file mode 100644
index 000000000..bfe77c7bc
--- /dev/null
+++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu
@@ -0,0 +1,66 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "fmin_nvidia.cuh"
+
+namespace op::fmin::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t
+Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr,
+                   infiniopTensorDescriptor_t out_desc,
+                   std::vector input_desc_vec) {
+
+    auto handle = reinterpret_cast(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *output,
+                                     std::vector inputs,
+                                     void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::FminOp, half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::FminOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::FminOp, int32_t>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::FminOp, int64_t>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::FminOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::fmin::nvidia
diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh
new file mode 100644
index 000000000..f9def467b
--- /dev/null
+++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __FMIN_CUDA_API_H__
+#define __FMIN_CUDA_API_H__
+
+#include 
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmin, nvidia) + +#endif // __FMIN_CUDA_API_H__ diff --git a/src/infiniop/ops/fmin/operator.cc b/src/infiniop/ops/fmin/operator.cc new file mode 100644 index 000000000..da17139b9 --- /dev/null +++ b/src/infiniop/ops/fmin/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/fmin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/fmin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/fmin_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/fmin_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/fmin_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/fmin_moore.h" +#endif + +__C infiniStatus_t infiniopCreateFminDescriptor( + infiniopHandle_t handle, + infiniopFminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::fmin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFmin( + infiniopFminDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + 
CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/paged_attention/operator.cc b/src/infiniop/ops/paged_attention/operator.cc index f41adb2cb..9f28984cd 100644 --- a/src/infiniop/ops/paged_attention/operator.cc +++ b/src/infiniop/ops/paged_attention/operator.cc @@ -5,9 +5,9 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_attention_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_attention_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include "metax/paged_attention_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( infiniopHandle_t handle, @@ -34,9 +34,9 @@ __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CREATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -54,9 +54,9 @@ __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -78,9 +78,9 @@ __C infiniStatus_t infiniopPagedAttention( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -97,9 +97,9 @@ __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// DESTROY(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/paged_caching/operator.cc b/src/infiniop/ops/paged_caching/operator.cc index a69b0e07e..3d3e08f33 100644 --- a/src/infiniop/ops/paged_caching/operator.cc +++ b/src/infiniop/ops/paged_caching/operator.cc 
@@ -5,9 +5,9 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_caching_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_caching_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include "metax/paged_caching_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedCachingDescriptor( infiniopHandle_t handle, @@ -29,9 +29,9 @@ __C infiniStatus_t infiniopCreatePagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CREATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -49,9 +49,9 @@ __C infiniStatus_t infiniopGetPagedCachingWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -73,9 +73,9 @@ __C infiniStatus_t infiniopPagedCaching( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -92,9 +92,9 @@ __C infiniStatus_t infiniopDestroyPagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// DESTROY(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/test/infinicore/ops/adaptive_avg_pool3d.py b/test/infinicore/ops/adaptive_avg_pool3d.py index 5566f3a72..533c49d49 100644 --- a/test/infinicore/ops/adaptive_avg_pool3d.py +++ b/test/infinicore/ops/adaptive_avg_pool3d.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.adaptive_avg_pool3d(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.adaptive_avg_pool3d(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.nn.functional.adaptive_avg_pool3d(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/addr.py b/test/infinicore/ops/addr.py index e1f279349..9c28c36a7 100644 --- a/test/infinicore/ops/addr.py +++ b/test/infinicore/ops/addr.py @@ -93,9 +93,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.addr(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.addr(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.addr(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/argwhere.py b/test/infinicore/ops/argwhere.py index 83fa1e6a5..0a7d531a7 100644 --- a/test/infinicore/ops/argwhere.py +++ b/test/infinicore/ops/argwhere.py @@ -52,9 +52,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.argwhere(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return 
infinicore.argwhere(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.argwhere(*args, **kwargs)


 def main():
diff --git a/test/infinicore/ops/asin.py b/test/infinicore/ops/asin.py
index 168c1e209..373dfdfe9 100644
--- a/test/infinicore/ops/asin.py
+++ b/test/infinicore/ops/asin.py
@@ -97,9 +97,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.asin(*args, **kwargs)

-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.asin(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.asin(*args, **kwargs)


 def main():
diff --git a/test/infinicore/ops/fmin.py b/test/infinicore/ops/fmin.py
index 280d6bf4e..8ea7ec4dd 100644
--- a/test/infinicore/ops/fmin.py
+++ b/test/infinicore/ops/fmin.py
@@ -96,9 +96,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.fmin(*args, **kwargs)

-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.fmin(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.fmin(*args, **kwargs)


 def main():
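(For reference, closing out the fmin additions above: a minimal sketch of how the new C API is driven end to end. Only the four functions declared in this patch are used; run_fmin is a hypothetical helper, and the handle, tensor descriptors, device buffers, workspace, and stream are assumed to have been created through the library's existing APIs, which are outside this diff. infiniop.h is assumed to be on the include path.)

    #include <infiniop.h>

    // Sketch only: create -> query workspace -> compute -> destroy for fmin.
    infiniStatus_t run_fmin(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t c_desc,
                            infiniopTensorDescriptor_t a_desc,
                            infiniopTensorDescriptor_t b_desc,
                            void *c, const void *a, const void *b,
                            void *workspace, size_t workspace_size,
                            void *stream) {
        infiniopFminDescriptor_t desc = nullptr;
        infiniStatus_t status = infiniopCreateFminDescriptor(handle, &desc, c_desc, a_desc, b_desc);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }

        size_t required = 0;
        status = infiniopGetFminWorkspaceSize(desc, &required);
        if (status == INFINI_STATUS_SUCCESS) {
            if (required > workspace_size) {
                status = INFINI_STATUS_INSUFFICIENT_WORKSPACE;
            } else {
                // c receives the elementwise minimum of a and b, computed on `stream`.
                status = infiniopFmin(desc, workspace, workspace_size, c, a, b, stream);
            }
        }

        infiniopDestroyFminDescriptor(desc);
        return status;
    }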