Skip to content

Commit 4b99fea

Browse files
committed
accomplish flipud scatter multi_margin_loss float_power floor_divide operator
1 parent 726eacf commit 4b99fea

File tree

247 files changed

+18131
-1456
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

247 files changed

+18131
-1456
lines changed

include/infinicore/common/hash.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "../tensor.hpp"
44

5+
#include <optional>
56
#include <type_traits>
67

78
namespace infinicore {
@@ -24,6 +25,15 @@ inline void hash_combine(size_t &seed, Tensor tensor) {
2425
}
2526
}
2627

28+
// Specialization for std::optional<T>.
// Mixes the engagement flag into the seed first, so an empty optional
// hashes differently from an engaged optional whose value happens to
// hash to the same bits; the contained value is folded in only when
// present (via the generic hash_combine overload for T).
template <typename T>
inline void hash_combine(size_t &seed, const std::optional<T> &opt) {
    hash_combine(seed, opt.has_value());
    if (opt) {
        hash_combine(seed, *opt);
    }
}
36+
2737
// Specialization for std::string
2838
inline void hash_combine(size_t &seed, const std::string &str) {
2939
hash_combine(seed, std::hash<std::string>{}(str));

include/infinicore/context/context.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
namespace infinicore {
1212

1313
namespace context {
14-
void setDevice(Device device, bool force_cpu = false);
14+
void setDevice(Device device);
1515
Device getDevice();
1616
size_t getDeviceCount(Device::Type type);
1717

include/infinicore/ops.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
#pragma once
22

33
#include "ops/add.hpp"
4+
#include "ops/add_rms_norm.hpp"
45
#include "ops/attention.hpp"
56
#include "ops/causal_softmax.hpp"
67
#include "ops/matmul.hpp"
78
#include "ops/ones.hpp"
9+
#include "ops/paged_attention.hpp"
10+
#include "ops/paged_attention_prefill.hpp"
11+
#include "ops/paged_caching.hpp"
12+
#include "ops/random_sample.hpp"
813
#include "ops/rearrange.hpp"
914
#include "ops/rms_norm.hpp"
1015
#include "ops/rope.hpp"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"
#include <utility>

namespace infinicore::op {

// Dispatch wrapper for the fused Add + RMSNorm kernel.
class AddRMSNorm {
public:
    // Parameter order: (y, residual_out, a, b, weight, epsilon).
    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, float);

    // Runs the implementation registered for the current device.
    static void execute(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);

    // Per-device registration table for `schema` implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Fused Add and RMS Normalization
// Returns: (normalized_result, add_result)
// The add_result can be used as residual for subsequent layers
std::pair<Tensor, Tensor> add_rms_norm(Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);

// In-place variant: writes the normalized result into `y` and the
// element-wise a + b sum into `residual_out`.
void add_rms_norm_(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);

} // namespace infinicore::op

include/infinicore/ops/common/cache.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ class OpCache {
3636
return cache_vector[device_index];
3737
}
3838

39+
// Convenience overload: unpacks `device` into its type and index and
// forwards to the (Device::Type, index) overload above.
BaseCache &getCache(Device device) {
    return getCache(device.getType(), device.getIndex());
}
42+
3943
void setCapacity(size_t capacity) {
4044
capacity_ = capacity;
4145
for (auto &vec : caches_) {

include/infinicore/ops/flipud.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Dispatch wrapper for the flipud kernel (presumably reverses the order
// of rows along the first dimension, matching numpy.flipud — confirm
// against the registered implementations).
class Flipud {
public:
    // Schema signature: (Output, Input)
    using schema = void (*)(Tensor, Tensor);

    // Runs the implementation registered for the current device.
    static void execute(Tensor output, Tensor input);

    // Per-device registration table for `schema` implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Out-of-place: allocates and returns the flipped tensor.
Tensor flipud(Tensor input);

// In-place variant: writes the flipped result into `output`.
void flipud_(Tensor output, Tensor input);

} // namespace infinicore::op
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Dispatch wrapper for element-wise power computed in double precision.
class FloatPower {
public:
    // Dispatcher schema: Output = Input ^ Scalar (scalar must be double!).
    using schema_scalar = void (*)(Tensor output, Tensor input, double exponent);

    // Dispatcher schema: Output = Input ^ Tensor.
    using schema_tensor = void (*)(Tensor output, Tensor input, Tensor exponent);

    // Execute entry points, called by the functional interface below.
    static void execute(Tensor output, Tensor input, double exponent);
    static void execute(Tensor output, Tensor input, Tensor exponent);

    // Per-device registration tables, one per schema.
    static common::OpDispatcher<schema_scalar> &dispatcher_scalar();
    static common::OpDispatcher<schema_tensor> &dispatcher_tensor();
};

// ---------------------------------------------------------------
// Functional interface (Python-visible semantics)
// ---------------------------------------------------------------

// Scalar exponent.
Tensor float_power(Tensor input, double exponent);               // out-of-place: ALWAYS float64
void float_power_(Tensor output, Tensor input, double exponent); // in-place

// Tensor exponent.
Tensor float_power(Tensor input, Tensor exponent);               // out-of-place: ALWAYS float64
void float_power_(Tensor output, Tensor input, Tensor exponent); // in-place

} // namespace infinicore::op
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Dispatch wrapper for element-wise floor division.
class FloorDivide {
public:
    // Parameter order: (c, a, b) — result, dividend, divisor.
    using schema = void (*)(Tensor, Tensor, Tensor);

    // Runs the implementation registered for the current device.
    static void execute(Tensor c, Tensor a, Tensor b);

    // Per-device registration table for `schema` implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Out-of-place: allocates and returns the result tensor.
Tensor floor_divide(Tensor a, Tensor b);

// In-place variant: writes the result into `c`.
void floor_divide_(Tensor c, Tensor a, Tensor b);

} // namespace infinicore::op
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Dispatch wrapper for the multi-class margin loss kernel.
class MultiMarginLoss {
public:
    // Parameter order: (output, input, target, weight, p, margin, reduction).
    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, int64_t, float, int64_t);

    // Runs the implementation registered for the current device.
    static void execute(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction);

    // Per-device registration table for `schema` implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Out-of-place multi-class margin loss.
// `weight` defaults to an empty Tensor (presumably meaning unweighted
// classes — confirm against the implementation); reduction = 1 is
// presumably the framework's "mean" mode — TODO confirm.
Tensor multi_margin_loss(Tensor input, Tensor target, Tensor weight = {}, int64_t p = 1, float margin = 1.0f, int64_t reduction = 1);

// In-place variant: writes the loss into `output`.
void multi_margin_loss_(Tensor output, Tensor input, Tensor target, Tensor weight, int64_t p, float margin, int64_t reduction);

} // namespace infinicore::op
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#pragma once

#include "../device.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

// Dispatch wrapper for paged attention: attends queries against K/V
// caches addressed indirectly through `block_tables`, with per-sequence
// lengths in `cache_lens` and optional ALiBi slopes.
class PagedAttention {
public:
    // Parameter order: (out, q, k_cache, v_cache, block_tables,
    //                   cache_lens, alibi_slopes, scale).
    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);

    // Runs the implementation registered for the current device.
    // Fix: the trailing float parameter was unnamed; it is named `scale`
    // here to match the functional declarations below.
    static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);

    // Per-device registration table for `schema` implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Out-of-place: allocates and returns the attention output.
Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);

// In-place variant: writes the attention output into `out`.
void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);

} // namespace infinicore::op

0 commit comments

Comments
 (0)