
Commit 44d9f6c

adding dequantizeLinearFixedPoint and quantizeLinearFixedPoint (#40)

* adding demo script; adding to track progress, but will remove commit when ready to PR
* all fixedpoint version
* working fixedpoint version in python
* saving
* have python prototype working for dequantizeLinear
* renamed file
* working tvm script for quantizeLinear
* saving working python prototype of quantizeLinear op
* first version of core implementation of dequantizeLinear
* working version of dequantizeLinear contrib op
* updated dequantizeLinearFixedPoint test
* first working version of quantizeLinearFixedPoint contrib op
* removing onnxruntime/test/python/gpnpumode/test_lutop.py
* removing all python files for manual testing
* changed roundToPosInf so that template arg becomes function arg
* using helper functions used by qlinearconv
* removed debugging print statements and unnecessary comments
* added test
* cleaned up code
* consistent casing
* remove debug statements, fixed typo, removed unnecessary comments
* adding python test for quantizeLinearFixedPoint
* adding python test for dequantizeLinearFixedPoint
* added include <vector>
* corrected test that had overflow, since the input data provided was out of bounds for the choice of frac bits
* modified python test so that it compares with the float version of quantizeLinear
* modified python test for dequantizeLinear so it compares with the float version
* just ran formatting on modified sections of code
* updated wheel.yaml to install cmake<4, since the recently released cmake v4 is not yet supported by ORT
* update computeFracBits to match tvm algo
1 parent: a457271

10 files changed: +693 −85 lines


.github/workflows/wheel.yaml

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ jobs:
       - name: Build ONNX Runtime wheel
         working-directory: /workspace
         run: |
-          python3 -m pip install cmake --upgrade
+          python3 -m pip install "cmake<4"
           ./build.sh --build_wheel --config Release --parallel ${{ github.event_name == 'pull_request' && ' ' || '--skip_tests'}} --skip_submodule_sync --allow_running_as_root --compile_no_warning_as_error
           wheel_path=$(find . -name '*.whl' | xargs readlink -f)
           echo "wheel_path=$wheel_path" >> $GITHUB_ENV

onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc

Lines changed: 7 additions & 0 deletions

@@ -153,6 +153,10 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Trilu
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, UnfoldTensor);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, DynamicTimeWarping);
 
+// Quadric contrib ops
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kQuadricDomain, 1, DequantizeLinearFixedPoint);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kQuadricDomain, 1, QuantizeLinearFixedPoint);
+
 #ifdef ENABLE_ATEN
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kPytorchAtenDomain, 1, ATen);
 #endif
@@ -366,6 +370,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Trilu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, UnfoldTensor)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, DynamicTimeWarping)>,
+      // Quadric contrib ops
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kQuadricDomain, 1, DequantizeLinearFixedPoint)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kQuadricDomain, 1, QuantizeLinearFixedPoint)>,
 
 #ifdef ENABLE_ATEN
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
Lines changed: 168 additions & 0 deletions (new file)

@@ -0,0 +1,168 @@
+#include "core/framework/op_kernel.h"
+#include "core/common/common.h"
+#include <cmath>    // For log2()
+#include <limits>   // For int8_t min/max
+#include <iostream>
+#include <iomanip>  // For std::setprecision
+#include "core/mlas/inc/mlas.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+// --- DequantizeLinearFixedPoint
+
+class DequantizeLinearFixedPoint final : public OpKernel {
+ public:
+  explicit DequantizeLinearFixedPoint(const OpKernelInfo& info) : OpKernel(info) {}
+  Status Compute(OpKernelContext* ctx) const override;
+};
+
+// Register kernel
+ONNX_OPERATOR_KERNEL_EX(
+    DequantizeLinearFixedPoint,
+    kQuadricDomain,  // Ensure this is defined in contrib_ops.h
+    1,
+    kCpuExecutionProvider,
+    KernelDefBuilder()
+        .TypeConstraint("T", DataTypeImpl::GetTensorType<int8_t>())     // Input tensor
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())     // Scale
+        .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())    // Zero-point
+        .TypeConstraint("T3", DataTypeImpl::GetTensorType<int32_t>()),  // Output
+    DequantizeLinearFixedPoint);
+
+// Compute min/max range from scale & zero-point
+std::pair<float, float> getDequantizedRange(float scale, int8_t zeroPoint) {
+  constexpr int8_t int8Min = std::numeric_limits<int8_t>::min();
+  constexpr int8_t int8Max = std::numeric_limits<int8_t>::max();
+  return {(int8Min - zeroPoint) * scale, (int8Max - zeroPoint) * scale};
+}
+
+// Compute required fractional bits given a range
+int computeFracBits(float minVal, float maxVal) {
+  constexpr int maxFracBits = 31;
+  float absMinVal = std::fabs(minVal);
+  float absMaxVal = std::fabs(maxVal);
+  if (absMinVal > absMaxVal) {
+    return (absMinVal < 1.0f) ? maxFracBits : (maxFracBits - static_cast<int>(std::ceil(std::log2(absMinVal))));
+  } else {
+    return (absMaxVal < 1.0f) ? maxFracBits : (maxFracBits - static_cast<int>(std::ceil(std::log2(absMaxVal + 1))));
+  }
+}
+
+// Fixed-point multiplication with provided shift
+int32_t fixedPointMultiply(int32_t a, int32_t b, int shift) {
+  int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
+  return (shift > 0) ? (product >> shift) : (product << -shift);
+}
+
+Status DequantizeLinearFixedPoint::Compute(OpKernelContext* ctx) const {
+  // Retrieve input tensors
+  const auto* X = ctx->Input<Tensor>(0);
+  const auto* scale = ctx->Input<Tensor>(1);
+  const auto* zeroPoint = ctx->Input<Tensor>(2);
+
+  // Validate inputs
+  ORT_ENFORCE(X, "Input tensor 'X' is null.");
+  ORT_ENFORCE(scale, "Scale tensor is null.");
+  ORT_ENFORCE(zeroPoint, "Zero-point tensor is null.");
+
+  // Extract values
+  const int8_t* xData = X->Data<int8_t>();
+  float s = *(scale->Data<float>());
+  int8_t zp = *(zeroPoint->Data<int8_t>());
+
+  // Compute range and fractional bits
+  auto [minVal, maxVal] = getDequantizedRange(s, zp);
+  int resultFracBits = computeFracBits(minVal, maxVal);
+
+  // Convert scale to fixed-point
+  std::vector<double> scaleValueVec = {s};
+  auto p = dataToQfp(scaleValueVec, -1, 32, false);
+  int scaleFracBits = p.second;
+  int32_t scaleQfp = static_cast<int32_t>(p.first[0]);
+
+  int shift = scaleFracBits - resultFracBits;
+
+  // Allocate output tensor
+  auto* Y = ctx->Output(0, X->Shape());
+  int32_t* yData = Y->MutableData<int32_t>();
+  size_t tensorSize = X->Shape().Size();
+
+  for (size_t i = 0; i < tensorSize; ++i) {
+    yData[i] = fixedPointMultiply(xData[i] - zp, scaleQfp, shift);
+  }
+
+  return Status::OK();
+}
+
+// --- QuantizeLinearFixedPoint
+class QuantizeLinearFixedPoint final : public OpKernel {
+ public:
+  explicit QuantizeLinearFixedPoint(const OpKernelInfo& info) : OpKernel(info) {}
+  Status Compute(OpKernelContext* ctx) const override;
+};
+
+// Register Kernel
+ONNX_OPERATOR_KERNEL_EX(
+    QuantizeLinearFixedPoint,
+    kQuadricDomain,
+    1,
+    kCpuExecutionProvider,
+    KernelDefBuilder()
+        .TypeConstraint("T", DataTypeImpl::GetTensorType<int32_t>())   // Input tensor
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())   // xFracBits
+        .TypeConstraint("T2", DataTypeImpl::GetTensorType<float>())    // Scale
+        .TypeConstraint("T3", DataTypeImpl::GetTensorType<int8_t>())   // Zero-point
+        .TypeConstraint("T4", DataTypeImpl::GetTensorType<int8_t>()),  // Output
+    QuantizeLinearFixedPoint);
+
+Status QuantizeLinearFixedPoint::Compute(OpKernelContext* ctx) const {
+  // Get input tensors
+  const auto* X = ctx->Input<Tensor>(0);
+  const auto* xFracBitsTensor = ctx->Input<Tensor>(1);
+  const auto* scale = ctx->Input<Tensor>(2);
+  const auto* zeroPoint = ctx->Input<Tensor>(3);
+
+  // Validate inputs
+  ORT_ENFORCE(X != nullptr, "Input X is null");
+  ORT_ENFORCE(xFracBitsTensor != nullptr, "xFracBits is null");
+  ORT_ENFORCE(scale != nullptr, "Scale is null");
+  ORT_ENFORCE(zeroPoint != nullptr, "Zero point is null");
+
+  // Retrieve input data
+  const int32_t* x_data = X->Data<int32_t>();
+  int8_t xFracBits = *(xFracBitsTensor->Data<int8_t>());
+  double s = *(scale->Data<float>());
+  int8_t zp = *(zeroPoint->Data<int8_t>());
+
+  double scaleInv = 1.0 / s;
+  std::vector<double> ScaleValueVec = {scaleInv};
+  auto p = dataToQfp(ScaleValueVec, -1, 32, false);  // Returns std::make_pair(qfp, fracBits)
+  int64_t scaleInvQfp = p.first[0];
+  int scaleInvFracBits = p.second;
+
+  constexpr int postMacIntBits = 29;
+  constexpr int postMacFracBits = 31 - postMacIntBits;
+
+  int resultFracBits = postMacFracBits;
+  int shift = scaleInvFracBits + xFracBits - resultFracBits;
+  if (shift > 31) {
+    shift = 31;
+    resultFracBits = scaleInvFracBits + xFracBits - 31;
+  }
+
+  auto* Y = ctx->Output(0, X->Shape());
+  int8_t* yData = Y->MutableData<int8_t>();
+  size_t tensor_size = X->Shape().Size();
+  for (size_t i = 0; i < tensor_size; ++i) {
+    int32_t product = fixedPointMultiply(x_data[i], scaleInvQfp, shift);
+    int32_t productRound = fxRoundPosInf(static_cast<int32_t>(product), static_cast<uint8_t>(resultFracBits));
+
+    // Clip and apply zero-point
+    yData[i] = static_cast<int8_t>(std::min(std::max(productRound + zp, static_cast<int32_t>(std::numeric_limits<int8_t>::min())), static_cast<int32_t>(std::numeric_limits<int8_t>::max())));
+  }
+  return Status::OK();
+}
+
+}  // namespace contrib
+}  // namespace onnxruntime
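To make the dequantize arithmetic concrete, here is a minimal standalone sketch of the same computation outside the kernel. getDequantizedRange, computeFracBits, and fixedPointMultiply are lifted from the kernel above (with one explicit narrowing cast added); scalarToQfp is a hypothetical stand-in for the MLAS dataToQfp helper, whose definition is not part of this diff, assuming it picks the largest fractional-bit count that keeps the value inside int32.

// Standalone sketch of DequantizeLinearFixedPoint's arithmetic (C++17).
// scalarToQfp is a HYPOTHETICAL stand-in for dataToQfp.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <utility>

std::pair<float, float> getDequantizedRange(float scale, int8_t zeroPoint) {
  constexpr int8_t int8Min = std::numeric_limits<int8_t>::min();
  constexpr int8_t int8Max = std::numeric_limits<int8_t>::max();
  return {(int8Min - zeroPoint) * scale, (int8Max - zeroPoint) * scale};
}

int computeFracBits(float minVal, float maxVal) {
  constexpr int maxFracBits = 31;
  float absMinVal = std::fabs(minVal);
  float absMaxVal = std::fabs(maxVal);
  if (absMinVal > absMaxVal) {
    return (absMinVal < 1.0f) ? maxFracBits : (maxFracBits - static_cast<int>(std::ceil(std::log2(absMinVal))));
  } else {
    return (absMaxVal < 1.0f) ? maxFracBits : (maxFracBits - static_cast<int>(std::ceil(std::log2(absMaxVal + 1))));
  }
}

int32_t fixedPointMultiply(int32_t a, int32_t b, int shift) {
  int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  return static_cast<int32_t>((shift > 0) ? (product >> shift) : (product << -shift));
}

// HYPOTHETICAL stand-in for dataToQfp on one scalar: the largest fracBits
// such that |v| * 2^fracBits still fits in int32.
std::pair<int32_t, int> scalarToQfp(double v) {
  int fracBits = 31 - static_cast<int>(std::ceil(std::log2(std::fabs(v))));
  double scaled = std::ldexp(v, fracBits);
  if (std::fabs(scaled) > 2147483647.0) {
    --fracBits;
    scaled = std::ldexp(v, fracBits);
  }
  return {static_cast<int32_t>(std::llround(scaled)), fracBits};
}

int main() {
  const float s = 0.05f;  // scale
  const int8_t zp = 3;    // zero-point
  const int8_t x = 100;   // one quantized element

  auto [minVal, maxVal] = getDequantizedRange(s, zp);    // [-6.55, 6.2]
  int resultFracBits = computeFracBits(minVal, maxVal);  // 31 - ceil(log2(6.55)) = 28

  auto [scaleQfp, scaleFracBits] = scalarToQfp(s);  // ~0.05 * 2^35, fracBits = 35
  int shift = scaleFracBits - resultFracBits;       // 35 - 28 = 7

  int32_t y = fixedPointMultiply(x - zp, scaleQfp, shift);
  std::cout << "fixed-point: " << y / std::ldexp(1.0, resultFracBits)
            << "  float reference: " << (x - zp) * s << "\n";
  return 0;
}

With scale = 0.05, zero_point = 3, and x = 100, both paths print roughly 4.85, and the fixed-point result carries 28 fractional bits.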

onnxruntime/core/graph/contrib_ops/contrib_defs.cc

Lines changed: 97 additions & 0 deletions

@@ -3697,6 +3697,7 @@ GatherBlockQuantized is a Gather with data quantized. It is similar to Gather (h
          "Allow inputs and outputs to be any kind of tensor.");
 #endif
 
+  // Quadric contrib ops
   ONNX_CONTRIB_OPERATOR_SCHEMA(QuadricCustomOp)
       .SetDomain(kQuadricDomain)
       .SinceVersion(1)
@@ -3715,6 +3716,102 @@ GatherBlockQuantized is a Gather with data quantized. It is similar to Gather (h
       .TypeConstraint("T", OpSchema::all_tensor_types_ir4(),
                       "Allow inputs and outputs to be any kind of tensor.");
 
+  // Quadric ops
+  ONNX_CONTRIB_OPERATOR_SCHEMA(DequantizeLinearFixedPoint)
+      .SetDomain(kQuadricDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Dequantizes an int8 input tensor into a fixed-point int32 output tensor using integer arithmetic.
+The dequantization formula is:
+
+    Y_fixed = ((X - zero_point) * scale_qfp) >> shift
+
+where `scale_qfp` is the scale converted into fixed-point representation.
+
+- `X` is the quantized input tensor (int8).
+- `scale` is a floating-point scalar that is converted into a fixed-point multiplier `scale_qfp`.
+- `zero_point` is the quantization zero-point (int8), which is subtracted from `X` before scaling.
+- `Y_fixed` is the output tensor (int32) interpreted as a fixed-point representation.
+
+Unlike `DequantizeLinear`, which produces floating-point outputs, this operator retains
+a fixed-point integer format to align with Quadric's CGC execution.
+
+This operator does **per-tensor dequantization**, meaning `scale` and `zero_point` are scalars.
+)DOC")
+
+      // Inputs
+      .Input(0, "X", "N-D quantized input tensor (int8).", "T")
+      .Input(1, "scale", "Scalar scale factor (float). Converted to fixed-point format internally.", "T1")
+      .Input(2, "zero_point", "Scalar zero-point offset (int8). Must match type of X.", "T2")
+
+      // Outputs
+      .Output(0, "Y", "N-D output tensor (int32). Fixed-point representation.", "T3")
+
+      // Type Constraints
+      .TypeConstraint("T", {"tensor(int8)"}, "Input tensor must be int8.")
+      .TypeConstraint("T1", {"tensor(float)"}, "Scale must be a floating-point scalar.")
+      .TypeConstraint("T2", {"tensor(int8)"}, "Zero point must be int8, matching the input tensor type.")
+      .TypeConstraint("T3", {"tensor(int32)"}, "Output tensor is int32 (fixed-point representation).")
+
+      // Shape Inference
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        auto y_type = ctx.getOutputType(0);
+        y_type->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto::INT32);
+
+        if (!hasInputShape(ctx, 0))
+          return;
+
+        auto& input_shape = getInputShape(ctx, 0);
+        updateOutputShape(ctx, 0, input_shape);
+      });
+
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QuantizeLinearFixedPoint)
+      .SetDomain(kQuadricDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Quantizes an int32 input tensor into an int8 output tensor using fixed-point arithmetic.
+
+The quantization formula is:
+
+    Y_q = saturate(round((X * scale_inv_qfp) >> shift) + zero_point)
+
+where:
+- `X` is the input tensor in int32 (fixed-point representation).
+- `scale_inv_qfp` is the inverse of scale in fixed-point format.
+- `zero_point` is the quantization zero-point.
+- `Y_q` is the quantized output in int8, saturated to the int8 range.
+
+This operator does **per-tensor quantization**, meaning `scale` and `zero_point` are scalars.
+)DOC")
+
+      // Inputs
+      .Input(0, "X", "N-D input tensor (int32, fixed-point).", "T")
+      .Input(1, "x_frac_bits", "Fractional bits of input (int8).", "T1")
+      .Input(2, "scale", "Scalar scale factor (float).", "T2")
+      .Input(3, "zero_point", "Scalar zero-point offset (int8).", "T3")
+
+      // Outputs
+      .Output(0, "Y", "N-D output tensor (int8).", "T4")
+
+      // Type Constraints
+      .TypeConstraint("T", {"tensor(int32)"}, "Input tensor must be int32.")
+      .TypeConstraint("T1", {"tensor(int8)"}, "Fractional bits must be int8.")
+      .TypeConstraint("T2", {"tensor(float)"}, "Scale must be a floating-point scalar.")
+      .TypeConstraint("T3", {"tensor(int8)"}, "Zero point must be int8, matching the output tensor type.")
+      .TypeConstraint("T4", {"tensor(int8)"}, "Output tensor is int8.")
+
+      // Shape Inference
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        auto y_type = ctx.getOutputType(0);
+        y_type->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto::INT8);
+
+        if (!hasInputShape(ctx, 0))
+          return;
+
+        auto& input_shape = getInputShape(ctx, 0);
+        updateOutputShape(ctx, 0, input_shape);
+      });
+
 #ifdef ENABLE_TRAINING_OPS
 // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once 1). compute optimizer is enabled for inference or
 // 2). this is needed by inference for other purpose.
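The quantize direction, as a numeric walk-through continuing the numbers from the dequantize sketch above. The Q-format constants for 1/scale and the body of fxRoundPosInfSketch are assumptions (neither dataToQfp nor fxRoundPosInf is defined in this diff); the point is how the shift is capped at 31 with the residue folded into resultFracBits, and how rounding happens before the zero-point is added.

// Numeric walk-through of QuantizeLinearFixedPoint for one element.
#include <algorithm>
#include <cstdint>
#include <iostream>

int32_t fixedPointMultiply(int32_t a, int32_t b, int shift) {
  int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  return static_cast<int32_t>((shift > 0) ? (product >> shift) : (product << -shift));
}

// ASSUMED behavior of fxRoundPosInf: add half an LSB, then shift.
int32_t fxRoundPosInfSketch(int32_t a, uint8_t aFracBits) {
  return (a + (int32_t(1) << (aFracBits - 1))) >> aFracBits;
}

int main() {
  // Input: 4.85 in Q28 (the dequantize example's output), scale = 0.05, zp = 3.
  const int32_t x = 1301911961;            // ~4.85 * 2^28
  const int xFracBits = 28;
  const int32_t scaleInvQfp = 1342177280;  // assumed dataToQfp output: 20.0 * 2^26
  const int scaleInvFracBits = 26;
  const int8_t zp = 3;

  // postMacFracBits = 31 - 29 = 2, so the ideal shift is 26 + 28 - 2 = 52;
  // it is capped at 31 and the residue moves into resultFracBits = 23.
  int resultFracBits = 2;
  int shift = scaleInvFracBits + xFracBits - resultFracBits;  // 52
  if (shift > 31) {
    shift = 31;
    resultFracBits = scaleInvFracBits + xFracBits - 31;       // 23
  }

  int32_t product = fixedPointMultiply(x, scaleInvQfp, shift);                // ~97 in Q23
  int32_t rounded = fxRoundPosInfSketch(product, uint8_t(resultFracBits));    // 97
  int32_t clipped = std::min(std::max(rounded + zp, -128), 127);              // clamp to int8
  std::cout << clipped << "\n";  // 100: the round trip recovers the original x
  return 0;
}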

onnxruntime/core/mlas/inc/mlas.h

Lines changed: 26 additions & 17 deletions

@@ -18,9 +18,10 @@ Module Name:
 #pragma once
 
 #include <cstddef>
-#include <cstdlib>
 #include <cstdint>
+#include <cstdlib>
 #include <stdexcept>
+#include <vector>
 
 //
 // Define the calling convention for Windows targets.
@@ -1268,24 +1269,33 @@ MlasRequantizeOutput(
     size_t CountN
     );
 
-template<typename OutputType>
+template <typename OutputType>
 void
-MLASCALL
-MlasRequantizeOutputFixedPoint(
-    const int32_t* Input,
-    size_t InputLeadingDimension,
-    OutputType* Output,
-    size_t OutputLeadingDimension,
-    const int32_t* Bias,
-    const float* Scale,
-    bool PerColumnScale,
-    OutputType ZeroPoint,
-    size_t StartM,
-    size_t StartN,
-    size_t CountM,
-    size_t CountN
+MLASCALL
+MlasRequantizeOutputFixedPoint(
+    const int32_t* Input,
+    size_t InputLeadingDimension,
+    OutputType* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,
+    const float* Scale,
+    bool PerColumnScale,
+    OutputType ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
     );
 
+int32_t
+fxRoundPosInf(const int32_t a, uint8_t aFracBits);
+
+template <typename T>
+std::pair<std::vector<int>, int>
+dataToQfp(
+    const std::vector<T>& data, int fracBits = -1, int qfpSize = 32, bool scalarAsFloat = true
+    );
+
 class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSOR
 {
 public:
@@ -1336,7 +1346,6 @@ class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSOR
     bool OutputIsSigned_;
 };
 
-
 void
 MLASCALL
 MlasFindMinMaxElement(
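fxRoundPosInf and dataToQfp above are declarations only; their definitions live elsewhere in MLAS. As a rough mental model, here is a hypothetical sketch of what dataToQfp plausibly computes, inferred from its signature and its call sites in the kernels earlier in this commit (the scalarAsFloat flag is ignored here, and the real implementation may differ):

// HYPOTHETICAL sketch of dataToQfp, inferred from signature and call sites.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

template <typename T>
std::pair<std::vector<int>, int>
dataToQfpSketch(const std::vector<T>& data, int fracBits = -1, int qfpSize = 32) {
  // The largest magnitude bounds how many fractional bits fit in qfpSize bits.
  double maxAbs = 0.0;
  for (const T& v : data) maxAbs = std::max(maxAbs, std::fabs(static_cast<double>(v)));
  if (maxAbs == 0.0) maxAbs = 1.0;  // avoid log2(0) for an all-zero vector

  if (fracBits < 0) {
    // Pick the largest fracBits with maxAbs * 2^fracBits < 2^(qfpSize-1).
    fracBits = (qfpSize - 1) - static_cast<int>(std::ceil(std::log2(maxAbs)));
    if (std::ldexp(maxAbs, fracBits) >= std::ldexp(1.0, qfpSize - 1)) --fracBits;
  }

  std::vector<int> qfp;
  qfp.reserve(data.size());
  for (const T& v : data)
    qfp.push_back(static_cast<int>(std::llround(std::ldexp(static_cast<double>(v), fracBits))));
  return {qfp, fracBits};
}

For the two scalars used by the kernels, this sketch reproduces the constants from the walk-throughs above: 0.05 maps to (1717986918, 35) and 20.0 maps to (1342177280, 26).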
