diff --git a/CMakeLists.txt b/CMakeLists.txt index d3dbb77118..c02784e089 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Match the minimum required version of LLVM and MLIR -cmake_minimum_required(VERSION 3.20.0) +cmake_minimum_required(VERSION 3.26.0) project(onnx-mlir) diff --git a/docker/Dockerfile.llvm-project b/docker/Dockerfile.llvm-project index 40f14a79cd..6c11e54519 100644 --- a/docker/Dockerfile.llvm-project +++ b/docker/Dockerfile.llvm-project @@ -1,5 +1,5 @@ -# By default, use ubuntu:jammy, remember to change Jenkins build script as well -ARG BASE_IMAGE="ghcr.io/onnxmlir/ubuntu:jammy" +# By default, use ubuntu:noble, remember to change Jenkins build script as well +ARG BASE_IMAGE="ghcr.io/onnxmlir/ubuntu:noble" FROM ${BASE_IMAGE} # Label the image for various checking and cleanup diff --git a/docker/Dockerfile.onnx-mlir b/docker/Dockerfile.onnx-mlir index 3753b11f1d..51d433eae6 100644 --- a/docker/Dockerfile.onnx-mlir +++ b/docker/Dockerfile.onnx-mlir @@ -21,6 +21,7 @@ COPY . onnx-mlir # CMAKE_INSTALL_LIBDIR to be lib. RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \ && cd ${ONNX_ROOT} \ + && test -f ${ONNX_ROOT}/backend.py || (echo "Required file missing!" && exit 1) \ # Require patching until upstreamed && sed -i -e 's/target_link_libraries(onnx PUBLIC onnx_proto)/target_link_libraries(onnx PUBLIC onnx_proto PUBLIC ${protobuf_ABSL_USED_TARGETS})/g' \ -e '/absl::log_initialize/a \ @@ -29,9 +30,10 @@ RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \ absl::log_internal_nullguard' CMakeLists.txt \ # Required for pip install with `--no-build-isolation` flag. # setuptools >= 70.x creates conflicts with pip packaging versions. - && python3 -m pip install --upgrade setuptools==68.2.2 \ + # setuptools >= 77.0.0 supports plain string license format in pyproject.toml + && python3 -m pip install --upgrade setuptools==77.0.1 \ && CC=clang CXX=clang++ CMAKE_ARGS="-DCMAKE_INSTALL_LIBDIR=lib" \ - python3 -m pip install . --no-build-isolation \ + python3 -m pip install . \ && rm -rf ${HOME}/.cache ARG NPROC=4 diff --git a/docker/Dockerfile.onnx-mlir-dev b/docker/Dockerfile.onnx-mlir-dev index c5ce85db28..1c0e6248f1 100644 --- a/docker/Dockerfile.onnx-mlir-dev +++ b/docker/Dockerfile.onnx-mlir-dev @@ -16,6 +16,7 @@ COPY . onnx-mlir # Setup onnx RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \ && cd ${ONNX_ROOT} \ + && test -f ${ONNX_ROOT}/backend.py || (echo "Required file missing!" && exit 1) \ # Require patching until upstreamed && sed -i -e 's/target_link_libraries(onnx PUBLIC onnx_proto)/target_link_libraries(onnx PUBLIC onnx_proto PUBLIC ${protobuf_ABSL_USED_TARGETS})/g' \ -e '/absl::log_initialize/a \ @@ -24,9 +25,10 @@ RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \ absl::log_internal_nullguard' CMakeLists.txt \ # Required for pip install with `--no-build-isolation` flag. # setuptools >= 70.x creates conflicts with pip packaging versions. - && python3 -m pip install --upgrade setuptools==68.2.2 \ + # setuptools >= 77.0.0 supports plain string license format in pyproject.toml + && python3 -m pip install --upgrade setuptools==77.0.1 \ && CC=clang CXX=clang++ CMAKE_ARGS="-DCMAKE_INSTALL_LIBDIR=lib" \ - python3 -m pip install . --no-build-isolation + python3 -m pip install . 
ARG NPROC=4 ARG ACCEL=NNPA diff --git a/docs/Dialects/onnx.md b/docs/Dialects/onnx.md index 54e693062e..183a3e0e9b 100644 --- a/docs/Dialects/onnx.md +++ b/docs/Dialects/onnx.md @@ -738,7 +738,7 @@ implementations (even if a seed is specified). Traits: `AlwaysSpeculatableImplTrait` -Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` +Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ResultTypeInferenceOpInterface`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` Effects: `MemoryEffects::Effect{}` @@ -994,7 +994,7 @@ See documentation of the Cast operator for further details. Traits: `AlwaysSpeculatableImplTrait` -Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` +Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ResultTypeInferenceOpInterface`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` Effects: `MemoryEffects::Effect{}` @@ -1101,28 +1101,31 @@ deep models. By default the conversion of a float *x* obeys to the following rules. `[x]` means the value rounded to the target mantissa width. -| x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | -|------|----|----|----|----| -| 0 | 0 | 0 | 0 | 0 | -|-0 | -0 | 0 | -0 | 0 | -| NaN | NaN | NaN | NaN | NaN | -| +/- Inf | +/- FLT_MAX | NaN | FLT_MAX | NaN | -| [x] > FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | -| [x] < -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -| else | RNE | RNE | RNE | RNE | +| x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | +| ----------------- | -------- | -------- | -------- | -------- | +| 0 | 0 | 0 | 0 | 0 | +| -0 | -0 | 0 | -0 | 0 | +| NaN | NaN | NaN | NaN | NaN | +| Inf | FLT_MAX | NaN | FLT_MAX | NaN | +| -Inf | -FLT_MAX | NaN | -FLT_MAX | NaN | +| \[x\] > FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | +| \[x\] \< -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | +| else | RNE | RNE | RNE | RNE | The behavior changes if the parameter 'saturate' is set to False. The rules then become: -| x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | -|------|----|----|----|----| -| 0 | 0 | 0 | 0 | 0 | -|-0 | -0 | 0 | -0 | 0 | -| NaN | NaN | NaN | NaN | NaN | -| +/- Inf | NaN | NaN | +/- Inf | NaN | -| [x] > FLT_MAX | NaN | NaN | Inf | NaN | -| [x] < -FLT_MAX | NaN | NaN | -Inf | NaN | -| else | RNE | RNE | RNE | RNE | +| x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | +| ----------------- | ------ | -------- | ---- | -------- | +| 0 | 0 | 0 | 0 | 0 | +| -0 | -0 | 0 | -0 | 0 | +| NaN | NaN | NaN | NaN | NaN | +| -NaN | -NaN | NaN | -NaN | NaN | +| Inf | NaN | NaN | Inf | NaN | +| -Inf | -NaN | NaN | -Inf | NaN | +| \[x\] > FLT_MAX | NaN | NaN | Inf | NaN | +| \[x\] \< -FLT_MAX | NaN | NaN | -Inf | NaN | +| else | RNE | RNE | RNE | RNE | Traits: `AlwaysSpeculatableImplTrait` @@ -1264,12 +1267,18 @@ _ONNX CenterCropPad operation_ Center crop or pad an input to given dimensions. -The crop/pad dimensions can be specified for a subset of the `axes`. Non-specified dimensions will not be -cropped or padded. +The crop/pad dimensions can be specified for a subset of the `axes`; unspecified dimensions will remain unchanged. + +If the input dimensions are larger than the target crop dimensions, a centered cropping window will be extracted +from the input. 
The starting value for the cropping window is rounded down, which means that if the difference +between the input shape and the crop shape is odd, the cropping window will be shifted half a pixel to the left +of the input center. -If the input dimensions are bigger than the crop shape, a centered cropping window is extracted from the input. -If the input dimensions are smaller than the crop shape, the input is padded on each side equally, -so that the input is centered in the output. +If the input dimensions are smaller than the target crop dimensions, the input will be padded equally on both sides +to center it in the output. In cases where the total number of padding pixels is odd, an additional pixel will be +added to the right side. + +The padding value used is zero. Traits: `AlwaysSpeculatableImplTrait` @@ -1305,6 +1314,8 @@ _ONNX Clip operation_ Clip operator limits the given input within an interval. The interval is specified by the inputs 'min' and 'max'. They default to numeric_limits::lowest() and numeric_limits::max(), respectively. +When 'min' is greater than 'max', the clip operator sets all the 'input' values to +the value of 'max'. Thus, this is equivalent to 'Min(max, Max(input, min))'. Traits: `AlwaysSpeculatableImplTrait` @@ -2465,7 +2476,7 @@ y_scale = (maximum(0, max(x)) - minimum(0, min(x))) / (qmax - qmin) Zero point is calculated as: ``` intermediate_zero_point = qmin - min(x)/y_scale -y_zero_point = cast(round(saturate(itermediate_zero_point))) +y_zero_point = cast(round(saturate(intermediate_zero_point))) ``` * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8 @@ -2653,13 +2664,13 @@ Effects: `MemoryEffects::Effect{}` | Operand | Description | | :-----: | ----------- | -| `input` | tensor of 8-bit unsigned integer values or tensor of 16-bit unsigned integer values or tensor of 32-bit unsigned integer values or tensor of 64-bit unsigned integer values or tensor of 8-bit signless integer values or tensor of 16-bit signless integer values or tensor of 32-bit signless integer values or tensor of 64-bit signless integer values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or tensor of bfloat16 type values | +| `input` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values | #### Results: | Result | Description | | :----: | ----------- | -| `output` | tensor of 8-bit unsigned integer values or tensor of 16-bit unsigned integer values or tensor of 32-bit unsigned integer values or tensor of 64-bit unsigned integer values or tensor of 8-bit signless integer values or tensor of 16-bit signless integer values or tensor of 32-bit signless integer values or tensor of 64-bit signless integer values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or tensor of bfloat16 type values | +| `output` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values | ### `onnx.Exp` (ONNXExpOp) @@ -2734,7 +2745,7 @@ TensorProto message and be valid as an output type. 
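To make the CenterCropPad window placement and the Clip min > max rule above concrete, here is a small NumPy sketch (illustrative only; the sizes and values are arbitrary, and NumPy's `np.clip` happens to follow the same `Min(max, Max(input, min))` composition):

```python
import numpy as np

# CenterCropPad: the crop-window start is rounded down, so an odd difference
# shifts the window half a pixel toward the lower indices.
in_dim, crop_dim = 7, 4
start = (in_dim - crop_dim) // 2                 # 1 -> window spans indices 1..4

# When padding, an odd total puts the extra pixel on the right side.
pad_total = 5
pad_left, pad_right = pad_total // 2, pad_total - pad_total // 2   # 2, 3

# Clip with min > max collapses every element to max.
x = np.array([-2.0, 0.0, 2.0])
print(np.clip(x, 1.0, -1.0))                     # [-1. -1. -1.]
print(np.minimum(-1.0, np.maximum(x, 1.0)))      # [-1. -1. -1.]
```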
Traits: `AlwaysSpeculatableImplTrait` -Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` +Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ResultTypeInferenceOpInterface`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` Effects: `MemoryEffects::Effect{}` @@ -3153,7 +3164,22 @@ Given `data` tensor of rank r >= 1, and `indices` tensor of rank q, gather entries of the axis dimension of `data` (by default outer-most one as axis=0) indexed by `indices`, and concatenates them in an output tensor of rank q + (r - 1). -If `axis = 0`, let `k = indices[i_{0}, ..., i_{q-1\}\]` +It is an indexing operation that indexes into the input `data` along a single (specified) axis. +Each entry in `indices` produces an `r-1` dimensional slice of the input tensor. +The entire operation produces, conceptually, a `q`-dimensional tensor of `r-1` dimensional slices, +which is arranged into a `q + (r-1)`-dimensional tensor, with the `q` dimensions taking the +place of the original `axis` that is being indexed into. + +The following few examples illustrate how `Gather` works for specific shapes of `data`, +`indices`, and given value of `axis`: +| data shape | indices shape | axis | output shape | output equation | +| --- | --- | --- | --- | --- | +| (P, Q) | ( ) (a scalar) | 0 | (Q) | output[q] = data[indices, q] | +| (P, Q, R) | ( ) (a scalar) | 1 | (P, R) | output[p, r] = data[p, indices, r] | +| (P, Q) | (R, S) | 0 | (R, S, Q) | output[r, s, q] = data[indices[r, s], q] | +| (P, Q) | (R, S) | 1 | (P, R, S) | output[p, r, s] = data[p, indices[r, s]] | + +More generally, if `axis = 0`, let `k = indices[i_{0}, ..., i_{q-1\}\]` then `output[i_{0}, ..., i_{q-1}, j_{0}, ..., j_{r-2\}\] = input[k , j_{0}, ..., j_{r-2\}\]`: ``` @@ -3363,13 +3389,13 @@ Effects: `MemoryEffects::Effect{}` | Operand | Description | | :-----: | ----------- | -| `X` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values | +| `X` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values | #### Results: | Result | Description | | :----: | ----------- | -| `Y` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values | +| `Y` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values | ### `onnx.GlobalMaxPool` (ONNXGlobalMaxPoolOp) @@ -3724,7 +3750,7 @@ This operator transforms input according to y = scale * (x - mean) / sqrt(variance + epsilon) + bias, ``` where the mean and variance are computed per instance per group of channels, and -`scale` and `bias` should be specified for each group of channels. The number of +`scale` and `bias` should be specified for each channel. The number of groups `num_groups` should be divisible by the number of channels so that there are an equal number of channels per group. @@ -5427,19 +5453,23 @@ Effects: `MemoryEffects::Effect{}` _ONNX Mod operation_ -Performs element-wise binary modulus (with Numpy-style broadcasting support). - The sign of the remainder is the same as that of the Divisor. - - Mod operator can also behave like C fmod() or numpy.fmod. In this case, the sign of the remainder however, will be the same as the Dividend - (in contrast to integer mod). To force a behavior like numpy.fmod() an 'fmod' Attribute is provided. 
- This attribute is set to 0 by default causing the behavior to be like integer mod. - Setting this attribute to 1 causes the remainder to be calculated similar to that of numpy.fmod(). +Performs an element-wise binary modulo operation. +The semantics and supported data types depend on the value of the `fmod` attribute, which must be `0` (default) or `1`. - If the input type is floating point, then `fmod` attribute must be set to 1. +If the `fmod` attribute is set to `0`, `T` is constrained to integer data types and the semantics follow those of the Python `%`-operator. +The sign of the result is that of the divisor. - In case of dividend being zero, the results will be platform dependent. +If `fmod` is set to `1`, the behavior of this operator follows that of the `fmod` function in C and `T` is constrained to floating point data types. +The result of this operator is the remainder of the division operation `x / y` where `x` and `y` are respective elements of `A` and `B`. The result is exactly the value `x - n * y`, where `n` is `x / y` with its fractional part truncated. +The returned value has the same sign as `x` (except if `x` is `-0`) and is less than or equal to `|y|` in magnitude. +The following special cases apply when `fmod` is set to `1`: +- If `x` is `-0` and `y` is greater than zero, either `+0` or `-0` may be returned. +- If `x` is `±∞` and `y` is not `NaN`, `NaN` is returned. +- If `y` is `±0` and `x` is not `NaN`, `NaN` should be returned. +- If `y` is `±∞` and `x` is finite, `x` is returned. +- If either argument is `NaN`, `NaN` is returned. - This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md). +This operator supports **multidirectional (i.e., NumPy-style) broadcasting**; for more details please check [the doc](Broadcasting.md). Traits: `AlwaysSpeculatableImplTrait` @@ -9094,7 +9124,7 @@ The `output` is calculated via the following equation: output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = updates[idx] + output[tuple(indices[idx])] = updates[idx] ``` The order of iteration in the above loop is not specified. @@ -9111,7 +9141,7 @@ When `reduction` is set to some reduction function `f`, `output` is calculated a output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = f(output[indices[idx]], updates[idx]) + output[tuple(indices[idx])] = f(output[tuple(indices[idx])], updates[idx]) ``` where the `f` is `+`, `*`, `max` or `min` as specified. @@ -9355,7 +9385,7 @@ Construct an empty tensor sequence, with given data type. Traits: `AlwaysSpeculatableImplTrait` -Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` +Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ResultTypeInferenceOpInterface`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface` Effects: `MemoryEffects::Effect{}` @@ -9507,11 +9537,11 @@ If start axis is omitted, the slice starts from axis 0. The end axis, if specified, is exclusive (and the returned value will not include the size of that axis). If the end axis is omitted, the axes upto the last one will be included. Negative axes indicate counting back from the last axis. 
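The `fmod` distinction and the `tuple(...)` indexing in the ScatterND pseudocode above can be checked with a short Python sketch (illustrative; the operands are arbitrary):

```python
import math
import numpy as np

# fmod=0 matches Python's % operator: the result takes the divisor's sign.
# fmod=1 matches C's fmod: the result takes the dividend's sign.
print(-7 % 3)            # 2
print(math.fmod(-7, 3))  # -1.0

# ScatterND-style writes need tuple(): an index vector converted to a tuple
# addresses one element, while a plain array would trigger NumPy's advanced
# indexing and select whole rows instead.
output = np.zeros((2, 2))
idx = np.array([0, 1])
output[tuple(idx)] = 5.0   # writes the single element output[0, 1]
# output[idx] = 5.0        # would overwrite rows 0 and 1 instead
```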
-Note that axes will be clamped to the range [0, r-1], where r is the +Note that axes will be clamped to the range [0, r], where r is the rank of the input tensor if they are out-of-range (after adding r in the case of negative axis). Thus, specifying any end value > r is equivalent to specifying an end value of r, and specifying any start value < -r is equivalent to specifying a start -value of 0. +value of 0. If start > end, the result will be an empty shape. Examples: @@ -10702,9 +10732,16 @@ Effects: `MemoryEffects::Effect{}` _ONNX Transpose operation_ -Transpose the input tensor similar to numpy.transpose. For example, when -perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape -will be (2, 1, 3). +Returns a transpose of the input tensor. (Similar to `numpy.transpose`). +The optional attribute `perm` must be a permutation of the dimensions of +the input tensor. Axis `i` of the output tensor corresponds to the axis +`perm[i]` of the input tensor. +For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), +the output shape will be (2, 1, 3). +When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), +the output shape will be (2, 3, 1). +If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, +where `n` is the rank of the input tensor. Traits: `AlwaysSpeculatableImplTrait` diff --git a/docs/SupportedONNXOps-cpu.md b/docs/SupportedONNXOps-cpu.md index 4d24c79d95..e175a2f7c7 100644 --- a/docs/SupportedONNXOps-cpu.md +++ b/docs/SupportedONNXOps-cpu.md @@ -29,19 +29,19 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. Limitatio | **AveragePool** |6 - * | | | | **BatchNormalization** |6 - * |Training not supported. | | | **Bernoulli** |none | | | | -| **Binarizer** |1 - * | | | | -| **BitShift** |11 - * | | | | +| **Binarizer** |6 - * | | | +| **BitShift** |11 - * | | | | **BitwiseAnd** |18 - * | | | -| **BitwiseNot** |18 - * | Only supports signed integers | | +| **BitwiseNot** |18 - * | | | | **BitwiseOr** |18 - * | | | | **BitwiseXor** |18 - * | | | -| **BlackmanWindow** |17 - * | | | +| **BlackmanWindow** |17 - * | | | | **Cast** |6 - * |Cast only between float and double types. Only ppc64le and MacOS platforms support float16. Does not support int4 and uint4. | | | **CastLike** |19 - * |CastLike only between float and double types. Only ppc64le and MacOS platforms support float16. Does not support int4 and uint4. | | | **CastMap** |none | | | | | **CategoryMapper** |none | | | | | **Ceil** |6 - * | | | -| **Celu** |12 - * | | | | +| **Celu** |12 - * | | | | **CenterCropPad** |none | | | | | **Clip** |6 - * |No support for short integers. | | | **Col2Im** |none | | | | @@ -89,10 +89,10 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. Limitatio | **GreaterOrEqual** |12 - * | | | | **GridSample** |none | | | | | **GroupNormalization** |18 - * | | | -| **HammingWindow** |17 - * | | | -| **HannWindow** |17 - * | | | | +| **HammingWindow** |17 - * | | | +| **HannWindow** |17 - * | | | | **HardSigmoid** |6 - * | | | -| **HardSwish** |14 - * | | | | +| **HardSwish** |none | | | | | **Hardmax** |6 - * | | | | **Identity** |16 - * |Sequence identity not supported. Does not support int4 and uint4. | | | **If** |16 - * |Sequence and Optional outputs are not supported. Does not support int4 and uint4. | | @@ -112,7 +112,7 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. 
Limitatio | **Log** |6 - * | | | | **LogSoftmax** |13 - * |Axis 0, 1, and default currently disabled due to changes in ONNX 1.8.1/Opset 13. |Temporally removed due to changes in onnx 1.8.1. | | **Loop** |6 - * |Input must have static shape. Does not support int4 and uint4. | | -| **LpNormalization** |1 - * | | | | +| **LpNormalization** |none | | | | | **LpPool** |none | | | | **MatMul** |6 - * | | | | **MatMulInteger** |10 - * | | | @@ -121,10 +121,10 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. Limitatio | **MaxRoiPool** |none | | | | | **MaxUnpool** |none | | | | | **Mean** |6 - * | | | -| **MeanVarianceNormalization** |9 - * | | | | +| **MeanVarianceNormalization** |13 - * | | | | **MelWeightMatrix** |none | | | | | **Min** |6 - * |Does not support unsigned numbers. Only ppc64le and MacOS platforms support float16. | | -| **Mish** |18 - * | | | | +| **Mish** |18 - * | | | | **Mod** |10 - * |Support float and double only. Only ppc64le and MacOS platforms support float16. | | | **Momentum** |none | | | | | **Mul** |6 - * |Does not support short integers. | | @@ -150,8 +150,8 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. Limitatio | **RNN** |7 - * |W, B and R must be constants. | | | **RandomNormal** |none | | | | | **RandomNormalLike** |none | | | | -| **RandomUniform** |1 - * | | | | -| **RandomUniformLike** |1 - * | | | | +| **RandomUniform** |none | | | | +| **RandomUniformLike** |none | | | | | **Range** |11 - * | | | | **Reciprocal** |6 - * | | | | **ReduceL1** |13 - * |do_not_keep_dim not supported. | | @@ -187,7 +187,7 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. Limitatio | **SequenceLength** |none | | | | | **SequenceMap** |none | | | | | **Shape** |15 - * |Does not support start and end attributes. Does not support int4 and uint4. | | -| **Shrink** |9 - * | | | | +| **Shrink** |9 - * | | | | **Sigmoid** |6 - * | | | | **Sign** |9 - * | | | | **Sin** |7 - * | | | @@ -209,7 +209,7 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. Limitatio | **Tan** |7 - * | | | | **Tanh** |6 - * | | | | **TfIdfVectorizer** |none | | | | -| **ThresholdedRelu** |10 - * | | | | +| **ThresholdedRelu** |10 - * | | | | **Tile** |6 - * | | | | **TopK** |10 - * |`K`, the number of top elements to retrieve, must have static shape. | | | **Transpose** |6 - * |Does not support int4 and uint4. | | diff --git a/src/Builder/FrontendDialectTransformer.cpp b/src/Builder/FrontendDialectTransformer.cpp index 542e5caa9f..459bb369db 100644 --- a/src/Builder/FrontendDialectTransformer.cpp +++ b/src/Builder/FrontendDialectTransformer.cpp @@ -1310,10 +1310,12 @@ class FrontendGenImpl { OpsetImportsMap function_opset_map = GetOpsetImportsFromProto(functionProto); + // Pass default shape inference options explicitly to the updated InferShapes interface. + const onnx::ShapeInferenceOptions options{}; + // Populates graph.value_info(). onnx::shape_inference::InferShapes(&graph, function_opset_map, - onnx::OpSchemaRegistry::Instance(), - /*options=*/{}, in_model_functions_); + onnx::OpSchemaRegistry::Instance(), options, in_model_functions_); // Save caller context, while generating function body. 
ModelLocalFunctionsMap callerModelFunctions; diff --git a/src/Builder/OpBuildTable.inc b/src/Builder/OpBuildTable.inc index ed0c277f77..ea1ad61161 100644 --- a/src/Builder/OpBuildTable.inc +++ b/src/Builder/OpBuildTable.inc @@ -710,6 +710,7 @@ op_opsets_map_["Asin"] = {22, 7}; op_opsets_map_["Asinh"] = {22, 9}; op_opsets_map_["Atan"] = {22, 7}; op_opsets_map_["Atanh"] = {22, 9}; +op_opsets_map_["Attention"] = {24, 23}; op_opsets_map_["AveragePool"] = {22, 19, 11, 10, 7, 1}; op_opsets_map_["BatchNormalization"] = {15, 14, 9, 7, 6, 1}; op_opsets_map_["Bernoulli"] = {22, 15}; @@ -719,8 +720,8 @@ op_opsets_map_["BitwiseNot"] = {18}; op_opsets_map_["BitwiseOr"] = {18}; op_opsets_map_["BitwiseXor"] = {18}; op_opsets_map_["BlackmanWindow"] = {17}; -op_opsets_map_["Cast"] = {21, 19, 13, 9, 6, 1}; -op_opsets_map_["CastLike"] = {21, 19, 15}; +op_opsets_map_["Cast"] = {25, 24, 23, 21, 19, 13, 9, 6, 1}; +op_opsets_map_["CastLike"] = {25, 24, 23, 21, 19, 15}; op_opsets_map_["Ceil"] = {13, 6, 1}; op_opsets_map_["Celu"] = {12}; op_opsets_map_["CenterCropPad"] = {18}; @@ -729,8 +730,8 @@ op_opsets_map_["Col2Im"] = {18}; op_opsets_map_["Compress"] = {11, 9}; op_opsets_map_["Concat"] = {13, 11, 4, 1}; op_opsets_map_["ConcatFromSequence"] = {11}; -op_opsets_map_["Constant"] = {21, 19, 13, 12, 11, 9, 1}; -op_opsets_map_["ConstantOfShape"] = {21, 20, 9}; +op_opsets_map_["Constant"] = {25, 24, 23, 21, 19, 13, 12, 11, 9, 1}; +op_opsets_map_["ConstantOfShape"] = {25, 24, 23, 21, 20, 9}; op_opsets_map_["Conv"] = {22, 11, 1}; op_opsets_map_["ConvInteger"] = {10}; op_opsets_map_["ConvTranspose"] = {22, 11, 1}; @@ -740,7 +741,7 @@ op_opsets_map_["CumSum"] = {14, 11}; op_opsets_map_["DFT"] = {20, 17}; op_opsets_map_["DeformConv"] = {22, 19}; op_opsets_map_["DepthToSpace"] = {13, 11, 1}; -op_opsets_map_["DequantizeLinear"] = {21, 19, 13, 10}; +op_opsets_map_["DequantizeLinear"] = {25, 24, 23, 21, 19, 13, 10}; op_opsets_map_["Det"] = {22, 11}; op_opsets_map_["Div"] = {14, 13, 7, 6, 1}; op_opsets_map_["Dropout"] = {22, 13, 12, 10, 7, 6, 1}; @@ -752,7 +753,7 @@ op_opsets_map_["Erf"] = {13, 9}; op_opsets_map_["Exp"] = {13, 6, 1}; op_opsets_map_["Expand"] = {13, 8}; op_opsets_map_["EyeLike"] = {22, 9}; -op_opsets_map_["Flatten"] = {21, 13, 11, 9, 1}; +op_opsets_map_["Flatten"] = {25, 24, 23, 21, 13, 11, 9, 1}; op_opsets_map_["Floor"] = {13, 6, 1}; op_opsets_map_["GRU"] = {22, 14, 7, 3, 1}; op_opsets_map_["Gather"] = {13, 11, 1}; @@ -772,8 +773,8 @@ op_opsets_map_["HannWindow"] = {17}; op_opsets_map_["HardSigmoid"] = {22, 6, 1}; op_opsets_map_["HardSwish"] = {22, 14}; op_opsets_map_["Hardmax"] = {13, 11, 1}; -op_opsets_map_["Identity"] = {21, 19, 16, 14, 13, 1}; -op_opsets_map_["If"] = {21, 19, 16, 13, 11, 1}; +op_opsets_map_["Identity"] = {25, 24, 23, 21, 19, 16, 14, 13, 1}; +op_opsets_map_["If"] = {25, 24, 23, 21, 19, 16, 13, 11, 1}; op_opsets_map_["ImageDecoder"] = {20}; op_opsets_map_["InstanceNormalization"] = {22, 6, 1}; op_opsets_map_["IsInf"] = {20, 10}; @@ -786,7 +787,7 @@ op_opsets_map_["Less"] = {13, 9, 7, 1}; op_opsets_map_["LessOrEqual"] = {16, 12}; op_opsets_map_["Log"] = {13, 6, 1}; op_opsets_map_["LogSoftmax"] = {13, 11, 1}; -op_opsets_map_["Loop"] = {21, 19, 16, 13, 11, 1}; +op_opsets_map_["Loop"] = {25, 24, 23, 21, 19, 16, 13, 11, 1}; op_opsets_map_["LpNormalization"] = {22, 1}; op_opsets_map_["LpPool"] = {22, 18, 11, 2, 1}; op_opsets_map_["MatMul"] = {13, 9, 1}; @@ -814,11 +815,12 @@ op_opsets_map_["OptionalGetElement"] = {18, 15}; op_opsets_map_["OptionalHasElement"] = {18, 15}; op_opsets_map_["Or"] = {7, 
1}; op_opsets_map_["PRelu"] = {16, 9, 7, 6, 1}; -op_opsets_map_["Pad"] = {21, 19, 18, 13, 11, 2, 1}; +op_opsets_map_["Pad"] = {25, 24, 23, 21, 19, 18, 13, 11, 2, 1}; op_opsets_map_["Pow"] = {15, 13, 12, 7, 1}; op_opsets_map_["QLinearConv"] = {10}; op_opsets_map_["QLinearMatMul"] = {21, 10}; -op_opsets_map_["QuantizeLinear"] = {21, 19, 13, 10}; +op_opsets_map_["QuantizeLinear"] = {25, 24, 23, 21, 19, 13, 10}; +op_opsets_map_["RMSNormalization"] = {23}; op_opsets_map_["RNN"] = {22, 14, 7, 1}; op_opsets_map_["RandomNormal"] = {22, 1}; op_opsets_map_["RandomNormalLike"] = {22, 1}; @@ -838,13 +840,14 @@ op_opsets_map_["ReduceSum"] = {13, 11, 1}; op_opsets_map_["ReduceSumSquare"] = {18, 13, 11, 1}; op_opsets_map_["RegexFullMatch"] = {20}; op_opsets_map_["Relu"] = {14, 13, 6, 1}; -op_opsets_map_["Reshape"] = {21, 19, 14, 13, 5, 1}; +op_opsets_map_["Reshape"] = {25, 24, 23, 21, 19, 14, 13, 5, 1}; op_opsets_map_["Resize"] = {19, 18, 13, 11, 10}; op_opsets_map_["ReverseSequence"] = {10}; op_opsets_map_["RoiAlign"] = {22, 16, 10}; +op_opsets_map_["RotaryEmbedding"] = {23}; op_opsets_map_["Round"] = {22, 11}; op_opsets_map_["STFT"] = {17}; -op_opsets_map_["Scan"] = {21, 19, 16, 11, 9, 8}; +op_opsets_map_["Scan"] = {25, 24, 23, 21, 19, 16, 11, 9, 8}; op_opsets_map_["Scatter"] = {11, 9}; op_opsets_map_["ScatterElements"] = {18, 16, 13, 11}; op_opsets_map_["ScatterND"] = {18, 16, 13, 11}; @@ -856,13 +859,13 @@ op_opsets_map_["SequenceErase"] = {11}; op_opsets_map_["SequenceInsert"] = {11}; op_opsets_map_["SequenceLength"] = {11}; op_opsets_map_["SequenceMap"] = {17}; -op_opsets_map_["Shape"] = {21, 19, 15, 13, 1}; +op_opsets_map_["Shape"] = {25, 24, 23, 21, 19, 15, 13, 1}; op_opsets_map_["Shrink"] = {9}; op_opsets_map_["Sigmoid"] = {13, 6, 1}; op_opsets_map_["Sign"] = {13, 9}; op_opsets_map_["Sin"] = {22, 7}; op_opsets_map_["Sinh"] = {22, 9}; -op_opsets_map_["Size"] = {21, 19, 13, 1}; +op_opsets_map_["Size"] = {25, 24, 23, 21, 19, 13, 1}; op_opsets_map_["Slice"] = {13, 11, 10, 1}; op_opsets_map_["Softmax"] = {13, 11, 1}; op_opsets_map_["SoftmaxCrossEntropyLoss"] = {13, 12}; @@ -870,24 +873,26 @@ op_opsets_map_["Softplus"] = {22, 1}; op_opsets_map_["Softsign"] = {22, 1}; op_opsets_map_["SpaceToDepth"] = {13, 1}; op_opsets_map_["Split"] = {18, 13, 11, 2, 1}; -op_opsets_map_["SplitToSequence"] = {11}; +op_opsets_map_["SplitToSequence"] = {24, 11}; op_opsets_map_["Sqrt"] = {13, 6, 1}; -op_opsets_map_["Squeeze"] = {21, 13, 11, 1}; +op_opsets_map_["Squeeze"] = {25, 24, 23, 21, 13, 11, 1}; op_opsets_map_["StringConcat"] = {20}; op_opsets_map_["StringNormalizer"] = {10}; op_opsets_map_["StringSplit"] = {20}; op_opsets_map_["Sub"] = {14, 13, 7, 6, 1}; op_opsets_map_["Sum"] = {13, 8, 6, 1}; +op_opsets_map_["Swish"] = {24}; op_opsets_map_["Tan"] = {22, 7}; op_opsets_map_["Tanh"] = {13, 6, 1}; +op_opsets_map_["TensorScatter"] = {24}; op_opsets_map_["TfIdfVectorizer"] = {9}; op_opsets_map_["ThresholdedRelu"] = {22, 10}; op_opsets_map_["Tile"] = {13, 6, 1}; -op_opsets_map_["TopK"] = {11, 10, 1}; -op_opsets_map_["Transpose"] = {21, 13, 1}; +op_opsets_map_["TopK"] = {24, 11, 10, 1}; +op_opsets_map_["Transpose"] = {25, 24, 23, 21, 13, 1}; op_opsets_map_["Trilu"] = {14}; op_opsets_map_["Unique"] = {11}; -op_opsets_map_["Unsqueeze"] = {21, 13, 11, 1}; +op_opsets_map_["Unsqueeze"] = {25, 24, 23, 21, 13, 11, 1}; op_opsets_map_["Upsample"] = {10, 9, 7, 1}; op_opsets_map_["Where"] = {16, 9}; op_opsets_map_["Xor"] = {7, 1}; diff --git a/src/Dialect/ONNX/ONNXOps.td.inc b/src/Dialect/ONNX/ONNXOps.td.inc index 
7db77ab5a6..48754ee007 100644 --- a/src/Dialect/ONNX/ONNXOps.td.inc +++ b/src/Dialect/ONNX/ONNXOps.td.inc @@ -841,28 +841,31 @@ def ONNXCastOp:ONNX_Op<"Cast", to the following rules. `[x]` means the value rounded to the target mantissa width. - | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | - |------|----|----|----|----| - | 0 | 0 | 0 | 0 | 0 | - |-0 | -0 | 0 | -0 | 0 | - | NaN | NaN | NaN | NaN | NaN | - | +/- Inf | +/- FLT_MAX | NaN | FLT_MAX | NaN | - | [x] > FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | - | [x] < -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | - | else | RNE | RNE | RNE | RNE | + | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | + | ----------------- | -------- | -------- | -------- | -------- | + | 0 | 0 | 0 | 0 | 0 | + | -0 | -0 | 0 | -0 | 0 | + | NaN | NaN | NaN | NaN | NaN | + | Inf | FLT_MAX | NaN | FLT_MAX | NaN | + | -Inf | -FLT_MAX | NaN | -FLT_MAX | NaN | + | \[x\] > FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | + | \[x\] \< -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | + | else | RNE | RNE | RNE | RNE | The behavior changes if the parameter 'saturate' is set to False. The rules then become: - | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | - |------|----|----|----|----| - | 0 | 0 | 0 | 0 | 0 | - |-0 | -0 | 0 | -0 | 0 | - | NaN | NaN | NaN | NaN | NaN | - | +/- Inf | NaN | NaN | +/- Inf | NaN | - | [x] > FLT_MAX | NaN | NaN | Inf | NaN | - | [x] < -FLT_MAX | NaN | NaN | -Inf | NaN | - | else | RNE | RNE | RNE | RNE | + | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | + | ----------------- | ------ | -------- | ---- | -------- | + | 0 | 0 | 0 | 0 | 0 | + | -0 | -0 | 0 | -0 | 0 | + | NaN | NaN | NaN | NaN | NaN | + | -NaN | -NaN | NaN | -NaN | NaN | + | Inf | NaN | NaN | Inf | NaN | + | -Inf | -NaN | NaN | -Inf | NaN | + | \[x\] > FLT_MAX | NaN | NaN | Inf | NaN | + | \[x\] \< -FLT_MAX | NaN | NaN | -Inf | NaN | + | else | RNE | RNE | RNE | RNE | }]; let arguments = (ins AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I1]>, TensorOf<[StringType]>, TensorOf<[BF16]>, TensorOf<[F8E4M3FN]>, TensorOf<[F8E4M3FNUZ]>, TensorOf<[F8E5M2]>, TensorOf<[F8E5M2FNUZ]>, TensorOf<[UI<4>]>, TensorOf<[I<4>]>]>:$input, DefaultValuedAttr<SI64Attr, "1">:$saturate, @@ -1003,12 +1006,18 @@ def ONNXCenterCropPadOp:ONNX_Op<"CenterCropPad", let description = [{ Center crop or pad an input to given dimensions. - The crop/pad dimensions can be specified for a subset of the `axes`. Non-specified dimensions will not be - cropped or padded. + The crop/pad dimensions can be specified for a subset of the `axes`; unspecified dimensions will remain unchanged. - If the input dimensions are bigger than the crop shape, a centered cropping window is extracted from the input. - If the input dimensions are smaller than the crop shape, the input is padded on each side equally, - so that the input is centered in the output. + If the input dimensions are larger than the target crop dimensions, a centered cropping window will be extracted + from the input. The starting value for the cropping window is rounded down, which means that if the difference + between the input shape and the crop shape is odd, the cropping window will be shifted half a pixel to the left + of the input center. + + If the input dimensions are smaller than the target crop dimensions, the input will be padded equally on both sides + to center it in the output. 
In cases where the total number of padding pixels is odd, an additional pixel will be + added to the right side. + + The padding value used is zero. }]; let arguments = (ins AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[StringType]>, TensorOf<[I1]>, TensorOf<[Complex<F32>]>, TensorOf<[Complex<F64>]>]>:$input_data, AnyTypeOf<[TensorOf<[I32]>, TensorOf<[I64]>]>:$shape, @@ -1043,6 +1052,8 @@ def ONNXClipOp:ONNX_Op<"Clip", Clip operator limits the given input within an interval. The interval is specified by the inputs 'min' and 'max'. They default to numeric_limits::lowest() and numeric_limits::max(), respectively. + When 'min' is greater than 'max', the clip operator sets all the 'input' values to + the value of 'max'. Thus, this is equivalent to 'Min(max, Max(input, min))'. }]; let arguments = (ins AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$input, AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>, NoneType]>:$min, @@ -2016,7 +2027,7 @@ def ONNXDynamicQuantizeLinearOp:ONNX_Op<"DynamicQuantizeLinear", Zero point is calculated as: ``` intermediate_zero_point = qmin - min(x)/y_scale - y_zero_point = cast(round(saturate(itermediate_zero_point))) + y_zero_point = cast(round(saturate(intermediate_zero_point))) ``` * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8 @@ -2207,8 +2218,8 @@ def ONNXErfOp:ONNX_Op<"Erf", let description = [{ Computes the error function of the given input tensor element-wise. }]; - let arguments = (ins AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$input); - let results = (outs AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$output); + let arguments = (ins AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$input); + let results = (outs AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$output); let extraClassDeclaration = [{ static int getNumberOfOperands() { return 1; @@ -2507,7 +2518,22 @@ def ONNXGatherOp:ONNX_Op<"Gather", entries of the axis dimension of `data` (by default outer-most one as axis=0) indexed by `indices`, and concatenates them in an output tensor of rank q + (r - 1). - If `axis = 0`, let `k = indices[i_{0}, ..., i_{q-1\}\]` + It is an indexing operation that indexes into the input `data` along a single (specified) axis. + Each entry in `indices` produces an `r-1` dimensional slice of the input tensor. + The entire operation produces, conceptually, a `q`-dimensional tensor of `r-1` dimensional slices, + which is arranged into a `q + (r-1)`-dimensional tensor, with the `q` dimensions taking the + place of the original `axis` that is being indexed into. 
+ + The following few examples illustrate how `Gather` works for specific shapes of `data`, + `indices`, and given value of `axis`: + | data shape | indices shape | axis | output shape | output equation | + | --- | --- | --- | --- | --- | + | (P, Q) | ( ) (a scalar) | 0 | (Q) | output[q] = data[indices, q] | + | (P, Q, R) | ( ) (a scalar) | 1 | (P, R) | output[p, r] = data[p, indices, r] | + | (P, Q) | (R, S) | 0 | (R, S, Q) | output[r, s, q] = data[indices[r, s], q] | + | (P, Q) | (R, S) | 1 | (P, R, S) | output[p, r, s] = data[p, indices[r, s]] | + + More generally, if `axis = 0`, let `k = indices[i_{0}, ..., i_{q-1\}\]` then `output[i_{0}, ..., i_{q-1}, j_{0}, ..., j_{r-2\}\] = input[k , j_{0}, ..., j_{r-2\}\]`: ``` @@ -2898,9 +2924,9 @@ def ONNXGlobalLpPoolOp:ONNX_Op<"GlobalLpPool", the values in the same channel. This is equivalent to LpPool with kernel size equal to the spatial dimension of input tensor. }]; - let arguments = (ins AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$X, + let arguments = (ins AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$X, DefaultValuedAttr<SI64Attr, "2">:$p); - let results = (outs AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$Y); + let results = (outs AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$Y); let extraClassDeclaration = [{ static int getNumberOfOperands() { return 1; @@ -3176,7 +3202,7 @@ def ONNXGroupNormalizationOp:ONNX_Op<"GroupNormalization", y = scale * (x - mean) / sqrt(variance + epsilon) + bias, ``` where the mean and variance are computed per instance per group of channels, and - `scale` and `bias` should be specified for each group of channels. The number of + `scale` and `bias` should be specified for each channel. The number of groups `num_groups` should be divisible by the number of channels so that there are an equal number of channels per group. @@ -4722,19 +4748,23 @@ def ONNXModOp:ONNX_Op<"Mod", [Pure, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>, DeclareOpInterfaceMethods<ShapeHelperOpInterface>]> { let summary = "ONNX Mod operation"; let description = [{ - Performs element-wise binary modulus (with Numpy-style broadcasting support). - The sign of the remainder is the same as that of the Divisor. - - Mod operator can also behave like C fmod() or numpy.fmod. In this case, the sign of the remainder however, will be the same as the Dividend - (in contrast to integer mod). To force a behavior like numpy.fmod() an 'fmod' Attribute is provided. - This attribute is set to 0 by default causing the behavior to be like integer mod. - Setting this attribute to 1 causes the remainder to be calculated similar to that of numpy.fmod(). + Performs an element-wise binary modulo operation. + The semantics and supported data types depend on the value of the `fmod` attribute, which must be `0` (default) or `1`. - If the input type is floating point, then `fmod` attribute must be set to 1. + If the `fmod` attribute is set to `0`, `T` is constrained to integer data types and the semantics follow those of the Python `%`-operator. + The sign of the result is that of the divisor. - In case of dividend being zero, the results will be platform dependent. + If `fmod` is set to `1`, the behavior of this operator follows that of the `fmod` function in C and `T` is constrained to floating point data types. + The result of this operator is the remainder of the division operation `x / y` where `x` and `y` are respective elements of `A` and `B`. 
The result is exactly the value `x - n * y`, where `n` is `x / y` with its fractional part truncated. + The returned value has the same sign as `x` (except if `x` is `-0`) and is less than or equal to `|y|` in magnitude. + The following special cases apply when `fmod` is set to `1`: + - If `x` is `-0` and `y` is greater than zero, either `+0` or `-0` may be returned. + - If `x` is `±∞` and `y` is not `NaN`, `NaN` is returned. + - If `y` is `±0` and `x` is not `NaN`, `NaN` should be returned. + - If `y` is `±∞` and `x` is finite, `x` is returned. + - If either argument is `NaN`, `NaN` is returned. - This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md). + This operator supports **multidirectional (i.e., NumPy-style) broadcasting**; for more details please check [the doc](Broadcasting.md). }]; let arguments = (ins AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$A, AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$B, @@ -8160,7 +8190,7 @@ def ONNXScatterNDOp:ONNX_Op<"ScatterND", output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = updates[idx] + output[tuple(indices[idx])] = updates[idx] ``` The order of iteration in the above loop is not specified. @@ -8177,7 +8207,7 @@ def ONNXScatterNDOp:ONNX_Op<"ScatterND", output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = f(output[indices[idx]], updates[idx]) + output[tuple(indices[idx])] = f(output[tuple(indices[idx])], updates[idx]) ``` where the `f` is `+`, `*`, `max` or `min` as specified. @@ -8519,11 +8549,11 @@ def ONNXShapeOp:ONNX_Op<"Shape", The end axis, if specified, is exclusive (and the returned value will not include the size of that axis). If the end axis is omitted, the axes upto the last one will be included. Negative axes indicate counting back from the last axis. - Note that axes will be clamped to the range [0, r-1], where r is the + Note that axes will be clamped to the range [0, r], where r is the rank of the input tensor if they are out-of-range (after adding r in the case of negative axis). Thus, specifying any end value > r is equivalent to specifying an end value of r, and specifying any start value < -r is equivalent to specifying a start - value of 0. + value of 0. If start > end, the result will be an empty shape. Examples: @@ -9781,9 +9811,16 @@ def ONNXTransposeOp:ONNX_Op<"Transpose", let hasCanonicalizer = 1; let summary = "ONNX Transpose operation"; let description = [{ - Transpose the input tensor similar to numpy.transpose. For example, when - perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape - will be (2, 1, 3). + Returns a transpose of the input tensor. (Similar to `numpy.transpose`). + The optional attribute `perm` must be a permutation of the dimensions of + the input tensor. Axis `i` of the output tensor corresponds to the axis + `perm[i]` of the input tensor. + For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 1, 3). 
+ When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 3, 1). + If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, + where `n` is the rank of the input tensor. }]; let arguments = (ins AnyTypeOf<[TensorOf<[UI8]>, TensorOf<[UI16]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I8]>, TensorOf<[I16]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[StringType]>, TensorOf<[I1]>, TensorOf<[Complex<F32>]>, TensorOf<[Complex<F64>]>, TensorOf<[F8E4M3FN]>, TensorOf<[F8E4M3FNUZ]>, TensorOf<[F8E5M2]>, TensorOf<[F8E5M2FNUZ]>, TensorOf<[UI<4>]>, TensorOf<[I<4>]>]>:$data, OptionalAttr<I64ArrayAttr>:$perm); diff --git a/test/backend/all_test_names.txt b/test/backend/all_test_names.txt index 0daf5403ea..904a88d282 100644 --- a/test/backend/all_test_names.txt +++ b/test/backend/all_test_names.txt @@ -1,5 +1,5 @@ # This file is automatically generated by "make check-onnx-backend-case" -# From onnx 1.17.0 +# From onnx 1.20.1 # All test cases for cpu target test_bvlc_alexnet_cpu test_densenet121_cpu @@ -21,6 +21,11 @@ test_adam_cpu test_adam_multiple_cpu test_add_bcast_cpu test_add_cpu +test_add_int16_cpu +test_add_int8_cpu +test_add_uint16_cpu +test_add_uint32_cpu +test_add_uint64_cpu test_add_uint8_cpu test_affine_grid_2d_align_corners_cpu test_affine_grid_2d_align_corners_expanded_cpu @@ -86,8 +91,133 @@ test_atan_cpu test_atan_example_cpu test_atanh_cpu test_atanh_example_cpu +test_attention_3d_attn_mask_cpu +test_attention_3d_attn_mask_expanded_cpu +test_attention_3d_causal_cpu +test_attention_3d_causal_expanded_cpu +test_attention_3d_cpu +test_attention_3d_diff_heads_sizes_attn_mask_cpu +test_attention_3d_diff_heads_sizes_attn_mask_expanded_cpu +test_attention_3d_diff_heads_sizes_causal_cpu +test_attention_3d_diff_heads_sizes_causal_expanded_cpu +test_attention_3d_diff_heads_sizes_cpu +test_attention_3d_diff_heads_sizes_expanded_cpu +test_attention_3d_diff_heads_sizes_scaled_cpu +test_attention_3d_diff_heads_sizes_scaled_expanded_cpu +test_attention_3d_diff_heads_sizes_softcap_cpu +test_attention_3d_diff_heads_sizes_softcap_expanded_cpu +test_attention_3d_diff_heads_with_past_and_present_cpu +test_attention_3d_diff_heads_with_past_and_present_expanded_cpu +test_attention_3d_expanded_cpu +test_attention_3d_gqa_attn_mask_cpu +test_attention_3d_gqa_attn_mask_expanded_cpu +test_attention_3d_gqa_causal_cpu +test_attention_3d_gqa_causal_expanded_cpu +test_attention_3d_gqa_cpu +test_attention_3d_gqa_expanded_cpu +test_attention_3d_gqa_scaled_cpu +test_attention_3d_gqa_scaled_expanded_cpu +test_attention_3d_gqa_softcap_cpu +test_attention_3d_gqa_softcap_expanded_cpu +test_attention_3d_gqa_with_past_and_present_cpu +test_attention_3d_gqa_with_past_and_present_expanded_cpu +test_attention_3d_scaled_cpu +test_attention_3d_scaled_expanded_cpu +test_attention_3d_softcap_cpu +test_attention_3d_softcap_expanded_cpu +test_attention_3d_transpose_verification_cpu +test_attention_3d_transpose_verification_expanded_cpu +test_attention_3d_with_past_and_present_cpu +test_attention_3d_with_past_and_present_expanded_cpu +test_attention_3d_with_past_and_present_qk_matmul_bias_cpu +test_attention_3d_with_past_and_present_qk_matmul_bias_expanded_cpu +test_attention_3d_with_past_and_present_qk_matmul_cpu +test_attention_3d_with_past_and_present_qk_matmul_expanded_cpu +test_attention_3d_with_past_and_present_qk_matmul_softcap_cpu +test_attention_3d_with_past_and_present_qk_matmul_softcap_expanded_cpu 
+test_attention_3d_with_past_and_present_qk_matmul_softmax_cpu +test_attention_3d_with_past_and_present_qk_matmul_softmax_expanded_cpu +test_attention_4d_attn_mask_3d_causal_cpu +test_attention_4d_attn_mask_3d_causal_expanded_cpu +test_attention_4d_attn_mask_3d_cpu +test_attention_4d_attn_mask_3d_expanded_cpu +test_attention_4d_attn_mask_4d_causal_cpu +test_attention_4d_attn_mask_4d_causal_expanded_cpu +test_attention_4d_attn_mask_4d_cpu +test_attention_4d_attn_mask_4d_expanded_cpu +test_attention_4d_attn_mask_bool_4d_cpu +test_attention_4d_attn_mask_bool_4d_expanded_cpu +test_attention_4d_attn_mask_bool_cpu +test_attention_4d_attn_mask_bool_expanded_cpu +test_attention_4d_attn_mask_cpu +test_attention_4d_attn_mask_expanded_cpu +test_attention_4d_causal_cpu +test_attention_4d_causal_expanded_cpu +test_attention_4d_cpu +test_attention_4d_diff_heads_mask4d_padded_kv_cpu +test_attention_4d_diff_heads_mask4d_padded_kv_expanded_cpu +test_attention_4d_diff_heads_sizes_attn_mask_cpu +test_attention_4d_diff_heads_sizes_attn_mask_expanded_cpu +test_attention_4d_diff_heads_sizes_causal_cpu +test_attention_4d_diff_heads_sizes_causal_expanded_cpu +test_attention_4d_diff_heads_sizes_cpu +test_attention_4d_diff_heads_sizes_expanded_cpu +test_attention_4d_diff_heads_sizes_scaled_cpu +test_attention_4d_diff_heads_sizes_scaled_expanded_cpu +test_attention_4d_diff_heads_sizes_softcap_cpu +test_attention_4d_diff_heads_sizes_softcap_expanded_cpu +test_attention_4d_diff_heads_with_past_and_present_cpu +test_attention_4d_diff_heads_with_past_and_present_expanded_cpu +test_attention_4d_diff_heads_with_past_and_present_mask3d_cpu +test_attention_4d_diff_heads_with_past_and_present_mask3d_expanded_cpu +test_attention_4d_diff_heads_with_past_and_present_mask4d_cpu +test_attention_4d_diff_heads_with_past_and_present_mask4d_expanded_cpu +test_attention_4d_expanded_cpu +test_attention_4d_fp16_cpu +test_attention_4d_fp16_expanded_cpu +test_attention_4d_gqa_attn_mask_cpu +test_attention_4d_gqa_attn_mask_expanded_cpu +test_attention_4d_gqa_causal_cpu +test_attention_4d_gqa_causal_expanded_cpu +test_attention_4d_gqa_cpu +test_attention_4d_gqa_expanded_cpu +test_attention_4d_gqa_scaled_cpu +test_attention_4d_gqa_scaled_expanded_cpu +test_attention_4d_gqa_softcap_cpu +test_attention_4d_gqa_softcap_expanded_cpu +test_attention_4d_gqa_with_past_and_present_cpu +test_attention_4d_gqa_with_past_and_present_expanded_cpu +test_attention_4d_gqa_with_past_and_present_fp16_cpu +test_attention_4d_gqa_with_past_and_present_fp16_expanded_cpu +test_attention_4d_scaled_cpu +test_attention_4d_scaled_expanded_cpu +test_attention_4d_softcap_cpu +test_attention_4d_softcap_expanded_cpu +test_attention_4d_with_past_and_present_cpu +test_attention_4d_with_past_and_present_expanded_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_expanded_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal_expanded_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_expanded_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_cpu +test_attention_4d_with_past_and_present_qk_matmul_bias_expanded_cpu 
+test_attention_4d_with_past_and_present_qk_matmul_cpu +test_attention_4d_with_past_and_present_qk_matmul_expanded_cpu +test_attention_4d_with_qk_matmul_bias_cpu +test_attention_4d_with_qk_matmul_bias_expanded_cpu +test_attention_4d_with_qk_matmul_cpu +test_attention_4d_with_qk_matmul_expanded_cpu +test_attention_4d_with_qk_matmul_softcap_cpu +test_attention_4d_with_qk_matmul_softcap_expanded_cpu +test_attention_4d_with_qk_matmul_softmax_cpu +test_attention_4d_with_qk_matmul_softmax_expanded_cpu test_averagepool_1d_default_cpu test_averagepool_2d_ceil_cpu +test_averagepool_2d_ceil_last_window_starts_on_pad_cpu test_averagepool_2d_default_cpu test_averagepool_2d_dilations_cpu test_averagepool_2d_pads_count_include_pad_cpu @@ -150,13 +280,18 @@ test_cast_BFLOAT16_to_FLOAT_cpu test_cast_DOUBLE_to_FLOAT16_cpu test_cast_DOUBLE_to_FLOAT_cpu test_cast_FLOAT16_to_DOUBLE_cpu +test_cast_FLOAT16_to_FLOAT4E2M1_cpu test_cast_FLOAT16_to_FLOAT8E4M3FNUZ_cpu test_cast_FLOAT16_to_FLOAT8E4M3FN_cpu test_cast_FLOAT16_to_FLOAT8E5M2FNUZ_cpu test_cast_FLOAT16_to_FLOAT8E5M2_cpu test_cast_FLOAT16_to_FLOAT_cpu +test_cast_FLOAT16_to_INT2_cpu test_cast_FLOAT16_to_INT4_cpu +test_cast_FLOAT16_to_UINT2_cpu test_cast_FLOAT16_to_UINT4_cpu +test_cast_FLOAT4E2M1_to_FLOAT16_cpu +test_cast_FLOAT4E2M1_to_FLOAT_cpu test_cast_FLOAT8E4M3FNUZ_to_FLOAT16_cpu test_cast_FLOAT8E4M3FNUZ_to_FLOAT_cpu test_cast_FLOAT8E4M3FN_to_FLOAT16_cpu @@ -168,20 +303,31 @@ test_cast_FLOAT8E5M2_to_FLOAT_cpu test_cast_FLOAT_to_BFLOAT16_cpu test_cast_FLOAT_to_DOUBLE_cpu test_cast_FLOAT_to_FLOAT16_cpu +test_cast_FLOAT_to_FLOAT4E2M1_cpu test_cast_FLOAT_to_FLOAT8E4M3FNUZ_cpu test_cast_FLOAT_to_FLOAT8E4M3FN_cpu test_cast_FLOAT_to_FLOAT8E5M2FNUZ_cpu test_cast_FLOAT_to_FLOAT8E5M2_cpu +test_cast_FLOAT_to_INT2_cpu test_cast_FLOAT_to_INT4_cpu -test_cast_FLOAT_to_STRING_cpu +test_cast_FLOAT_to_UINT2_cpu test_cast_FLOAT_to_UINT4_cpu +test_cast_INT2_to_FLOAT16_cpu +test_cast_INT2_to_FLOAT_cpu +test_cast_INT2_to_INT8_cpu test_cast_INT4_to_FLOAT16_cpu test_cast_INT4_to_FLOAT_cpu test_cast_INT4_to_INT8_cpu -test_cast_STRING_to_FLOAT_cpu +test_cast_UINT2_to_FLOAT16_cpu +test_cast_UINT2_to_FLOAT_cpu +test_cast_UINT2_to_UINT8_cpu test_cast_UINT4_to_FLOAT16_cpu test_cast_UINT4_to_FLOAT_cpu test_cast_UINT4_to_UINT8_cpu +test_cast_e8m0_FLOAT16_to_FLOAT8E8M0_cpu +test_cast_e8m0_FLOAT8E8M0_to_FLOAT16_cpu +test_cast_e8m0_FLOAT8E8M0_to_FLOAT_cpu +test_cast_e8m0_FLOAT_to_FLOAT8E8M0_cpu test_cast_no_saturate_FLOAT16_to_FLOAT8E4M3FNUZ_cpu test_cast_no_saturate_FLOAT16_to_FLOAT8E4M3FN_cpu test_cast_no_saturate_FLOAT16_to_FLOAT8E5M2FNUZ_cpu @@ -198,14 +344,44 @@ test_castlike_DOUBLE_to_FLOAT_cpu test_castlike_DOUBLE_to_FLOAT_expanded_cpu test_castlike_FLOAT16_to_DOUBLE_cpu test_castlike_FLOAT16_to_DOUBLE_expanded_cpu +test_castlike_FLOAT16_to_FLOAT4E2M1_cpu +test_castlike_FLOAT16_to_FLOAT4E2M1_expanded_cpu +test_castlike_FLOAT16_to_FLOAT8E4M3FNUZ_cpu +test_castlike_FLOAT16_to_FLOAT8E4M3FNUZ_expanded_cpu +test_castlike_FLOAT16_to_FLOAT8E4M3FN_cpu +test_castlike_FLOAT16_to_FLOAT8E4M3FN_expanded_cpu +test_castlike_FLOAT16_to_FLOAT8E5M2FNUZ_cpu +test_castlike_FLOAT16_to_FLOAT8E5M2FNUZ_expanded_cpu +test_castlike_FLOAT16_to_FLOAT8E5M2_cpu +test_castlike_FLOAT16_to_FLOAT8E5M2_expanded_cpu test_castlike_FLOAT16_to_FLOAT_cpu test_castlike_FLOAT16_to_FLOAT_expanded_cpu +test_castlike_FLOAT16_to_INT2_cpu +test_castlike_FLOAT16_to_INT2_expanded_cpu +test_castlike_FLOAT16_to_INT4_cpu +test_castlike_FLOAT16_to_INT4_expanded_cpu +test_castlike_FLOAT16_to_UINT2_cpu 
+test_castlike_FLOAT16_to_UINT2_expanded_cpu +test_castlike_FLOAT16_to_UINT4_cpu +test_castlike_FLOAT16_to_UINT4_expanded_cpu +test_castlike_FLOAT4E2M1_to_FLOAT16_cpu +test_castlike_FLOAT4E2M1_to_FLOAT16_expanded_cpu +test_castlike_FLOAT4E2M1_to_FLOAT_cpu +test_castlike_FLOAT4E2M1_to_FLOAT_expanded_cpu +test_castlike_FLOAT8E4M3FNUZ_to_FLOAT16_cpu +test_castlike_FLOAT8E4M3FNUZ_to_FLOAT16_expanded_cpu test_castlike_FLOAT8E4M3FNUZ_to_FLOAT_cpu test_castlike_FLOAT8E4M3FNUZ_to_FLOAT_expanded_cpu +test_castlike_FLOAT8E4M3FN_to_FLOAT16_cpu +test_castlike_FLOAT8E4M3FN_to_FLOAT16_expanded_cpu test_castlike_FLOAT8E4M3FN_to_FLOAT_cpu test_castlike_FLOAT8E4M3FN_to_FLOAT_expanded_cpu +test_castlike_FLOAT8E5M2FNUZ_to_FLOAT16_cpu +test_castlike_FLOAT8E5M2FNUZ_to_FLOAT16_expanded_cpu test_castlike_FLOAT8E5M2FNUZ_to_FLOAT_cpu test_castlike_FLOAT8E5M2FNUZ_to_FLOAT_expanded_cpu +test_castlike_FLOAT8E5M2_to_FLOAT16_cpu +test_castlike_FLOAT8E5M2_to_FLOAT16_expanded_cpu test_castlike_FLOAT8E5M2_to_FLOAT_cpu test_castlike_FLOAT8E5M2_to_FLOAT_expanded_cpu test_castlike_FLOAT_to_BFLOAT16_cpu @@ -214,6 +390,8 @@ test_castlike_FLOAT_to_DOUBLE_cpu test_castlike_FLOAT_to_DOUBLE_expanded_cpu test_castlike_FLOAT_to_FLOAT16_cpu test_castlike_FLOAT_to_FLOAT16_expanded_cpu +test_castlike_FLOAT_to_FLOAT4E2M1_cpu +test_castlike_FLOAT_to_FLOAT4E2M1_expanded_cpu test_castlike_FLOAT_to_FLOAT8E4M3FNUZ_cpu test_castlike_FLOAT_to_FLOAT8E4M3FNUZ_expanded_cpu test_castlike_FLOAT_to_FLOAT8E4M3FN_cpu @@ -222,10 +400,54 @@ test_castlike_FLOAT_to_FLOAT8E5M2FNUZ_cpu test_castlike_FLOAT_to_FLOAT8E5M2FNUZ_expanded_cpu test_castlike_FLOAT_to_FLOAT8E5M2_cpu test_castlike_FLOAT_to_FLOAT8E5M2_expanded_cpu -test_castlike_FLOAT_to_STRING_cpu -test_castlike_FLOAT_to_STRING_expanded_cpu -test_castlike_STRING_to_FLOAT_cpu -test_castlike_STRING_to_FLOAT_expanded_cpu +test_castlike_FLOAT_to_INT2_cpu +test_castlike_FLOAT_to_INT2_expanded_cpu +test_castlike_FLOAT_to_INT4_cpu +test_castlike_FLOAT_to_INT4_expanded_cpu +test_castlike_FLOAT_to_UINT2_cpu +test_castlike_FLOAT_to_UINT2_expanded_cpu +test_castlike_FLOAT_to_UINT4_cpu +test_castlike_FLOAT_to_UINT4_expanded_cpu +test_castlike_INT2_to_FLOAT16_cpu +test_castlike_INT2_to_FLOAT16_expanded_cpu +test_castlike_INT2_to_FLOAT_cpu +test_castlike_INT2_to_FLOAT_expanded_cpu +test_castlike_INT2_to_INT8_cpu +test_castlike_INT2_to_INT8_expanded_cpu +test_castlike_INT4_to_FLOAT16_cpu +test_castlike_INT4_to_FLOAT16_expanded_cpu +test_castlike_INT4_to_FLOAT_cpu +test_castlike_INT4_to_FLOAT_expanded_cpu +test_castlike_INT4_to_INT8_cpu +test_castlike_INT4_to_INT8_expanded_cpu +test_castlike_UINT2_to_FLOAT16_cpu +test_castlike_UINT2_to_FLOAT16_expanded_cpu +test_castlike_UINT2_to_FLOAT_cpu +test_castlike_UINT2_to_FLOAT_expanded_cpu +test_castlike_UINT2_to_UINT8_cpu +test_castlike_UINT2_to_UINT8_expanded_cpu +test_castlike_UINT4_to_FLOAT16_cpu +test_castlike_UINT4_to_FLOAT16_expanded_cpu +test_castlike_UINT4_to_FLOAT_cpu +test_castlike_UINT4_to_FLOAT_expanded_cpu +test_castlike_UINT4_to_UINT8_cpu +test_castlike_UINT4_to_UINT8_expanded_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E4M3FNUZ_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E4M3FNUZ_expanded_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E4M3FN_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E4M3FN_expanded_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E5M2FNUZ_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E5M2FNUZ_expanded_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E5M2_cpu +test_castlike_no_saturate_FLOAT16_to_FLOAT8E5M2_expanded_cpu 
+test_castlike_no_saturate_FLOAT_to_FLOAT8E4M3FNUZ_cpu +test_castlike_no_saturate_FLOAT_to_FLOAT8E4M3FNUZ_expanded_cpu +test_castlike_no_saturate_FLOAT_to_FLOAT8E4M3FN_cpu +test_castlike_no_saturate_FLOAT_to_FLOAT8E4M3FN_expanded_cpu +test_castlike_no_saturate_FLOAT_to_FLOAT8E5M2FNUZ_cpu +test_castlike_no_saturate_FLOAT_to_FLOAT8E5M2FNUZ_expanded_cpu +test_castlike_no_saturate_FLOAT_to_FLOAT8E5M2_cpu +test_castlike_no_saturate_FLOAT_to_FLOAT8E5M2_expanded_cpu test_ceil_cpu test_ceil_example_cpu test_celu_cpu @@ -260,6 +482,8 @@ test_clip_example_expanded_cpu test_clip_expanded_cpu test_clip_inbounds_cpu test_clip_inbounds_expanded_cpu +test_clip_min_greater_than_max_cpu +test_clip_min_greater_than_max_expanded_cpu test_clip_outbounds_cpu test_clip_outbounds_expanded_cpu test_clip_splitbounds_cpu @@ -315,10 +539,12 @@ test_cosh_cpu test_cosh_example_cpu test_cumsum_1d_cpu test_cumsum_1d_exclusive_cpu +test_cumsum_1d_int32_exclusive_cpu test_cumsum_1d_reverse_cpu test_cumsum_1d_reverse_exclusive_cpu test_cumsum_2d_axis_0_cpu test_cumsum_2d_axis_1_cpu +test_cumsum_2d_int32_cpu test_cumsum_2d_negative_axis_cpu test_deform_conv_with_mask_bias_cpu test_deform_conv_with_multiple_offset_groups_cpu @@ -331,9 +557,12 @@ test_dequantizelinear_e4m3fn_cpu test_dequantizelinear_e4m3fn_float16_cpu test_dequantizelinear_e4m3fn_zero_point_cpu test_dequantizelinear_e5m2_cpu +test_dequantizelinear_float4e2m1_cpu test_dequantizelinear_int16_cpu +test_dequantizelinear_int2_cpu test_dequantizelinear_int4_cpu test_dequantizelinear_uint16_cpu +test_dequantizelinear_uint2_cpu test_dequantizelinear_uint4_cpu test_det_2d_cpu test_det_nd_cpu @@ -346,6 +575,11 @@ test_dft_opset19_cpu test_div_bcast_cpu test_div_cpu test_div_example_cpu +test_div_int16_cpu +test_div_int8_cpu +test_div_uint16_cpu +test_div_uint32_cpu +test_div_uint64_cpu test_div_uint8_cpu test_dropout_default_cpu test_dropout_default_mask_cpu @@ -363,6 +597,7 @@ test_edge_pad_cpu test_einsum_batch_diagonal_cpu test_einsum_batch_matmul_cpu test_einsum_inner_prod_cpu +test_einsum_scalar_cpu test_einsum_sum_cpu test_einsum_transpose_cpu test_elu_cpu @@ -373,8 +608,14 @@ test_elu_example_expanded_ver18_cpu test_elu_expanded_ver18_cpu test_equal_bcast_cpu test_equal_cpu +test_equal_int16_cpu +test_equal_int8_cpu test_equal_string_broadcast_cpu test_equal_string_cpu +test_equal_uint16_cpu +test_equal_uint32_cpu +test_equal_uint64_cpu +test_equal_uint8_cpu test_erf_cpu test_exp_cpu test_exp_example_cpu @@ -433,6 +674,24 @@ test_greater_equal_bcast_cpu test_greater_equal_bcast_expanded_cpu test_greater_equal_cpu test_greater_equal_expanded_cpu +test_greater_equal_int16_cpu +test_greater_equal_int16_expanded_cpu +test_greater_equal_int8_cpu +test_greater_equal_int8_expanded_cpu +test_greater_equal_uint16_cpu +test_greater_equal_uint16_expanded_cpu +test_greater_equal_uint32_cpu +test_greater_equal_uint32_expanded_cpu +test_greater_equal_uint64_cpu +test_greater_equal_uint64_expanded_cpu +test_greater_equal_uint8_cpu +test_greater_equal_uint8_expanded_cpu +test_greater_int16_cpu +test_greater_int8_cpu +test_greater_uint16_cpu +test_greater_uint32_cpu +test_greater_uint64_cpu +test_greater_uint8_cpu test_gridsample_aligncorners_true_cpu test_gridsample_bicubic_align_corners_0_additional_1_cpu test_gridsample_bicubic_align_corners_1_additional_1_cpu @@ -505,6 +764,11 @@ test_isinf_negative_cpu test_isinf_positive_cpu test_isnan_cpu test_isnan_float16_cpu +test_l1normalization_axis_0_cpu +test_l1normalization_axis_1_cpu +test_l1normalization_axis_last_cpu 
+test_l2normalization_axis_0_cpu +test_l2normalization_axis_1_cpu test_layer_normalization_2d_axis0_cpu test_layer_normalization_2d_axis0_expanded_cpu test_layer_normalization_2d_axis0_expanded_ver18_cpu @@ -574,6 +838,24 @@ test_less_equal_bcast_cpu test_less_equal_bcast_expanded_cpu test_less_equal_cpu test_less_equal_expanded_cpu +test_less_equal_int16_cpu +test_less_equal_int16_expanded_cpu +test_less_equal_int8_cpu +test_less_equal_int8_expanded_cpu +test_less_equal_uint16_cpu +test_less_equal_uint16_expanded_cpu +test_less_equal_uint32_cpu +test_less_equal_uint32_expanded_cpu +test_less_equal_uint64_cpu +test_less_equal_uint64_expanded_cpu +test_less_equal_uint8_cpu +test_less_equal_uint8_expanded_cpu +test_less_int16_cpu +test_less_int8_cpu +test_less_uint16_cpu +test_less_uint32_cpu +test_less_uint64_cpu +test_less_uint8_cpu test_log_cpu test_log_example_cpu test_logsoftmax_axis_0_cpu @@ -600,6 +882,7 @@ test_logsoftmax_negative_axis_expanded_ver18_cpu test_loop11_cpu test_loop13_seq_cpu test_loop16_seq_none_cpu +test_lpnormalization_default_cpu test_lppool_1d_default_cpu test_lppool_2d_default_cpu test_lppool_2d_dilations_cpu @@ -614,9 +897,13 @@ test_lstm_batchwise_cpu test_lstm_defaults_cpu test_lstm_with_initial_bias_cpu test_lstm_with_peepholes_cpu +test_matmul_1d_1d_cpu +test_matmul_1d_3d_cpu test_matmul_2d_cpu test_matmul_3d_cpu +test_matmul_4d_1d_cpu test_matmul_4d_cpu +test_matmul_bcast_cpu test_matmulinteger_cpu test_max_example_cpu test_max_float16_cpu @@ -691,6 +978,11 @@ test_momentum_multiple_cpu test_mul_bcast_cpu test_mul_cpu test_mul_example_cpu +test_mul_int16_cpu +test_mul_int8_cpu +test_mul_uint16_cpu +test_mul_uint32_cpu +test_mul_uint64_cpu test_mul_uint8_cpu test_mvn_cpu test_mvn_expanded_cpu @@ -803,9 +1095,12 @@ test_quantizelinear_blocked_symmetric_cpu test_quantizelinear_cpu test_quantizelinear_e4m3fn_cpu test_quantizelinear_e5m2_cpu +test_quantizelinear_float4e2m1_cpu test_quantizelinear_int16_cpu +test_quantizelinear_int2_cpu test_quantizelinear_int4_cpu test_quantizelinear_uint16_cpu +test_quantizelinear_uint2_cpu test_quantizelinear_uint4_cpu test_range_float_type_positive_delta_cpu test_range_float_type_positive_delta_expanded_cpu @@ -1001,10 +1296,64 @@ test_resize_upsample_sizes_nearest_not_smaller_cpu test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric_cpu test_reversesequence_batch_cpu test_reversesequence_time_cpu +test_rms_normalization_2d_axis0_cpu +test_rms_normalization_2d_axis0_expanded_cpu +test_rms_normalization_2d_axis1_cpu +test_rms_normalization_2d_axis1_expanded_cpu +test_rms_normalization_2d_axis_negative_1_cpu +test_rms_normalization_2d_axis_negative_1_expanded_cpu +test_rms_normalization_2d_axis_negative_2_cpu +test_rms_normalization_2d_axis_negative_2_expanded_cpu +test_rms_normalization_3d_axis0_epsilon_cpu +test_rms_normalization_3d_axis0_epsilon_expanded_cpu +test_rms_normalization_3d_axis1_epsilon_cpu +test_rms_normalization_3d_axis1_epsilon_expanded_cpu +test_rms_normalization_3d_axis2_epsilon_cpu +test_rms_normalization_3d_axis2_epsilon_expanded_cpu +test_rms_normalization_3d_axis_negative_1_epsilon_cpu +test_rms_normalization_3d_axis_negative_1_epsilon_expanded_cpu +test_rms_normalization_3d_axis_negative_2_epsilon_cpu +test_rms_normalization_3d_axis_negative_2_epsilon_expanded_cpu +test_rms_normalization_3d_axis_negative_3_epsilon_cpu +test_rms_normalization_3d_axis_negative_3_epsilon_expanded_cpu +test_rms_normalization_4d_axis0_cpu +test_rms_normalization_4d_axis0_expanded_cpu 
+test_rms_normalization_4d_axis1_cpu +test_rms_normalization_4d_axis1_expanded_cpu +test_rms_normalization_4d_axis2_cpu +test_rms_normalization_4d_axis2_expanded_cpu +test_rms_normalization_4d_axis3_cpu +test_rms_normalization_4d_axis3_expanded_cpu +test_rms_normalization_4d_axis_negative_1_cpu +test_rms_normalization_4d_axis_negative_1_expanded_cpu +test_rms_normalization_4d_axis_negative_2_cpu +test_rms_normalization_4d_axis_negative_2_expanded_cpu +test_rms_normalization_4d_axis_negative_3_cpu +test_rms_normalization_4d_axis_negative_3_expanded_cpu +test_rms_normalization_4d_axis_negative_4_cpu +test_rms_normalization_4d_axis_negative_4_expanded_cpu +test_rms_normalization_default_axis_cpu +test_rms_normalization_default_axis_expanded_cpu test_rnn_seq_length_cpu test_roialign_aligned_false_cpu test_roialign_aligned_true_cpu test_roialign_mode_max_cpu +test_rotary_embedding_3d_input_cpu +test_rotary_embedding_3d_input_expanded_cpu +test_rotary_embedding_cpu +test_rotary_embedding_expanded_cpu +test_rotary_embedding_interleaved_cpu +test_rotary_embedding_interleaved_expanded_cpu +test_rotary_embedding_no_position_ids_cpu +test_rotary_embedding_no_position_ids_expanded_cpu +test_rotary_embedding_no_position_ids_interleaved_cpu +test_rotary_embedding_no_position_ids_interleaved_expanded_cpu +test_rotary_embedding_no_position_ids_rotary_dim_cpu +test_rotary_embedding_no_position_ids_rotary_dim_expanded_cpu +test_rotary_embedding_with_interleaved_rotary_dim_cpu +test_rotary_embedding_with_interleaved_rotary_dim_expanded_cpu +test_rotary_embedding_with_rotary_dim_cpu +test_rotary_embedding_with_rotary_dim_expanded_cpu test_round_cpu test_scan9_sum_cpu test_scan_sum_cpu @@ -1118,6 +1467,7 @@ test_shape_example_cpu test_shape_start_1_cpu test_shape_start_1_end_2_cpu test_shape_start_1_end_negative_1_cpu +test_shape_start_greater_than_end_cpu test_shape_start_negative_1_cpu test_shrink_hard_cpu test_shrink_hard_expanded_ver18_cpu @@ -1219,14 +1569,24 @@ test_strnormalizer_nostopwords_nochangecase_cpu test_sub_bcast_cpu test_sub_cpu test_sub_example_cpu +test_sub_int16_cpu +test_sub_int8_cpu +test_sub_uint16_cpu +test_sub_uint32_cpu +test_sub_uint64_cpu test_sub_uint8_cpu test_sum_example_cpu test_sum_one_input_cpu test_sum_two_inputs_cpu +test_swish_cpu +test_swish_expanded_cpu test_tan_cpu test_tan_example_cpu test_tanh_cpu test_tanh_example_cpu +test_tensorscatter_3d_cpu +test_tensorscatter_circular_cpu +test_tensorscatter_cpu test_tfidfvectorizer_tf_batch_onlybigrams_skip0_cpu test_tfidfvectorizer_tf_batch_onlybigrams_skip5_cpu test_tfidfvectorizer_tf_batch_uniandbigrams_skip5_cpu @@ -1244,7 +1604,11 @@ test_tile_cpu test_tile_precomputed_cpu test_top_k_cpu test_top_k_negative_axis_cpu +test_top_k_same_values_2d_cpu +test_top_k_same_values_cpu +test_top_k_same_values_largest_cpu test_top_k_smallest_cpu +test_top_k_uint64_cpu test_training_dropout_cpu test_training_dropout_default_cpu test_training_dropout_default_mask_cpu @@ -1276,6 +1640,7 @@ test_triu_pos_cpu test_triu_square_cpu test_triu_square_neg_cpu test_triu_zero_cpu +test_unique_length_1_cpu test_unique_not_sorted_without_axis_cpu test_unique_sorted_with_axis_3d_cpu test_unique_sorted_with_axis_cpu @@ -1299,4 +1664,4 @@ test_xor_bcast3v1d_cpu test_xor_bcast3v2d_cpu test_xor_bcast4v2d_cpu test_xor_bcast4v3d_cpu -test_xor_bcast4v4d_cpu +test_xor_bcast4v4d_cpu \ No newline at end of file diff --git a/test/backend/inference_backend.py b/test/backend/inference_backend.py index 8966abae1b..39201745c1 100644 --- 
a/test/backend/inference_backend.py +++ b/test/backend/inference_backend.py @@ -365,7 +365,7 @@ def get_test_models(): DYNAMIC_SHAPE: {-1: {-1}}, CONSTANT_INPUT: {-1}, }, - # ==OP== Bitshift + # ==OP== BitShift # ==MIN== 11 "test_bitshift_right_uint8_cpu": { STATIC_SHAPE: {}, diff --git a/third_party/onnx b/third_party/onnx index b8baa84466..d3f6b795ae 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit b8baa8446686496da4cc8fda09f2b6fe65c2a02c +Subproject commit d3f6b795aedb48eaecc881bf5e8f5dd6efbe25b3 diff --git a/utils/gen_onnx_mlir.py b/utils/gen_onnx_mlir.py index 258a8b92a8..447deac17a 100755 --- a/utils/gen_onnx_mlir.py +++ b/utils/gen_onnx_mlir.py @@ -66,7 +66,7 @@ # ==UPDATE_ONNX_VERSION_OPSET== # Look for tag above and update all references when upgrading the ONNX support within ONNX-MLIR. -current_onnx_version = "1.17.0" +current_onnx_version = "1.20.1" # Check the version of onnx package being used. if ( diff --git a/utils/pre-onnx-mlir.py b/utils/pre-onnx-mlir.py index 54f8b6f517..949e3b4eef 100644 --- a/utils/pre-onnx-mlir.py +++ b/utils/pre-onnx-mlir.py @@ -40,7 +40,7 @@ # ==UPDATE_ONNX_VERSION_OPSET== # Look for tag above and update all references when upgrading the ONNX support within ONNX-MLIR. # To update all occurrence of the current ONNX opset, please grep "CURRENT_ONNX_OPSET" and update all locations accordingly. -current_onnx_opset = 22 +current_onnx_opset = 25 converted_model = version_converter.convert_version(original_model, current_onnx_opset)
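The newly enabled backend tests in the long hunk above only run once they are registered in get_test_models() in test/backend/inference_backend.py, following the pattern visible in the test_bitshift_right_uint8_cpu entry. Below is a minimal, self-contained sketch of that pattern; the test_swish_cpu entry and the constant values here are illustrative assumptions, not the file's actual definitions.

# Sketch of the registration pattern used in get_test_models();
# the constants below stand in for the real definitions in
# test/backend/inference_backend.py, and test_swish_cpu is an
# illustrative example entry.
STATIC_SHAPE = "static"
DYNAMIC_SHAPE = "dynamic"
CONSTANT_INPUT = "constant"

test_to_enable = {
    "test_swish_cpu": {
        STATIC_SHAPE: {},
        DYNAMIC_SHAPE: {-1: {-1}},  # all inputs, all dimensions treated as dynamic
        CONSTANT_INPUT: {-1},       # all inputs may be supplied as constants
    },
}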
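The bumped constants in the last two hunks are exercised through onnx's standard version-conversion API, as the trailing context line in utils/pre-onnx-mlir.py shows. A minimal sketch of that flow, assuming a local model.onnx (the file names are illustrative, and the two constants mirror current_onnx_version and current_onnx_opset as set above):

import onnx
from onnx import version_converter

expected_onnx_version = "1.20.1"  # mirrors current_onnx_version above
target_opset = 25                 # mirrors current_onnx_opset above

# Warn when the installed onnx package diverges from the supported version.
if onnx.__version__ != expected_onnx_version:
    print(f"warning: onnx {onnx.__version__} installed, "
          f"{expected_onnx_version} expected")

original_model = onnx.load("model.onnx")  # illustrative input path

# Upgrade the model's opset imports to the supported opset.
converted_model = version_converter.convert_version(original_model, target_opset)

# Sanity-check and save the upgraded model.
onnx.checker.check_model(converted_model)
onnx.save(converted_model, "model_opset25.onnx")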