diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 58e32fd..d9a2bf6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,7 +3,6 @@ name: CI on: push: - branches: [ "main" ] pull_request: branches: [ "main" ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 17275e1..1d7b05d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -274,8 +274,13 @@ set(BENCH_KERNLES_FILES ) set(SRC_INTERFACE_FILES - TensorUtils.h + Contraction.cpp + Einsum.cpp + Einsum.h + Gemm.cpp Tensor.cpp + TensorUtils.h + Unary.cpp ) set(TEST_INTERFACE_FILES @@ -326,6 +331,8 @@ endforeach() # ==== Public headers of the installed library ==== set(public_headers include/${PROJECT_NAME}/Tensor.h + include/${PROJECT_NAME}/Error.h + include/${PROJECT_NAME}/UnaryType.h ) list(APPEND TEST_FILEPATHS "${INTERFACE_FILEPATHS}" "${public_headers}") @@ -436,7 +443,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC # using the project name as additional directory to include /header.h instead of header.h if it is included as internal library # where top-level project will look for the library's public headers - $ + $ # where external projects will look for the library's public headers $ ) @@ -471,6 +478,7 @@ install(EXPORT "${PROJECT_NAME}Targets" NAMESPACE ${namespace}:: DESTINATION cmake ) +add_library(mlc::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) include(CMakePackageConfigHelpers) diff --git a/README.md b/README.md new file mode 100644 index 0000000..787c50f --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +# Machine Learning Compilers + +This repository was created as part of the **Machine Learning Compilers** lecture and lab at Friedrich Schiller University Jena during the summer term 2025. While the lecture focused on theoretical concepts, the lab had a practical orientation, with the goal of implementing a domain-specific compiler for tensor expressions. 
+ +The main objective of the lab was to build a Just-In-Time (JIT) compiler from scratch that supports a variety of tensor operations. Tensor compilers automate the transformation of tensor expressions into executable code, aiming for high throughput, low latency, short compile times, flexibility and portability. + +The lab involved weekly tasks that guided the development of this compiler. The corresponding code and implementations are part of this repository. + +## Overview + +This repository includes: + +- Implementations of all lab tasks +- Source code of a functional JIT compiler for tensor operations +- Modular code structured for reuse and extensibility + +The weekly tasks from the lab can be found here: [scalable-analyses](https://github.com/scalable-analyses/pbtc/tree/main/lab) + +## Technical Documentation + +A detailed technical documentation of our implementation including the design decisions and solutions to the lab tasks, and explanations of the source code is available on our [project website](https://integer-ctrl.github.io/machine-learning-compilers/). + +## CMake Library + +To make the compiler easy to integrate into other projects, we structured it as a CMake library. This allows users to include and build upon our functionality directly in their own CMake-based projects. More details about the library and how to use it can be found in the [user-guide.md](https://github.com/Integer-Ctrl/machine-learning-compilers/cmake-library/user-guide.md). diff --git a/cmake-library/example-project/CMakeLists.txt b/cmake-library/example-project/CMakeLists.txt new file mode 100644 index 0000000..0b850c1 --- /dev/null +++ b/cmake-library/example-project/CMakeLists.txt @@ -0,0 +1,58 @@ +cmake_minimum_required(VERSION 3.28.0) +project(ExampleProject VERSION 0.1.0 LANGUAGES C CXX ASM) + +# The MachineLearningCompiler library is only supported on Linux on arm. 
+if(NOT (UNIX AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm)")) + message(FATAL_ERROR "Only arm on Linux is supported.") +endif() + + +# Set default build type to Release if not specified +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif() + +get_property(IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(IS_MULTI_CONFIG) + message(NOTICE "Using multi-config generator. Compile with: cmake --build . --config [Debug|Release] --target ") +else() + message(NOTICE "Using single-config generator. Generate with: cmake .. -DCMAKE_BUILD_TYPE=[Debug|Release]") + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") + message(WARNING "No Build type is set. Using Release!") + endif() +endif() + +message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}") + + +# =========================================== +# Include the MachineLearningCompiler Library +# =========================================== + +# Option 1: Including the MachineLearningCompiler Library + +# Optional: Toggles if included libraries is build as shared or static libraries. Default is ON. +set(BUILD_SHARED_LIBS ON) + +# Optional: Toggles if OpenMP should be used by the library. Default is ON. +set(MLC_USE_OPENMP ON) + +Include(FetchContent) +FetchContent_Declare( + MachineLearningCompiler + GIT_REPOSITORY https://github.com/Integer-Ctrl/machine-learning-compilers + GIT_TAG individual-phase #TODO change + EXCLUDE_FROM_ALL +) +FetchContent_MakeAvailable(MachineLearningCompiler) + +# Option 2: Include it from the the current machine if installed. 
+# find_library(mlc::MachineLearningCompiler) + +# =========================================== + +add_executable(example + Example.cpp +) +target_link_libraries(example mlc::MachineLearningCompiler) \ No newline at end of file diff --git a/cmake-library/example-project/Example.cpp b/cmake-library/example-project/Example.cpp new file mode 100644 index 0000000..7a801ce --- /dev/null +++ b/cmake-library/example-project/Example.cpp @@ -0,0 +1,355 @@ +#include +#include + +/** + * Tensor object examples. + */ +void example_tensor() +{ + // Define tensors with different dimensions. The memory is allocated automatically based on the given dimensions and filled with zeros. + mlc::Tensor tensor1D({5}); // 1D tensor with 5 elements + mlc::Tensor tensor2D({3, 4}); // 2D tensor with 3 rows and 4 columns + mlc::Tensor tensor3D({2, 3, 4}); // 3D tensor with 2 layers, 3 rows and 4 columns + + // Define a tensor with data + float data1[] = {1, 2, 3, 4, 5}; + float data2[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + mlc::Tensor tensorWithData1(data1, {2, 2}); // 2x2 tensor with specific data + mlc::Tensor tensorWIthData2(data2, {3, 2, 2}); // 3D tensor with specific data + + // Print dimensions and sizes of the tensors + std::cout << "Tensor 1D dim sizes: "; + for (const auto &dim : tensor1D.dim_sizes) + { + std::cout << dim << " "; + } + std::cout << std::endl; + std::cout << "Tensor 2D dim sizes: "; + for (const auto &dim : tensor2D.dim_sizes) + { + std::cout << dim << " "; + } + std::cout << std::endl; + + // Print the sizes of the tensors + std::cout << "Tensor 1D Size: " << tensor1D.size() << std::endl; + std::cout << "Tensor 2D Size: " << tensor2D.size() << std::endl; + std::cout << "Tensor 3D Size: " << tensor3D.size() << std::endl; + std::cout << "Tensor with Data 1 Size: " << tensorWithData1.size() << std::endl; + std::cout << "Tensor with Data 2 Size: " << tensorWIthData2.size() << std::endl; + + // Print the strides of the tensors + std::cout << "Tensor 1D Strides: "; + 
for (const auto &stride : tensor1D.strides) + { + std::cout << stride << " "; + } + std::cout << std::endl; + std::cout << "Tensor 2D Strides: "; + for (const auto &stride : tensor2D.strides) + { + std::cout << stride << " "; + } + std::cout << std::endl; + + // Print the tensors to the console + std::cout << tensor1D.to_string("Tensor 1D") << std::endl; + std::cout << tensor2D.to_string("Tensor 2D") << std::endl; + std::cout << tensor3D.to_string("Tensor 3D") << std::endl; + std::cout << tensorWithData1.to_string("Tensor with Data 1") << std::endl; + std::cout << tensorWIthData2.to_string("Tensor with Data 2") << std::endl; +} + +/** + * Methods that can be used to fill a tensor. + */ +void example_fill() +{ + // Fill the memory of the tensors with random values + mlc::Tensor tensorRandom({3, 3}); + mlc::fill_random(tensorRandom); + std::cout << tensorRandom.to_string("Random") << std::endl; + + // Fill the memory of the tensors with all 1s. + mlc::Tensor tensorSingleNumber({3, 3}); + mlc::fill_number(tensorSingleNumber, 1.43); + std::cout << tensorSingleNumber.to_string("Ones") << std::endl; + + // Fill the memory of the tensors with counting upwards data starting from 0. + mlc::Tensor tensorCountingUp({3, 3}); + mlc::fill_counting_up(tensorCountingUp, 0, 1.0); + std::cout << tensorCountingUp.to_string("Counting Up") << std::endl; + + // Fill the memory of the tensors with counting downwards data starting from 5. + mlc::Tensor tensorCountingDown({3, 3}); + mlc::fill_counting_down(tensorCountingDown, 5, 0.1); + std::cout << tensorCountingDown.to_string("Counting Down") << std::endl; + + // Fill the memory of the tensor based on a user defined expression. The tensor itself and current index of the data that is currently + // filled are given as additional parameter. 
+ // Here the tensor is filled with 1 2 3, 1 2 3, 1 2 3 + mlc::Tensor tensorLambda({3, 3}); + mlc::fill_lambda(tensorLambda, + [](const mlc::Tensor &self, size_t index) { return index % self.strides[0] + 1; }); + std::cout << tensorLambda.to_string("Lambda 1 2 3") << std::endl; + + // We can also fill the tensor using outside defined variable. + size_t size = tensorLambda.size(); + mlc::fill_lambda(tensorLambda, [&size](const mlc::Tensor &self, size_t index) { return size; }); + std::cout << tensorLambda.to_string("Lambda Outside") << std::endl; +} + +/** + * A GEneral Matrix Matrix multiplication requires the tensors to be in a matrix shape i.e. exactly 2 dimensions. + */ +void example_gemm() +{ + mlc::Tensor in0({5, 3}); // IDs: 0,1 + mlc::Tensor in1({2, 5}); // IDs: 2,0 + mlc::Tensor out({2, 3}); // IDs: 2,1 + + // Fill the memory of the tensors with random values + mlc::fill_counting_up(in0, 0, 1); + mlc::fill_counting_up(in1, 0, 1); + + mlc::Error error = mlc::gemm(in0, in1, out); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + return; + } + + std::cout << in0.to_string("in0") << std::endl; + std::cout << in1.to_string("in1") << std::endl; + std::cout << out.to_string("out") << std::endl; +} + +/** + * A unary operation zero, identity and ReLU can be performed on a Tensor. 
+ */ +void example_unary() +{ + // Performs a zero unary + mlc::Tensor tensorZero({3, 3}); + mlc::fill_random(tensorZero); + mlc::Error error = mlc::unary_zero(tensorZero); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + return; + } + std::cout << tensorZero.to_string("Unary Zero") << std::endl; + + // Performs a identity unary + mlc::Tensor tensorIdentityIn({3, 3}); + mlc::Tensor tensorIdentityOut({3, 3}); + mlc::fill_random(tensorIdentityIn); + mlc::fill_number(tensorIdentityOut, 0); + error = mlc::unary_identity(tensorIdentityIn, tensorIdentityOut); // identity = copy from input to output + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + return; + } + std::cout << tensorIdentityOut.to_string("Unary Identity Input") << std::endl; + std::cout << tensorIdentityOut.to_string("Unary Identity Output") << std::endl; + + // Performs a ReLU unary + mlc::Tensor tensorReluIn({3, 3}); + mlc::Tensor tensorReluOut({3, 3}); + // Fills even indices with positive and odd indices with negative numbers + mlc::fill_lambda(tensorReluIn, + [](const mlc::Tensor &, int64_t index) { return index * (2 * (index % 2) - 1); }); + mlc::fill_number(tensorReluOut, 0); + error = mlc::unary_relu(tensorReluIn, tensorReluOut); // ReLU = max(x, 0) + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + return; + } + std::cout << tensorReluIn.to_string("Unary ReLU Input") << std::endl; + std::cout << tensorReluOut.to_string("Unary ReLU Output") << std::endl; +} + +/** + * A contraction of two tensors and add the result to the output. 
+ */ +void example_contraction() +{ + mlc::Tensor in0({5, 4, 3}); // IDs: 0,1,2 + mlc::Tensor in1({5, 2, 4}); // IDs: 3,4,1 + mlc::Tensor out({5, 5, 2, 3}); // IDs: 0,3,4,2 + + mlc::fill_counting_up(in0, 0, 1); + mlc::fill_counting_down(in1, 0, 1); + mlc::fill_number(out, 1'000'000); + + mlc::Error error = mlc::contraction(in0, in1, out, "[0,1,2],[3,4,1]->[0,3,4,2]"); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + return; + } + + std::cout << in0.to_string("in0") << std::endl; + std::cout << in1.to_string("in1") << std::endl; + std::cout << out.to_string("out") << std::endl; +} + +/** + * A contraction of two tensors with unarys that are executed before (first touch) or after (last touch) the contraction on the output + * tensor. + */ +void example_contraction_first_last_touch() +{ + mlc::Tensor in0({5, 4, 3}); // IDs: 0,1,2 + mlc::Tensor in1({5, 2, 4}); // IDs: 3,4,1 + mlc::Tensor out({5, 5, 2, 3}); // IDs: 0,3,4,2 + + mlc::fill_counting_up(in0, 0, 1); + mlc::fill_counting_down(in1, 20, 1); + // The out is default initialized with zeros. + + mlc::Error error = mlc::contraction(in0, in1, out, "[0,1,2],[3,4,1]->[0,3,4,2]", mlc::UnaryType::None, mlc::UnaryType::ReLU); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + return; + } + + std::cout << in0.to_string("in0") << std::endl; + std::cout << in1.to_string("in1") << std::endl; + std::cout << out.to_string("out") << std::endl; +} + +/** + * A simple einsum operation on three input tensors. The result is added to the output. + */ +void example_einsum() +{ + mlc::Tensor in0({5, 3}); // IDs: 0,1 + mlc::Tensor in1({2, 5}); // IDs: 2,0 + mlc::Tensor in2({3, 7}); // IDs: 1,3 + mlc::Tensor out({2, 7}); // IDs: 2,3 + + mlc::fill_counting_up(in0, 0, 1); + mlc::fill_number(in1, 1); + mlc::fill_counting_down(in2, 0, 1); + mlc::fill_number(out, 1'000); + + // Execute the defined einsum tree on the tensors. 
+ mlc::Error error = mlc::einsum({in0, in1, in2}, out, "[[0,1],[2,0]->[2,1]],[1,3]->[2,3]"); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + return; + } + + std::cout << in0.to_string("in0") << std::endl; + std::cout << in1.to_string("in1") << std::endl; + std::cout << in2.to_string("in2") << std::endl; + std::cout << out.to_string("out") << std::endl; +} + +/** + * A einsum expression that is first defined by the shapes of the input and ouput tensors and can be multiple time called on any input and + * output tensors that matches the same shape. This can be used to save the costs to setup and optimize the given einsum tree. The result is + * added to the output. + */ +void example_einsum_operation() +{ + mlc::Tensor in0({5, 3}); // IDs: 0,1 + mlc::Tensor in1({2, 5}); // IDs: 2,0 + mlc::Tensor in2({3, 7}); // IDs: 1,3 + mlc::Tensor out({2, 7}); // IDs: 2,3 + + mlc::fill_counting_down(in0, 0, 1); + mlc::fill_number(in1, 1); + mlc::fill_counting_down(in2, 0, 0.5); + mlc::fill_number(out, 1'000); + + // Generates a tensor operation with fixed input and ouput tensor shapes. + mlc::TensorOperation *op = + mlc::einsum_operation({in0.dim_sizes, in1.dim_sizes, in2.dim_sizes}, out.dim_sizes, "[[0,1],[2,0]->[2,1]],[1,3]->[2,3]"); + + // Process any error that may occurs during the setup of the operation. + mlc::Error error = op->getSetupError(); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + delete op; + return; + } + + // Execute the operation and check for any error that can happen during execution. 
+ error = op->execute({in0, in1, in2}, out); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + delete op; + return; + } + + std::cout << in0.to_string("in0") << std::endl; + std::cout << in1.to_string("in1") << std::endl; + std::cout << in2.to_string("in2") << std::endl; + std::cout << out.to_string("out") << std::endl; + + // Create new tensors of the same shape. + mlc::Tensor in0_2(in0.dim_sizes); // IDs: 0,1 + mlc::Tensor in1_2(in1.dim_sizes); // IDs: 2,0 + mlc::Tensor in2_2(in2.dim_sizes); // IDs: 1,3 + mlc::Tensor out_2(out.dim_sizes); // IDs: 2,3 + + mlc::fill_counting_up(in0_2, 10.5f, 33); + mlc::fill_number(in1_2, 13); + mlc::fill_counting_down(in2_2, 5, 2); + mlc::fill_number(out_2, -111); + + // Execute the operation again but on different tensors of the same size. + error = op->execute({in0_2, in1_2, in2_2}, out_2); + if (error.type != mlc::ErrorType::None) + { + std::cout << error.message << std::endl; + delete op; + return; + } + + std::cout << in0_2.to_string("in0_2") << std::endl; + std::cout << in1_2.to_string("in1_2") << std::endl; + std::cout << in2_2.to_string("in2_2") << std::endl; + std::cout << out_2.to_string("out_2") << std::endl; + + delete op; +} + +int main(int argc, const char **argv) +{ + size_t sep = 50; + + std::cout << std::string(sep, '=') << std::endl << "Tensors" << std::endl << std::string(sep, '=') << std::endl; + example_tensor(); + + std::cout << std::endl << std::string(sep, '=') << std::endl << "Fill Tensors" << std::endl << std::string(sep, '=') << std::endl; + example_fill(); + + std::cout << std::endl << std::string(sep, '=') << std::endl << "GEMM Operation" << std::endl << std::string(sep, '=') << std::endl; + example_gemm(); + + std::cout << std::endl << std::string(sep, '=') << std::endl << "Unary Operation" << std::endl << std::string(sep, '=') << std::endl; + example_unary(); + + std::cout << std::endl << std::string(sep, '=') << std::endl << "Contraction" << std::endl << 
std::string(sep, '=') << std::endl; + example_contraction(); + + std::cout << std::endl << std::string(sep, '=') << std::endl << "Contraction First & Last Touch" << std::endl << std::string(sep, '=') << std::endl; + example_contraction_first_last_touch(); + + std::cout << std::endl << std::string(sep, '=') << std::endl << "Einsum" << std::endl << std::string(sep, '=') << std::endl; + example_einsum(); + + std::cout << std::endl << std::string(sep, '=') << std::endl << "Einsum Operation" << std::endl << std::string(sep, '=') << std::endl; + example_einsum_operation(); + + return 0; +} \ No newline at end of file diff --git a/cmake-library/user-guide.md b/cmake-library/user-guide.md new file mode 100644 index 0000000..9da6443 --- /dev/null +++ b/cmake-library/user-guide.md @@ -0,0 +1,222 @@ +# CMake Library + +In this user guide, we will cover our CMake library we made from the machine learning compiler project. This library was designed to simplify the usage of our machine learning compiler and to provide an easy to use interface for users. + +## Overview + +We will guide you through the process of integrating our CMake library into your project, highlight its features, and provide an example project to demonstrate its usage. + +- [Library Usage](#library-usage) + - [Integration into CMakeLists](#integration-into-cmakelists) + - [Installing the Library](#installing-the-library) +- [Library Features](#library-features) + - [Tensor Object](#tensor-object) + - [Tensor Expressions](#tensor-expressions) + - [GEMM](#gemm) + - [Unary Operations](#unary-operations) + - [Contraction](#contraction) + - [Einsum](#einsum) +- [Example Project](#example-project) + +# Library Usage + +### Integration into CMakeLists + +To integrate our CMake library into your project you can choose between two methods: + +1. 
Directly fetch the content of this library from github and build it with your cmake: + + ```cmake + # Optional: Toggles if included libraries is build as shared or static libraries. Default is ON. + set(BUILD_SHARED_LIBS OFF) + + # Optional: Toggles if OpenMP should be used by the library. Default is ON. + set(MLC_USE_OPENMP ON) + + Include(FetchContent) + FetchContent_Declare( + MachineLearningCompiler + GIT_REPOSITORY https://github.com/Integer-Ctrl/machine-learning-compilers + GIT_TAG individual-phase + EXCLUDE_FROM_ALL + ) + FetchContent_MakeAvailable(MachineLearningCompiler) + ``` + + If needed, you can specify two CMake options: + + 1. `BUILD_SHARED_LIBS`: This option toggles if the included libraries are built as shared or static libraries. The default is `ON`, meaning shared libraries will be built. + 2. `MLC_USE_OPENMP`: This option toggles if OpenMP should be used by the library. The default is `ON`, meaning OpenMP will be used for parallelization if available. + +2. Include it from the the current machine if installed on the system: + + ```cmake + find_library(mlc::MachineLearningCompiler) + ``` + + If you want to install the library on your system, you can do this by following [Installing the Library](#installing-the-library). + +### Installing the Library + + 1. Clone the repository `git clone https://github.com/Integer-Ctrl/machine-learning-compilers.git` + 2. Navigate to the directory `cd machine-learning-compilers` + 3. Create a build directory `mkdir build && cd build` + 4. Run CMake to configure the build `cmake ..` \ + Optionally, you can specify the install directory with `cmake .. -DCMAKE_INSTALL_PREFIX=` (see [CMAKE_INSTALL_PREFIX](https://cmake.org/cmake/help/latest/variable/CMAKE_INSTALL_PREFIX.html)) + 5. Install the library `cmake --build . --target install` + + Now you can use the library in your CMake project by using the `find_library` command as shown in [Integration into CMakeLists](#integration-into-cmakelists). 
+ +## Library Features + +In this section, we will cover the features of our CMake library. The library provides a simple interface to work with tensors and tensor expressions. It supports various tensor operations such as GEMM, unary operations, contraction, and einsum. + +### Tensor Object + +The library provides a `Tensor` class that represents a multi-dimensional array of data. This class is used as the input type for all tensor operations. Since the tensor compiler only supports unit-stride tensors, meaning elements must be stored contiguously in memory without gaps, strides can not be explicitly defined. Instead, they are automatically computed based on the tensor’s dimensions. + +There are two ways to create a tensor. The first is to create a tensor with data and the suitable dimension sizes. The second is to create a tensor only by specifying the dimension sizes, which will allocate the data internally and fill it with zeros. + +```cpp +#include + +float data[] = {1, 2, 3, 4}; + +mlc::Tensor tensor({2, 3, 4}); // 3D tensor with 2 layers, 3 rows and 4 columns initialized with zeros +mlc::Tensor tensorWithData1(data, {2, 2}); // 2D tensor with specific data + +std::cout << "Tensor dimensions: " << tensor.dim_sizes << std::endl; // Dimensions of the tensor +std::cout << "Tensor strides: " << tensor.strides << std::endl; // Strides of the tensor +std::cout << tensor.to_string("Tensor") << std::endl; // String representation of the tensor +``` + +To fill a tensor with data a variety of functions are provided. 
Below are all available functions to fill a tensor with data: + +```cpp +#include + +mlc::Tensor tensor({2, 3, 4}); // 3D tensor with 2 layers, 3 rows and 4 columns initialized with zeros +size_t size = tensor.size(); + +mlc::fill_random(tensor); // Fill the tensor with random values +mlc::fill_number(tensor, 3.2); // Fill the tensor with a single number, in this case 3.2 +mlc::fill_counting_up(tensor, 0.1, 0.1); // Fill the tensor with counting up values starting from 4 and increasing by 0.1 +mlc::fill_counting_down(tensor, 5, 1); // Fill the tensor with counting down values starting from 5 reducing by 1 +mlc::fill_lambda(tensor, [&size](const mlc::Tensor &self, size_t index) { return size; }); // Fill the tensor with a user defined function, in this case the size of the tensor +``` + +### Tensor Expressions + +Next we will cover the tensor expressions which the library provides. All tensor expressions return an `mlc::Error` object which contains the result of the operation. If the operation was successful, the `type` field of the `Error` object will be set to `mlc::ErrorType::None`. If there was an error, the `type` field will contain the type of error that occurred. + +#### GEMM + +To perform a general matrix-matrix multiplication (GEMM), three tensors are required: two input tensors and one output tensor. The input tensors must have compatible dimensions for matrix multiplication, and the output tensor must have the correct dimensions to store the result. + +```cpp +#include + +mlc::Tensor in0({5, 3}); // IDs: 0,1 +mlc::Tensor in1({2, 5}); // IDs: 2,0 +mlc::Tensor out({2, 3}); // IDs: 2,1 + +mlc::Error error = mlc::gemm(in0, in1, out); +``` + +#### Unary Operations + +Our library supports three unary operations: **zero**, **identity** and **ReLU** (Rectified Linear Unit). 
**zero** receive one input tensor and produce one output tensor, while **ReLU** and **identity** receive one input tensor and and one output tensor which will be filled with the same data as the input tensor but with the ReLU or identity operation applied. + +```cpp +#include + +mlc::Tensor in({2, 2}); +mlc::Tensor out({2, 2}); + +mlc::Error error = mlc::unary_zero(in); +mlc::Error error = mlc::unary_identity(in, out); +mlc::Error error = mlc::unary_relu(in, out); +``` + +#### Contraction + +To get more advanced, lets look at the contraction operation. This operation allows you to perform a contraction of two tensors based on a user defined expression. The expression defines which dimensions of the input tensors are contracted (reduce dimensions) and which dimensions are retained (output dimensions) in the output tensor. + +```cpp +#include + +mlc::Tensor in0({5, 4, 3}); // IDs: 0,1,2 +mlc::Tensor in1({5, 2, 4}); // IDs: 3,4,1 +mlc::Tensor out({5, 5, 2, 3}); // IDs: 0,3,4, + +mlc::Error error = mlc::contraction(in0, in1, out, "[0,1,2],[3,4,1]->[0,3,4,2]"); +``` + +In the example above, the contraction operation takes two input tensors `in0` and `in1`, and produces an output tensor `out`. The expression `"[0,1,2],[3,4,1]->[0,3,4,2]"` defines that the dimensions with IDs `0`, `2`, `3`and `4` are retained in the output tensor, while the dimensions with IDs `1` is contracted. The output tensor will have the dimensions `[5, 5, 2, 3]`. + +To further advance the contraction operation, a first touch primitive and a last touch primitive can be specified. The first touch primitive is applied to the output tensor before the contraction operation, while the last touch primitive is applied to the output tensor after the contraction operation. The supported primitives are `mlc::UnaryType::None`, `mlc::UnaryType::Zero`, `mlc::UnaryType::Identity` and `mlc::UnaryType::ReLu`. 
+ +```cpp +#include + +mlc::Tensor in0({5, 4, 3}); // IDs: 0,1,2 +mlc::Tensor in1({5, 2, 4}); // IDs: 3,4,1 +mlc::Tensor out({5, 5, 2, 3}); // IDs: 0,3,4,2 + +mlc::Error error = mlc::contraction(in0, in1, out, "[0,1,2],[3,4,1]->[0,3,4,2]", mlc::UnaryType::None, mlc::UnaryType::ReLU); +``` + +In the example above, the first touch primitive is set to `mlc::UnaryType::None`, meaning that no operation is applied to the output tensor before the contraction operation. The last touch primitive is set to `mlc::UnaryType::ReLU`, meaning that the **ReLU** operation is applied to the output tensor after the contraction operation. + +#### Einsum + +The last operation we will cover is the einsum operation, better known as **Einsum Tree**. This operation allows you to perform a contraction of multiple tensors based on a user defined expression. The expression defines which dimensions of the input tensors are contracted (reduce dimensions) and which dimensions are retained (output dimensions) in the output tensor. The expression is similar to the one used in the contraction operation, but it can handle multiple input tensors and a single output tensor. This allows you to perform multiple contractions in a single operation. + +```cpp +#include + +mlc::Tensor in0({5, 3}); // IDs: 0,1 +mlc::Tensor in1({2, 5}); // IDs: 2,0 +mlc::Tensor in2({3, 7}); // IDs: 1,3 +mlc::Tensor out({2, 7}); // IDs: 2,3 + +mlc::Error error = mlc::einsum({in0, in1, in2}, out, "[[0,1],[2,0]->[2,1]],[1,3]->[2,3]"); +``` + +The example above shows a einsum tree with three input tensors (leafs), one output tensor (root) and two contraction operations. The first contraction operation is defined by the expression `[[0,1],[2,0]->[2,1]]`, using the first two input tensors `in0`and `in1`. The second contraction operation uses the intermediate output of the first contraction and the third input tensor `in2`, defined by the expression `[2,1]],[1,3]->[2,3]`. 
+ +Einsum trees can be increase in complexity very quickly, so jitting the expression every time can create an overhead. To avoid this it is possible to create a einsum tree once and reuse it. Therefore, the library provides the function `mlc::einsum_operation` which receives the shapes of the input tensors and the output tensor, as well as the expression. This function returns an `mlc::TensorOperation` object which can be used to execute the einsum tree multiple times with different input tensors. + +```cpp +#include + +mlc::Tensor in0({5, 3}); // IDs: 0,1 +mlc::Tensor in1({2, 5}); // IDs: 2,0 +mlc::Tensor in2({3, 7}); // IDs: 1,3 +mlc::Tensor out({2, 7}); // IDs: 2,3 + +mlc::Tensor in0_2(in0.dim_sizes); // IDs: 0,1 +mlc::Tensor in1_2(in1.dim_sizes); // IDs: 2,0 +mlc::Tensor in2_2(in2.dim_sizes); // IDs: 1,3 +mlc::Tensor out_2(out.dim_sizes); // IDs: 2,3 + +// Generates a tensor operation with fixed input and ouput tensor shapes. +mlc::TensorOperation *op = mlc::einsum_operation({in0.dim_sizes, in1.dim_sizes, in2.dim_sizes}, out.dim_sizes, "[[0,1],[2,0]->[2,1]],[1,3]->[2,3]"); + +// Process any error that may occurs during the setup of the operation. +mlc::Error error = op->getSetupError(); + +// Execute the operation. +error = op->execute({in0, in1, in2}, out); + +// Execute the operation again but on different tensors of the same size. +error = op->execute({in0_2, in1_2, in2_2}, out_2); + +delete op; // Don't forget to delete the operation object after you are done with it. +``` + +**Important**: Don't forget to delete the `mlc::TensorOperation` object after you are done with it to avoid memory leaks. + +## Example Project + +To demonstrate the usage of our CMake library, we have created an example project. This project showcases the features which we introduced in the previous section. You can find the example project in the `cmake-library/example-project` directory. 
There you can have a look at the `CMakeLists.txt` file and the `Example.cpp` file which contains the example code. diff --git a/docs_sphinx/getting_started/building_project.rst b/docs_sphinx/getting_started/building_project.rst index 5aa6ead..41b149c 100644 --- a/docs_sphinx/getting_started/building_project.rst +++ b/docs_sphinx/getting_started/building_project.rst @@ -120,6 +120,8 @@ Building 6. Now we can build the project. The most desired command might be .. code-block:: bash + + cmake --build . --target tests Options for ``--target`` are **benchmarks** and **tests** diff --git a/include/MachineLearningCompiler/Error.h b/include/MachineLearningCompiler/Error.h new file mode 100644 index 0000000..a70fea7 --- /dev/null +++ b/include/MachineLearningCompiler/Error.h @@ -0,0 +1,59 @@ +#ifndef MLC_ERROR_H +#define MLC_ERROR_H +#include +#include + +namespace mlc +{ + enum class ErrorType : int64_t + { + Undefined = -1, + None = 0, + + // Parse Errors + ParseExpectedLeftBracket = 1, + ParseExpectedRightBracket = 2, + ParseExpectedArrow = 3, + ParseExpectedComma = 4, + ParseExpectedDimensionList = 5, + ParseNotAllowedToParseAgain = 6, + ParseUndefinedNode = 7, + + // Einsum Errors + EinsumInvalidRoot = 8, + EinsumNotEnoughInputTensors = 9, + EinsumTooManyInputTensors = 10, + EinsumNullPtrAsInputTensor = 11, + + // Execute Errors + ExecuteWrongDType = 101, + ExecuteWrongDimension = 102, + ExecuteWrongPrimitive = 103, + ExecuteFirstTouchPrimitive = 104, + ExecuteWrongFirstTouchPrimitive = 104, + ExecuteWrongMainPrimitive = 105, + ExecuteWrongLastTouchPrimitive = 106, + ExecuteTypeNotSupported = 107, + ExecuteInvalidPrimitiveConfiguration = 108, + ExecuteInvalidFirstTouchConfiguration = 109, + ExecuteInvalidMainConfiguration = 110, + ExecuteInvalidLastTouchConfiguration = 111, + ExecuteInvalidExecutionOrder = 112, + ExecuteInvalidStrides = 113, + ExecuteKDimensionMustNotBeShared = 114, + ExecuteSharedRequiredForParallelExecution = 115, + + // Tensor Errors + 
TensorExpected2DTensor = 201, + ExpectedSingleContraction = 202, + }; + + struct Error + { + ErrorType type; + std::string message; + }; + +} // namespace mlc + +#endif // MLC_ERROR_H \ No newline at end of file diff --git a/include/MachineLearningCompiler/Tensor.h b/include/MachineLearningCompiler/Tensor.h index d70236c..ca1c9a4 100644 --- a/include/MachineLearningCompiler/Tensor.h +++ b/include/MachineLearningCompiler/Tensor.h @@ -1,7 +1,9 @@ #ifndef MLC_TENSOR_H #define MLC_TENSOR_H - +#include "Error.h" +#include "UnaryType.h" #include +#include #include #include @@ -9,10 +11,118 @@ namespace mlc { struct Tensor { - float *data; + bool ownsData = false; + float *data = nullptr; std::vector dim_sizes; + std::vector strides; + + // deletes the default constructor + Tensor() = delete; + + /** + * @brief Construct a new Tensor with with a pointer to memory and the dimension sizes sorted in by stride in descending order. + * + * @param data The pointer to the data array. + * @param dim_sizes The dimension sizes sorted by stride in descending order. + */ + Tensor(float *data, const std::vector &dim_sizes) : data(data), dim_sizes(dim_sizes) + { + strides.resize(dim_sizes.size()); + if (!dim_sizes.empty()) + { + strides[dim_sizes.size() - 1] = 1; + for (size_t i = dim_sizes.size() - 1; i > 0; --i) + { + strides[i - 1] = strides[i] * dim_sizes[i]; + } + } + }; + + /** + * @brief Construct a new Tensor with the dimension sizes sorted by stride in descending order. + * + * @param dim_sizes The dimension sizes sorted by stride in descending order. 
+ */ + Tensor(const std::vector &dim_sizes) : dim_sizes(dim_sizes) + { + uint64_t size = 1; + for (auto dim : dim_sizes) + { + size *= dim; + } + data = new float[size]{0}; + ownsData = true; + + strides.resize(dim_sizes.size()); + if (!dim_sizes.empty()) + { + strides[dim_sizes.size() - 1] = 1; + for (size_t i = dim_sizes.size() - 1; i > 0; --i) + { + strides[i - 1] = strides[i] * dim_sizes[i]; + } + } + }; - inline Tensor(float *data, const std::vector &dim_sizes) : data(data), dim_sizes(dim_sizes) {}; + /** + * @brief Destroys the tensor. + */ + ~Tensor() + { + if (ownsData && data != nullptr) + { + delete[] data; + data = nullptr; + } + } + + /** + * @brief Converts the tensor into its string representation. + * + * @param name Name of the tensor that is printed + * @return std::string The string representation of the tensor. + */ + std::string to_string(std::string name = "tensor"); + + /** + * @brief Returns the number of elements the tensor has. + * + * @return uint64_t The number of elements in the tensor. + */ + uint64_t size(); + }; + + class TensorOperation + { + public: + virtual ~TensorOperation() + { + } + + /** + * @brief Executes the setup einsum expression with input tensor of the same size. + * + * @param inputs The inputs to be einsum calculation. + * @param output The output of the einsum calculation. + * @return Error The error code or ErrorType::None on success. + */ + virtual Error execute(const std::vector> &inputs, Tensor &output) = 0; + + /** + * @brief Executes the setup einsum expression with input tensor of the same size. + * + * @param inputs The inputs to be einsum calculation. + * @param output The output of the einsum calculation. + * @return Error The error code or ErrorType::None on success. + */ + virtual Error execute(const std::vector &inputs, Tensor &output) = 0; + + /** + * @brief Gets the error that was produces during the setup of the tree. + * + * @return Error The error code or ErrorType::None on success. 
+ */ + virtual Error getSetupError() const = 0; }; /** @@ -22,15 +132,132 @@ namespace mlc */ void fill_random(Tensor &tensor); + /** + * @brief Fills the tensor with the given number. + * + * @param tensor The tensor to fill. + * @param number The number used to fill the tensor. + */ + void fill_number(Tensor &tensor, float number); + + /** + * @brief Fills the tensor with counting upwards numbers. + * + * @param tensor The tensor to fill. + * @param start The number to start counting from. + * @param step The amount to increase everytime. + */ + void fill_counting_up(Tensor &tensor, float start, float step); + + /** + * @brief Fills the tensor with counting downwards numbers. + * + * @param tensor The tensor to fill. + * @param start The number to start counting from. + * @param step The amount to decrease everytime. + */ + void fill_counting_down(Tensor &tensor, float start, float step); + + /** + * @brief Fills the tensor based on the given function. + * + * @param tensor The tensor to fill. + * @param function The function that gets the current tensor and the current index of the tensor as input. + * index = index0 * stride0 + index1 * stride1 + ... + indexN * strideN. + */ + void fill_lambda(Tensor &tensor, std::function function); + + /** + * @brief + * + * @param inputs The input tensors. + * @param output The output tensor. + * @param tree The (nested) einsum tree to contract in the format [in0],[in1]->[out]. + * @return Error The error code or ErrorType::None on success. + */ + Error einsum(const std::vector> &inputs, Tensor &output, const std::string &tree); + /** * @brief Executes contractions based on the given tree. * * @param inputs The input tensors. * @param output The output tensor. + * @param tree The (nested) einsum tree to contract in the format [in0],[in1]->[out]. + * @return Error The error code or ErrorType::None on success. 
+ */ + Error einsum(const std::vector &inputs, Tensor &output, const std::string &tree); + + /** + * @brief Sets up the einsum tree for contraction based on the given tensor dimensions and tree. + * + * @param inputs The input tensors shapes. + * @param output The output tensor shape. * @param tree The einsum tree to contract in the format [in0],[in1]->[out]. */ - void einsum(const std::vector &inputs, Tensor &output, const std::string &tree); + TensorOperation *einsum_operation(const std::vector> &inputs, const std::vector &output, + const std::string &tree); + + /** + * @brief Perform a binary contraction and adds it to the output. + * + * @param input0 The first input tensor. + * @param input1 The second input tensor. + * @param output The output to add the result to. + * @param contraction The string to show the dimension to be contracted in the format [in0],[in1]->[out]. + * @return Error The error code or ErrorType::None on success. + */ + Error contraction(const Tensor &input0, const Tensor &input1, Tensor &output, const std::string &contraction); + + /** + * @brief Performs a contraction on two input tensor and one output tensor. Before and after the contraction, a first touch unary and a + * last touch unary are applied to the output tensor. + * + * @param input0 The first input tensor. + * @param input1 The second input tensor. + * @param output The output to add the result to. + * @param contraction The string to show the dimension to be contracted in the format [in0],[in1]->[out]. + * @param firstTouch The unary that should be execute before the contraction. + * @param lastTouch The unary that should be executed after the contraction. + * @return Error The error code or ErrorType::None on success. 
+ */ + Error contraction(const Tensor &input0, const Tensor &input1, Tensor &output, const std::string &contraction, const UnaryType firstTouch, + const UnaryType lastTouch); + /** + * @brief Perform a general matrix-matrix multiplication and adds it to the output. + * + * @param input0 The first input tensor in the form MxK where M is the leading dimension. + * @param input1 The second input tensor in the form KxN where K is the leading dimension. + * @param output The output to add the result to in the form MxN where M is the leading dimension. + * @return Error The error code or ErrorType::None on success. + */ + Error gemm(const Tensor &input0, const Tensor &input1, Tensor &output); + + /** + * @brief Performs a zero unary that sets the output tensor to zero. + * + * @param input The input tensor. + * @return Error The error code or ErrorType::None on success. + */ + Error unary_zero(Tensor &input); + + /** + * @brief Performs a relu unary that applies Rectified Linear Unit on the tensor input. + * + * @param input The input tensor. + * @param output The ouput tensor. + * @return Error The error code or ErrorType::None on success. + */ + Error unary_relu(const Tensor &input, Tensor &output); + + /** + * @brief Performs a identity unary that copies the input tensor to the output tensor + * + * @param input The input tensor. + * @param output The output tensor. + * @return Error The error code or ErrorType::None on success. 
+ */ + Error unary_identity(const Tensor &input, Tensor &output); } // namespace mlc #endif // MLC_TENSOR \ No newline at end of file diff --git a/include/MachineLearningCompiler/UnaryType.h b/include/MachineLearningCompiler/UnaryType.h new file mode 100644 index 0000000..63f798c --- /dev/null +++ b/include/MachineLearningCompiler/UnaryType.h @@ -0,0 +1,16 @@ +#ifndef MLC_UNARY_H +#define MLC_UNARY_H +#include + +namespace mlc +{ + enum class UnaryType : int64_t + { + None = 0, + Zero = 1, + ReLU = 2, + Identity = 3, + }; +} // namespace mlc + +#endif // MLC_UNARY_H \ No newline at end of file diff --git a/src/interface/Contraction.cpp b/src/interface/Contraction.cpp new file mode 100644 index 0000000..7f826dc --- /dev/null +++ b/src/interface/Contraction.cpp @@ -0,0 +1,46 @@ +#include "../../include/MachineLearningCompiler/Tensor.h" +#include "../main/EinsumTree.h" +#include "../main/TensorOperation.h" +#include "Einsum.h" +#include "TensorUtils.h" + +mlc::Error mlc::contraction(const Tensor &input0, const Tensor &input1, Tensor &output, const std::string &contraction) +{ + return internal::einsum>({input0, input1}, output, contraction); +} + +mlc::Error mlc::contraction(const Tensor &input0, const Tensor &input1, Tensor &output, const std::string &contraction, + const UnaryType firstTouch, const UnaryType lastTouch) +{ + mini_jit::EinsumTree einsumTree(contraction); + mini_jit::EinsumTree::ErrorParse errorParse = einsumTree.parse_tree(); + if (errorParse != mini_jit::EinsumTree::ErrorParse::None) + { + mlc::ErrorType type = internal::convertParseError(errorParse); + return {type, "Failed during parsing the given einsum tree."}; + } + if (einsumTree.get_root()->left->type != mini_jit::EinsumTree::NodeType::Leaf || + einsumTree.get_root()->right->type != mini_jit::EinsumTree::NodeType::Leaf) + { + return {mlc::ErrorType::ExpectedSingleContraction, "Expected the given einsum string to be a single string."}; + } + + std::vector sorted_dim_sizes; + 
internal::get_sorted_dimensions_sizes(einsumTree.get_root(), {input0, input1}, sorted_dim_sizes); + einsumTree.set_sorted_dim_sizes(sorted_dim_sizes); + + mini_jit::TensorOperation op; + mini_jit::TensorConfig config = einsumTree.lower_node(einsumTree.get_root()); + config.first_touch = internal::convertPrimitiveType(firstTouch); + config.last_touch = internal::convertPrimitiveType(lastTouch); + + mini_jit::TensorOperation::error_t error = op.setup(config); + mlc::ErrorType errorType = internal::convertTensorOperationError(error); + if (errorType != mlc::ErrorType::None) + { + return {errorType, "Could not generate the kernels for the gemm operation."}; + } + + op.execute(input0.data, input1.data, output.data); + return {ErrorType::None, "Success"}; +} \ No newline at end of file diff --git a/src/interface/Einsum.cpp b/src/interface/Einsum.cpp new file mode 100644 index 0000000..f7e964f --- /dev/null +++ b/src/interface/Einsum.cpp @@ -0,0 +1,87 @@ +#include "Einsum.h" +#include "../../include/MachineLearningCompiler/Tensor.h" +#include "../main/EinsumTree.h" +#include "utility" + +mlc::Error mlc::einsum(const std::vector> &inputs, Tensor &output, const std::string &tree) +{ + return internal::einsum>(inputs, output, tree); +} + +mlc::Error mlc::einsum(const std::vector &inputs, Tensor &output, const std::string &tree) +{ + return internal::einsum(inputs, output, tree); +} + +mlc::EinsumOperation::EinsumOperation(const std::vector> &inputs, Tensor &, const std::string &tree) + : einsumTree(tree) +{ + mini_jit::EinsumTree::ErrorParse errorParse = einsumTree.parse_tree(); + if (errorParse != mini_jit::EinsumTree::ErrorParse::None) + { + mlc::ErrorType type = internal::convertParseError(errorParse); + error = {type, "Failed to parse the tree."}; + } + + std::vector sorted_dim_sizes; + internal::get_sorted_dimensions_sizes>(einsumTree.get_root(), inputs, sorted_dim_sizes); + einsumTree.set_sorted_dim_sizes(sorted_dim_sizes); + + error = {mlc::ErrorType::None, 
"Success"}; +} + +mlc::Error mlc::EinsumOperation::getSetupError() const +{ + return error; +} + +mlc::Error mlc::EinsumOperation::execute(const std::vector> &inputs, Tensor &output) +{ + if (error.type != ErrorType::None) + { + return error; + } + + Error checkError = hasSameDimensions>(inputs); + if (checkError.type != ErrorType::None) + { + return checkError; + } + + return execute>(inputs, output); +} + +mlc::Error mlc::EinsumOperation::execute(const std::vector &inputs, Tensor &output) +{ + if (error.type != ErrorType::None) + { + return error; + } + + Error checkError = hasSameDimensions(inputs); + if (checkError.type != ErrorType::None) + { + return checkError; + } + + return execute(inputs, output); +} + +mlc::TensorOperation *mlc::einsum_operation(const std::vector> &inputs, const std::vector &output, + const std::string &tree) +{ + std::vector rawTensor; + std::vector> inputTensors; + rawTensor.reserve(inputs.size()); + inputTensors.reserve(inputs.size()); + for (const auto &shape : inputs) + { + // Create a dummy tensor with the given shape + rawTensor.emplace_back(nullptr, shape); + inputTensors.push_back(rawTensor.back()); + } + + Tensor outputTensor(output); + EinsumOperation *operation = new EinsumOperation(inputTensors, outputTensor, tree); + return operation; +} \ No newline at end of file diff --git a/src/interface/Einsum.h b/src/interface/Einsum.h new file mode 100644 index 0000000..a10a7d2 --- /dev/null +++ b/src/interface/Einsum.h @@ -0,0 +1,151 @@ +#ifndef MLC_EINSUM_H +#define MLC_EINSUM_H + +#include "../../include/MachineLearningCompiler/Tensor.h" +#include "../main/EinsumTree.h" +#include "TensorUtils.h" +#include + +namespace mlc +{ + namespace internal + { + /** + * @brief Executes the einsum expression with the given inputs to output based on the given einsum tree. + * + * @tparam T The type how the input tensors are passed to the einsum expression. + * @param inputs All inputs of the einsum expression. 
+ * @param output The single output tensor of the einsum calculation. + * @param tree The tree how two tensors are contracted. + * @return mlc::Error The error code or ErrorType::None on success. + */ + template mlc::Error einsum(const std::vector &inputs, mlc::Tensor &output, const std::string &tree) + { + mini_jit::EinsumTree einsumTree(tree); + mini_jit::EinsumTree::ErrorParse errorParse = einsumTree.parse_tree(); + if (errorParse != mini_jit::EinsumTree::ErrorParse::None) + { + mlc::ErrorType type = convertParseError(errorParse); + return {type, "Failed during parsing the given einsum tree."}; + } + + std::vector sorted_dim_sizes; + get_sorted_dimensions_sizes(einsumTree.get_root(), inputs, sorted_dim_sizes); + einsumTree.set_sorted_dim_sizes(sorted_dim_sizes); + + std::vector tensors(inputs.size() + 1); + for (size_t i = 0; i < inputs.size(); i++) + { + tensors[i] = getTensor(inputs[i])->data; + assert(tensors[i] != nullptr); + } + tensors[inputs.size()] = output.data; + + mini_jit::EinsumTree::ErrorExecute errorExecute = einsumTree.execute(tensors); + if (errorExecute != mini_jit::EinsumTree::ErrorExecute::None) + { + mlc::ErrorType type = convertErrorExecute(errorExecute); + return {type, "Failed during calculation of the einsum tree."}; + } + + return {mlc::ErrorType::None, "Success"}; + } + } // namespace internal + + class EinsumOperation : public TensorOperation + { + public: + EinsumOperation(const std::vector> &inputs, Tensor &output, const std::string &tree); + + //! @copydoc mlc::TensorOperation::execute(const std::vector> &, Tensor &) + virtual Error execute(const std::vector> &inputs, Tensor &output) override; + virtual Error execute(const std::vector &inputs, Tensor &output) override; + virtual Error getSetupError() const override; + + private: + /** + * @brief Executes the Einsum operation with the given inputs and output tensor. 
+ */ + template Error execute(const std::vector &inputs, Tensor &output); + template Error hasSameDimensions(const std::vector &inputs); + + Error error; + mini_jit::EinsumTree einsumTree; + }; + + template inline Error EinsumOperation::execute(const std::vector &inputs, Tensor &output) + { + std::vector tensors(inputs.size() + 1); + for (size_t i = 0; i < inputs.size(); i++) + { + tensors[i] = internal::getTensor(inputs[i])->data; + } + tensors[inputs.size()] = output.data; + + mini_jit::EinsumTree::ErrorExecute errorExecute = einsumTree.execute(tensors); + if (errorExecute != mini_jit::EinsumTree::ErrorExecute::None) + { + mlc::ErrorType type = internal::convertErrorExecute(errorExecute); + return {type, "Failed to execute the einsum operation."}; + } + + return {mlc::ErrorType::None, "Success"}; + } + + template inline Error EinsumOperation::hasSameDimensions(const std::vector &inputs) + { + std::vector nodesToProcess = {einsumTree.get_root()}; + auto &sortedDimSizes = einsumTree.get_sorted_dim_sizes(); + uint32_t processedInputs = 0; + while (nodesToProcess.size() > 0) + { + mini_jit::EinsumTree::EinsumNode *node = nodesToProcess.back(); + nodesToProcess.pop_back(); + + if (node->type == mini_jit::EinsumTree::NodeType::Leaf) + { + if (!(node->input_tensor_index < static_cast(inputs.size()))) + { + return {ErrorType::EinsumTooManyInputTensors, "The was more input tensors than the original setup used."}; + } + + const Tensor *tensor = internal::getTensor(inputs[node->input_tensor_index]); + + if (tensor->dim_sizes.size() != node->output_dim_ids.size()) + { + return {ErrorType::ExecuteWrongDimension, "The count of dimensions do not match."}; + } + + for (size_t i = 0; i < node->output_dim_ids.size(); i++) + { + if (tensor->dim_sizes[i] != static_cast(sortedDimSizes[node->output_dim_ids[i]])) + { + return {ErrorType::ExecuteWrongDimension, + "The input tensor dimension has a different size than the size than the tensor it was setup up with."}; + } + } + + 
processedInputs++; + continue; + } + + if (node->left != nullptr) + { + nodesToProcess.push_back(node->left); + } + + if (node->right != nullptr) + { + nodesToProcess.push_back(node->right); + } + } + + if (processedInputs < inputs.size()) + { + return {mlc::ErrorType::EinsumNotEnoughInputTensors, "There was less input tensors than the original setups used."}; + } + + return {mlc::ErrorType::None, "Success"}; + } +} // namespace mlc +#endif // MLC_EINSUM_H \ No newline at end of file diff --git a/src/interface/Gemm.cpp b/src/interface/Gemm.cpp new file mode 100644 index 0000000..197183d --- /dev/null +++ b/src/interface/Gemm.cpp @@ -0,0 +1,54 @@ +#include "../../include/MachineLearningCompiler/Tensor.h" +#include "../main/TensorOperation.h" +#include "TensorUtils.h" + +mlc::Error mlc::gemm(const Tensor &input0, const Tensor &input1, Tensor &output) +{ + if (input0.dim_sizes.size() != 2 || input1.dim_sizes.size() != 2 || output.dim_sizes.size() != 2) + { + return {ErrorType::TensorExpected2DTensor, "GEMM requires input0 and input1 to be 2D tensors and output to be a 2D tensor."}; + } + + int64_t mSize = static_cast(input0.dim_sizes[1]); + int64_t nSize = static_cast(input1.dim_sizes[0]); + int64_t kSize = static_cast(input0.dim_sizes[0]); + + if (static_cast(output.dim_sizes[1]) != mSize) + { + return {ErrorType::ExecuteWrongDimension, "Expected the output tensor to have the same m dimension size as the input0."}; + } + + if (static_cast(output.dim_sizes[0]) != nSize) + { + return {ErrorType::ExecuteWrongDimension, "Expected the output tensor to have the same n dimension size as the input1."}; + } + + if (static_cast(input1.dim_sizes[1]) != kSize) + { + return {ErrorType::ExecuteWrongDimension, "Expected the input1 tensor to have the same k dimension size as the input0."}; + } + + mini_jit::TensorOperation op; + mini_jit::TensorConfig config{ + mini_jit::TensorConfig::prim_t::none, // first_touch + mini_jit::TensorConfig::prim_t::gemm, // main + 
mini_jit::TensorConfig::prim_t::none, // last touch + {mini_jit::TensorConfig::dim_t::m, mini_jit::TensorConfig::dim_t::n, mini_jit::TensorConfig::dim_t::k}, // dim_types + {mini_jit::TensorConfig::exec_t::prim, mini_jit::TensorConfig::exec_t::prim, mini_jit::TensorConfig::exec_t::prim}, // exec_types + {mSize, nSize, kSize}, // dim_sizes + {1, 0, mSize}, // strides_in0 + {0, kSize, 1}, // strides_in1 + {1, mSize, 0}, // strides_out + mini_jit::TensorConfig::dtype_t::fp32, // dtype_t + }; + + mini_jit::TensorOperation::error_t error = op.setup(config); + mlc::ErrorType errorType = internal::convertTensorOperationError(error); + if (errorType != mlc::ErrorType::None) + { + return {errorType, "Could not generate the kernels for the gemm operation."}; + } + + op.execute(input0.data, input1.data, output.data); + return {ErrorType::None, "Success"}; +} \ No newline at end of file diff --git a/src/interface/Tensor.cpp b/src/interface/Tensor.cpp index 67a7eea..0de161b 100644 --- a/src/interface/Tensor.cpp +++ b/src/interface/Tensor.cpp @@ -1,5 +1,5 @@ #include "../../include/MachineLearningCompiler/Tensor.h" -#include "../main/EinsumTree.h" +#include "../main/TensorOperation.h" #include "TensorUtils.h" #include @@ -10,12 +10,11 @@ void mlc::fill_random(Tensor &tensor) return; } - uint64_t size = 1; - for (auto dim : tensor.dim_sizes) - { - size *= dim; - } + uint64_t size = internal::getTensorSize(&tensor); +#ifdef MLC_USE_OPENMP +#pragma omp parallel for +#endif for (size_t i = 0; i < size; ++i) { float denominator = 1; @@ -34,22 +33,135 @@ void mlc::fill_random(Tensor &tensor) } } -void mlc::einsum(const std::vector &inputs, Tensor &output, const std::string &tree) +void mlc::fill_number(Tensor &tensor, float number) { - mini_jit::EinsumTree einsumTree(tree); - mini_jit::EinsumTree::ErrorParse errorParse = einsumTree.parse_tree(); - (void)(errorParse); + if (tensor.dim_sizes.size() == 0) + { + return; + } - std::vector sorted_dim_sizes; - 
::get_sorted_dimensions_sizes(einsumTree.get_root(), inputs, sorted_dim_sizes); - einsumTree.set_sorted_dim_sizes(sorted_dim_sizes); + uint64_t size = internal::getTensorSize(&tensor); + +#ifdef MLC_USE_OPENMP +#pragma omp parallel for +#endif + for (size_t i = 0; i < size; i++) + { + tensor.data[i] = number; + } +} - std::vector tensors(inputs.size() + 1); - for (size_t i = 0; i < inputs.size(); i++) +void mlc::fill_counting_up(Tensor &tensor, float start, float step) +{ + if (tensor.dim_sizes.size() == 0) { - tensors[i] = inputs[i].data; + return; } - tensors[inputs.size()] = output.data; - einsumTree.execute(tensors); + int64_t size = internal::getTensorSize(&tensor); + +#ifdef MLC_USE_OPENMP +#pragma omp parallel for +#endif + for (int64_t i = 0; i < size; i++) + { + tensor.data[i] = start + i * step; + } +} + +void mlc::fill_counting_down(Tensor &tensor, float start, float step) +{ + if (tensor.dim_sizes.size() == 0) + { + return; + } + + int64_t size = internal::getTensorSize(&tensor); + +#ifdef MLC_USE_OPENMP +#pragma omp parallel for +#endif + for (int64_t i = 0; i < size; i++) + { + tensor.data[i] = start - i * step; + } +} + +void mlc::fill_lambda(Tensor &tensor, std::function function) +{ + if (tensor.dim_sizes.size() == 0) + { + return; + } + + uint64_t size = internal::getTensorSize(&tensor); + +#ifdef MLC_USE_OPENMP +#pragma omp parallel for +#endif + for (size_t i = 0; i < size; i++) + { + tensor.data[i] = function(tensor, i); + } +} + +void mlc::internal::tensor_dim_to_string(mlc::Tensor *tensor, std::string &str, size_t dim, size_t offset, std::string indent) +{ + if (dim == tensor->dim_sizes.size() - 1) + { + str += "["; + for (size_t i = 0; i < tensor->dim_sizes[dim]; ++i) + { + if (i > 0) + { + str += ", "; + } + if (tensor->data == nullptr) + { + str += "-"; + } + else + { + str += std::to_string(tensor->data[offset + i]); + } + } + str += "]"; + } + else + { + str += "["; + indent += " "; + + for (size_t i = 0; i < tensor->dim_sizes[dim]; ++i) 
+ { + if (i > 0) + { + str += ",\n" + indent; + } + + tensor_dim_to_string(tensor, str, dim + 1, offset + i * tensor->strides[dim], indent); + } + str += "]"; + } +} + +std::string mlc::Tensor::to_string(std::string name) +{ + std::string str; + str += name + "(\n"; + if (dim_sizes.empty()) + { + str += "[]"; + } + else + { + internal::tensor_dim_to_string(this, str, 0, 0, ""); + } + str += ")"; + return str; +} + +uint64_t mlc::Tensor::size() +{ + return internal::getTensorSize(this); } diff --git a/src/interface/TensorUtils.h b/src/interface/TensorUtils.h index ebbc3dd..d35dc20 100644 --- a/src/interface/TensorUtils.h +++ b/src/interface/TensorUtils.h @@ -1,43 +1,299 @@ +#ifndef MLC_TENSORUTILS_H +#define MLC_TENSORUTILS_H #include "../../include/MachineLearningCompiler/Tensor.h" #include "../main/EinsumTree.h" +#include "../main/release_assert.h" +#include +#include +#include #include +#include -constexpr void get_sorted_dimensions_sizes(const mini_jit::EinsumTree::EinsumNode *root, const std::vector &inputs, - std::vector &sorted_dim_sizes) +namespace mlc { - if (root->left != nullptr) + namespace internal { - if (root->left->type == mini_jit::EinsumTree::NodeType::Leaf) + /** + * @brief Function definition for converting a generic type to a pointer to a mlc::Tensor. + * + * @param T The type to convert. + * @return mlc::Tensor nullptr as it should not be possible to get here. + */ + template constexpr const mlc::Tensor *getTensor(const T &) { - const auto &dim_sizes = inputs[root->left->input_tensor_index].dim_sizes; - uint i = 0; - for (int64_t id : root->left->output_dim_ids) + static_assert(false, "No generic conversion of tensor possible."); + release_assert(false, "No generic conversion of tensor possible."); + return nullptr; + } + + /** + * @brief Gets the pointer to the mlc::Tensor. + * + * @param tensor The tensor to get the pointer from. + * @return Pointer to the mlc::Tensor. 
+ */ + template <> constexpr const mlc::Tensor *getTensor(mlc::Tensor *const &tensor) + { + return tensor; + } + + /** + * @brief Gets the pointer to the mlc::Tensor. + * + * @param tensor The tensor to get the pointer from. + * @return Pointer to the mlc::Tensor. + */ + template <> constexpr const mlc::Tensor *getTensor(const mlc::Tensor *const &tensor) + { + return tensor; + } + + /** + * @brief Gets the pointer to the mlc::Tensor. + * + * @param tensor The tensor to get the pointer from. + * @return Pointer to the mlc::Tensor. + */ + template <> + constexpr const mlc::Tensor * + getTensor>(const std::reference_wrapper &tensor) + { + return &(tensor.get()); + } + + /** + * @brief Get the dim sizes of the input tensors in increased order of their dimension ids. + * + * @tparam T The type of the input tensors, either mlc::Tensor* or std::reference_wrapper. + * @param root The root of the EinsumNode tree. + * @param inputs The input tensors. + * @param sorted_dim_sizes The vector to store the sorted dimension sizes. 
+ */ + template + constexpr void get_sorted_dimensions_sizes(const mini_jit::EinsumTree::EinsumNode *root, const std::vector &inputs, + std::vector &sorted_dim_sizes) + { + if (root->left != nullptr) { - sorted_dim_sizes.resize(std::max(static_cast(sorted_dim_sizes.size()), id + 1)); - sorted_dim_sizes[id] = dim_sizes[i++]; + if (root->left->type == mini_jit::EinsumTree::NodeType::Leaf) + { + const auto &dim_sizes = getTensor(inputs[root->left->input_tensor_index])->dim_sizes; + uint i = 0; + for (int64_t id : root->left->output_dim_ids) + { + sorted_dim_sizes.resize(std::max(static_cast(sorted_dim_sizes.size()), id + 1)); + sorted_dim_sizes[id] = dim_sizes[i++]; + } + } + else + { + get_sorted_dimensions_sizes(root->left, inputs, sorted_dim_sizes); + } } + + if (root->right != nullptr) + { + if (root->right->type == mini_jit::EinsumTree::NodeType::Leaf) + { + const auto &dim_sizes = getTensor(inputs[root->right->input_tensor_index])->dim_sizes; + uint i = 0; + for (int64_t id : root->right->output_dim_ids) + { + sorted_dim_sizes.resize(std::max(static_cast(sorted_dim_sizes.size()), id + 1)); + sorted_dim_sizes[id] = dim_sizes[i++]; + } + } + else + { + get_sorted_dimensions_sizes(root->right, inputs, sorted_dim_sizes); + } + } + } + + /** + * @brief Get the dim sizes of the input tensors in increased order of their dimension ids. + * + * @param root The root of the EinsumNode tree. + * @param inputs The input tensors. + * @param sorted_dim_sizes The vector to store the sorted dimension sizes. + */ + constexpr void get_sorted_dimensions_sizes(const mini_jit::EinsumTree::EinsumNode *root, + const std::vector> &inputs, + std::vector &sorted_dim_sizes) + { + get_sorted_dimensions_sizes>(root, inputs, sorted_dim_sizes); } - else + + /** + * @brief Get the dim sizes of the input tensors in increased order of their dimension ids. + * + * @param root The root of the EinsumNode tree. + * @param inputs The input tensors. 
+ * @param sorted_dim_sizes The vector to store the sorted dimension sizes. + */ + constexpr void get_sorted_dimensions_sizes(const mini_jit::EinsumTree::EinsumNode *root, const std::vector &inputs, + std::vector &sorted_dim_sizes) { - get_sorted_dimensions_sizes(root->left, inputs, sorted_dim_sizes); + get_sorted_dimensions_sizes(root, inputs, sorted_dim_sizes); } - } - if (root->right != nullptr) - { - if (root->right->type == mini_jit::EinsumTree::NodeType::Leaf) + /** + * @brief Helper function to convert the parse error of the EinsumTree to the corresponding mlc::ErrorType. + * + * @param error The parse error of type mini_jit::EinsumTree::ErrorParse. + * @return constexpr mlc::ErrorType The error code or ErrorType::None on success. + */ + constexpr mlc::ErrorType convertParseError(mini_jit::EinsumTree::ErrorParse error) + { + switch (error) + { + case mini_jit::EinsumTree::ErrorParse::None: + return mlc::ErrorType::None; + case mini_jit::EinsumTree::ErrorParse::ExpectedLeftBracket: + return mlc::ErrorType::ParseExpectedLeftBracket; + case mini_jit::EinsumTree::ErrorParse::ExpectedRightBracket: + return mlc::ErrorType::ParseExpectedRightBracket; + case mini_jit::EinsumTree::ErrorParse::ExpectedArrow: + return mlc::ErrorType::ParseExpectedArrow; + case mini_jit::EinsumTree::ErrorParse::ExpectedComma: + return mlc::ErrorType::ParseExpectedComma; + case mini_jit::EinsumTree::ErrorParse::ExpectedDimensionList: + return mlc::ErrorType::ParseExpectedDimensionList; + case mini_jit::EinsumTree::ErrorParse::NotAllowedToParseAgain: + return mlc::ErrorType::ParseNotAllowedToParseAgain; + case mini_jit::EinsumTree::ErrorParse::UndefinedNode: + return mlc::ErrorType::ParseUndefinedNode; + default: + return mlc::ErrorType::Undefined; + } + } + + /** + * @brief Converts the error of the EinsumTree execution to the corresponding mlc::ErrorType. + * + * @param error The error of type mini_jit::EinsumTree::ErrorExecute. 
+ * @return constexpr mlc::ErrorType The error code or ErrorType::None on success. + */ + constexpr mlc::ErrorType convertErrorExecute(mini_jit::EinsumTree::ErrorExecute error) { - const auto &dim_sizes = inputs[root->right->input_tensor_index].dim_sizes; - uint i = 0; - for (int64_t id : root->right->output_dim_ids) + if (static_cast(error) > 100) { - sorted_dim_sizes.resize(std::max(static_cast(sorted_dim_sizes.size()), id + 1)); - sorted_dim_sizes[id] = dim_sizes[i++]; + return static_cast(static_cast(error)); + } + + switch (error) + { + case mini_jit::EinsumTree::ErrorExecute::None: + return mlc::ErrorType::None; + case mini_jit::EinsumTree::ErrorExecute::InvalidRoot: + return mlc::ErrorType::EinsumInvalidRoot; + case mini_jit::EinsumTree::ErrorExecute::NotEnoughInputTensors: + return mlc::ErrorType::EinsumNotEnoughInputTensors; + case mini_jit::EinsumTree::ErrorExecute::TooManyInputTensors: + return mlc::ErrorType::EinsumTooManyInputTensors; + case mini_jit::EinsumTree::ErrorExecute::NullPtrAsInputTensor: + return mlc::ErrorType::EinsumNullPtrAsInputTensor; + default: + return mlc::ErrorType::Undefined; } } - else + + /** + * @brief Converts the error of the TensorOperation to the corresponding mlc::ErrorType. + * + * @param error The error of type mini_jit::TensorOperation::error_t. + * @return constexpr mlc::ErrorType The converted error of the interface. 
+ */ + constexpr mlc::ErrorType convertTensorOperationError(mini_jit::TensorOperation::error_t error) { - get_sorted_dimensions_sizes(root->right, inputs, sorted_dim_sizes); + switch (error) + { + case mini_jit::TensorOperation::error_t::success: + return mlc::ErrorType::None; + case mini_jit::TensorOperation::error_t::err_wrong_dtype: + return mlc::ErrorType::ExecuteWrongDType; + case mini_jit::TensorOperation::error_t::err_wrong_dimension: + return mlc::ErrorType::ExecuteWrongDimension; + case mini_jit::TensorOperation::error_t::err_wrong_primitive: + return mlc::ErrorType::ExecuteWrongPrimitive; + case mini_jit::TensorOperation::error_t::err_wrong_first_touch_primitive: + return mlc::ErrorType::ExecuteFirstTouchPrimitive; + case mini_jit::TensorOperation::error_t::err_wrong_main_primitive: + return mlc::ErrorType::ExecuteWrongMainPrimitive; + case mini_jit::TensorOperation::error_t::err_wrong_last_touch_primitive: + return mlc::ErrorType::ExecuteWrongLastTouchPrimitive; + case mini_jit::TensorOperation::error_t::err_execution_type_not_supported: + return mlc::ErrorType::ExecuteTypeNotSupported; + case mini_jit::TensorOperation::error_t::err_invalid_primitive_configuration: + return mlc::ErrorType::ExecuteInvalidPrimitiveConfiguration; + case mini_jit::TensorOperation::error_t::err_invalid_first_touch_configuration: + return mlc::ErrorType::ExecuteInvalidFirstTouchConfiguration; + case mini_jit::TensorOperation::error_t::err_invalid_main_configuration: + return mlc::ErrorType::ExecuteInvalidMainConfiguration; + case mini_jit::TensorOperation::error_t::err_invalid_last_touch_configuration: + return mlc::ErrorType::ExecuteInvalidLastTouchConfiguration; + case mini_jit::TensorOperation::error_t::err_invalid_execution_order: + return mlc::ErrorType::ExecuteInvalidExecutionOrder; + case mini_jit::TensorOperation::error_t::err_invalid_strides: + return mlc::ErrorType::ExecuteInvalidStrides; + case mini_jit::TensorOperation::error_t::err_k_dimension_must_not_be_shared: 
+ return mlc::ErrorType::ExecuteKDimensionMustNotBeShared; + case mini_jit::TensorOperation::error_t::err_shared_required_for_parallel_execution: + return mlc::ErrorType::ExecuteSharedRequiredForParallelExecution; + default: + return mlc::ErrorType::Undefined; + } } - } -} + + /** + * @brief Get the size of the given tensor. + * + * @param tensor The tensor to calculate the size from. + * @return constexpr uint64_t The size of the tensor. + */ + constexpr uint64_t getTensorSize(const mlc::Tensor *tensor) + { + uint64_t size = 1; + for (auto dim : tensor->dim_sizes) + { + size *= dim; + } + return size; + } + + /** + * @brief Converts a primitive type from the interface unary to a corresponding primitive of the tensor config. + * + * @param type The unary type to convert. + * @return constexpr mini_jit::TensorConfig::prim_t The converted primitive. + */ + constexpr mini_jit::TensorConfig::prim_t convertPrimitiveType(mlc::UnaryType type) + { + switch (type) + { + case mlc::UnaryType::None: + return mini_jit::TensorConfig::prim_t::none; + case mlc::UnaryType::Identity: + return mini_jit::TensorConfig::prim_t::copy; + case mlc::UnaryType::Zero: + return mini_jit::TensorConfig::prim_t::zero; + case mlc::UnaryType::ReLU: + return mini_jit::TensorConfig::prim_t::relu; + default: + return mini_jit::TensorConfig::prim_t::none; + } + } + + /** + * @brief Recursively converts the given tensor into a string format. + * + * @param tensor The tensor to convert. + * @param str The string to write to. + * @param dim The current processed dimension. + * @param offset The offset from the data to be processed. + * @param indent The indentation of the current dimension. 
+ */ + void tensor_dim_to_string(mlc::Tensor *tensor, std::string &str, size_t dim, size_t offset, std::string indent); + } // namespace internal +} // namespace mlc +#endif // MLC_TENSORUTILS_H diff --git a/src/interface/Unary.cpp b/src/interface/Unary.cpp new file mode 100644 index 0000000..c8a07b3 --- /dev/null +++ b/src/interface/Unary.cpp @@ -0,0 +1,143 @@ +#include "../../include/MachineLearningCompiler/Tensor.h" +#include "../main/TensorOperation.h" +#include "TensorUtils.h" + +mlc::Error mlc::unary_zero(Tensor &input) +{ + int64_t stride = 1; + std::vector dimSizes(input.dim_sizes.size()); + std::vector strides(input.dim_sizes.size()); + + for (int64_t i = input.dim_sizes.size() - 1; i >= 0; i--) + { + strides[i] = stride; + dimSizes[i] = static_cast(input.dim_sizes[i]); + stride *= input.dim_sizes[i]; + } + + mini_jit::TensorOperation op; + mini_jit::TensorConfig config{ + mini_jit::TensorConfig::prim_t::none, // first_touch + mini_jit::TensorConfig::prim_t::zero, // main + mini_jit::TensorConfig::prim_t::none, // last touch + std::vector(input.dim_sizes.size(), mini_jit::TensorConfig::dim_t::c), // dim_types + std::vector(input.dim_sizes.size(), mini_jit::TensorConfig::exec_t::seq), // exec_types + dimSizes, // dim_sizes + strides, // strides_in0 + std::vector(input.dim_sizes.size(), 0), // strides_in1 + strides, // strides_out + mini_jit::TensorConfig::dtype_t::fp32, // dtype_t + }; + + mini_jit::TensorOperation::error_t error = op.setup(config); + mlc::ErrorType errorType = internal::convertTensorOperationError(error); + if (errorType != mlc::ErrorType::None) + { + return {errorType, "Could not generate the kernels for the gemm operation."}; + } + + op.execute(input.data, nullptr, input.data); + return {ErrorType::None, "Success"}; +} + +mlc::Error mlc::unary_relu(const Tensor &input, Tensor &output) +{ + if (output.dim_sizes.size() != input.dim_sizes.size()) + { + return {ErrorType::ExecuteWrongDimension, "Expected the output tensor to have the same 
number of dimension as the input."}; + } + + for (size_t i = 0; i < input.dim_sizes.size(); i++) + { + if (output.dim_sizes[i] != input.dim_sizes[i]) + { + return {ErrorType::ExecuteWrongDimension, "Expected the output tensor to have the same number of dimension as the input."}; + } + } + + int64_t stride = 1; + std::vector dimSizes(input.dim_sizes.size()); + std::vector strides(input.dim_sizes.size()); + + for (int64_t i = input.dim_sizes.size() - 1; i >= 0; i--) + { + strides[i] = stride; + dimSizes[i] = static_cast(input.dim_sizes[i]); + stride *= input.dim_sizes[i]; + } + + mini_jit::TensorOperation op; + mini_jit::TensorConfig config{ + mini_jit::TensorConfig::prim_t::none, // first_touch + mini_jit::TensorConfig::prim_t::relu, // main + mini_jit::TensorConfig::prim_t::none, // last touch + std::vector(input.dim_sizes.size(), mini_jit::TensorConfig::dim_t::c), // dim_types + std::vector(input.dim_sizes.size(), mini_jit::TensorConfig::exec_t::seq), // exec_types + dimSizes, // dim_sizes + strides, // strides_in0 + std::vector(input.dim_sizes.size(), 0), // strides_in1 + strides, // strides_out + mini_jit::TensorConfig::dtype_t::fp32, // dtype_t + }; + + mini_jit::TensorOperation::error_t error = op.setup(config); + mlc::ErrorType errorType = internal::convertTensorOperationError(error); + if (errorType != mlc::ErrorType::None) + { + return {errorType, "Could not generate the kernels for the gemm operation."}; + } + + op.execute(input.data, nullptr, output.data); + return {ErrorType::None, "Success"}; +} + +mlc::Error mlc::unary_identity(const Tensor &input, Tensor &output) +{ + if (output.dim_sizes.size() != input.dim_sizes.size()) + { + return {ErrorType::ExecuteWrongDimension, "Expected the output tensor to have the same number of dimension as the input."}; + } + + for (size_t i = 0; i < input.dim_sizes.size(); i++) + { + if (output.dim_sizes[i] != input.dim_sizes[i]) + { + return {ErrorType::ExecuteWrongDimension, "Expected the output tensor to have the same 
number of dimension as the input."}; + } + } + + int64_t stride = 1; + std::vector dimSizes(input.dim_sizes.size()); + std::vector strides(input.dim_sizes.size()); + + for (int64_t i = input.dim_sizes.size() - 1; i >= 0; i--) + { + strides[i] = stride; + dimSizes[i] = static_cast(input.dim_sizes[i]); + stride *= input.dim_sizes[i]; + } + + mini_jit::TensorOperation op; + mini_jit::TensorConfig config{ + mini_jit::TensorConfig::prim_t::none, // first_touch + mini_jit::TensorConfig::prim_t::copy, // main + mini_jit::TensorConfig::prim_t::none, // last touch + std::vector(input.dim_sizes.size(), mini_jit::TensorConfig::dim_t::c), // dim_types + std::vector(input.dim_sizes.size(), mini_jit::TensorConfig::exec_t::seq), // exec_types + dimSizes, // dim_sizes + strides, // strides_in0 + std::vector(input.dim_sizes.size(), 0), // strides_in1 + strides, // strides_out + mini_jit::TensorConfig::dtype_t::fp32, // dtype_t + }; + + mini_jit::TensorOperation::error_t error = op.setup(config); + mlc::ErrorType errorType = internal::convertTensorOperationError(error); + if (errorType != mlc::ErrorType::None) + { + return {errorType, "Could not generate the kernels for the gemm operation."}; + } + + op.execute(input.data, nullptr, output.data); + return {ErrorType::None, "Success"}; +} \ No newline at end of file diff --git a/src/main/EinsumTree.cpp b/src/main/EinsumTree.cpp index 61c58aa..37ab945 100644 --- a/src/main/EinsumTree.cpp +++ b/src/main/EinsumTree.cpp @@ -165,6 +165,11 @@ void mini_jit::EinsumTree::set_sorted_dim_sizes(const std::vector &sort EinsumTree::dim_sizes = sorted_dim_sizes; } +const std::vector &mini_jit::EinsumTree::get_sorted_dim_sizes() +{ + return dim_sizes; +} + void mini_jit::EinsumTree::delete_tree(EinsumNode *node) { if (node == nullptr) diff --git a/src/main/EinsumTree.h b/src/main/EinsumTree.h index 73d0890..e556d8d 100644 --- a/src/main/EinsumTree.h +++ b/src/main/EinsumTree.h @@ -119,14 +119,6 @@ namespace mini_jit EinsumNode *parse_node(size_t 
&pos, const std::string &str); // Lowering - /** - * Lowers the given EinsumNode to a TensorConfig. - * - * @param node The EinsumNode to lower. - * @return A TensorConfig representing the lowered node. - */ - TensorConfig lower_node(const EinsumNode *node); - /** * Retrieves the dimension types and sizes for the given EinsumNode. * @@ -251,6 +243,8 @@ namespace mini_jit */ void set_sorted_dim_sizes(const std::vector &sorted_dim_sizes); + const std::vector &get_sorted_dim_sizes(); + /** * Parses the einsum tree string and builds the tree structure. * @@ -309,6 +303,14 @@ namespace mini_jit * @return ErrorExecute indicating the result of the execution operation. */ ErrorExecute execute(const std::vector &tensors); + + /** + * Lowers the given EinsumNode to a TensorConfig. + * + * @param node The EinsumNode to lower. + * @return A TensorConfig representing the lowered node. + */ + TensorConfig lower_node(const EinsumNode *node); }; }; // namespace mini_jit diff --git a/src/main/TensorOperation.cpp b/src/main/TensorOperation.cpp index 5e34e40..88cb2ca 100644 --- a/src/main/TensorOperation.cpp +++ b/src/main/TensorOperation.cpp @@ -107,13 +107,14 @@ int32_t mini_jit::TensorOperation::findMatch(const std::span= dim.size()) { return -1; } + release_assert(startIndex < dim.size(), "Expected the start index to be less than the dimension types size."); + for (auto [iDim, iExec] = std::tuple{dim.begin() + startIndex, exec.begin() + startIndex}; iDim != dim.end(); ++iDim, ++iExec) { if (*iDim == searchDim && *iExec == searchExec) diff --git a/src/main/TensorOptimization.cpp b/src/main/TensorOptimization.cpp index a89d89e..e7f1588 100644 --- a/src/main/TensorOptimization.cpp +++ b/src/main/TensorOptimization.cpp @@ -93,7 +93,11 @@ void mini_jit::TensorOptimization::_primitive_identification(TensorConfig &confi if (fixed_k2 == false && (primitive_k2 == -1 || primitive_stride < primitive_k2_stride)) { - primitive_k2 = std::distance(config.dim_types.begin(), iDim); + int32_t 
index = std::distance(config.dim_types.begin(), iDim); + if (index != primitive_k1) + { + primitive_k2 = index; + } } } else if (*iDim == TensorConfig::dim_t::m) @@ -260,7 +264,7 @@ void mini_jit::TensorOptimization::_dimension_reordering_shared(TensorConfig &co } if (primitive_m != -1) { - int32_t new_index = config.dim_types.size() - 2 - (primitive_k1 != -1); + int32_t new_index = config.dim_types.size() - 1 - (primitive_n != -1) - (primitive_k1 != -1); _swap_elements(config, primitive_m, new_index); _reorder_helper_adjust_index(new_index, primitive_m, primitive_m, primitive_n, primitive_k1, primitive_k2); primitive_m = new_index; @@ -410,7 +414,7 @@ void mini_jit::TensorOptimization::_dimension_reordering_fusing(TensorConfig &co } } -void mini_jit::TensorOptimization::_swap_elements(TensorConfig &config, size_t index1, size_t index2) +void mini_jit::TensorOptimization::_swap_elements(TensorConfig &config, int64_t index1, int64_t index2) { if (index1 == index2) { @@ -424,8 +428,10 @@ void mini_jit::TensorOptimization::_swap_elements(TensorConfig &config, size_t i release_assert(config.dim_types.size() == config.strides_in0.size(), "Expected the dimension types size to match the strides_in0 size."); release_assert(config.dim_types.size() == config.strides_in1.size(), "Expected the dimension types size to match the strides_in1 size."); release_assert(config.dim_types.size() == config.strides_out.size(), "Expected the dimension types size to match the strides_out size."); - release_assert(index1 < config.dim_types.size(), "Expected the index1 to be less than the dimension types size."); - release_assert(index2 < config.dim_types.size(), "Expected the index2 to be less than the dimension types size."); + release_assert(index1 < static_cast(config.dim_types.size()), "Expected the index1 to be less than the dimension types size."); + release_assert(index2 < static_cast(config.dim_types.size()), "Expected the index2 to be less than the dimension types size."); + 
release_assert(index1 >= 0, "Expected the index1 to be larger equal than 0."); + release_assert(index2 >= 0, "Expected the index2 to be larger equal than 0."); std::iter_swap(config.dim_types.begin() + index1, config.dim_types.begin() + index2); std::iter_swap(config.dim_sizes.begin() + index1, config.dim_sizes.begin() + index2); @@ -498,6 +504,11 @@ void mini_jit::TensorOptimization::_dimension_fusing(TensorConfig &config) { for (size_t i = 0; i + 1 < config.dim_sizes.size(); ++i) { + if (config.dim_sizes.size() <= 2) + { + return; + } + // Check if adjacent dims have the same type and their product is less equal than 256 // stride(X) = |Y| * stride(Y) if (config.dim_types[i] == config.dim_types[i + 1] && config.strides_in0[i] == (config.dim_sizes[i + 1] * config.strides_in0[i + 1]) && diff --git a/src/main/TensorOptimization.h b/src/main/TensorOptimization.h index d018932..66ee34f 100644 --- a/src/main/TensorOptimization.h +++ b/src/main/TensorOptimization.h @@ -74,7 +74,7 @@ namespace mini_jit * @param index1 The index of element 1 to be set at the position of index2. * @param index2 The index of element 2 to be set at the position of index1. */ - void _swap_elements(TensorConfig &config, size_t index1, size_t index2); + void _swap_elements(TensorConfig &config, int64_t index1, int64_t index2); /** * @brief Moves an element from the old index to the new index position. 
diff --git a/src/test/interface/Tensor.test.cpp b/src/test/interface/Tensor.test.cpp index fec9b34..5d26805 100644 --- a/src/test/interface/Tensor.test.cpp +++ b/src/test/interface/Tensor.test.cpp @@ -6,7 +6,7 @@ #include #include -TEST_CASE("Test tensor fill_random", "[tensor][correctness]") +TEST_CASE("Test interface tensor fill_random", "[tensor][correctness]") { std::vector shape1 = {3, 4}; @@ -28,7 +28,95 @@ TEST_CASE("Test tensor fill_random", "[tensor][correctness]") } } -TEST_CASE("Test tensor einsum", "[tensor][correctness]") +TEST_CASE("Test interface tensor fill_number", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4}; + + size_t total_size1 = shape1[0] * shape1[1]; + float *data1 = new float[total_size1]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = std::nanf("1"); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::fill_number(tensor1, 1); + + for (size_t i = 0; i < total_size1; i++) + { + CAPTURE(i); + REQUIRE(tensor1.data[i] == 1); + } +} + +TEST_CASE("Test interface tensor fill_counting_up", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4}; + + size_t total_size1 = shape1[0] * shape1[1]; + float *data1 = new float[total_size1]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = std::nanf("1"); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::fill_counting_up(tensor1, 5, 0.5); + + for (size_t i = 0; i < total_size1; i++) + { + CAPTURE(i); + REQUIRE(tensor1.data[i] == (0.5f * i + 5)); + } +} + +TEST_CASE("Test interface tensor fill_counting_down", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4}; + + size_t total_size1 = shape1[0] * shape1[1]; + float *data1 = new float[total_size1]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = std::nanf("1"); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::fill_counting_down(tensor1, 5, 1.0); + + for (int64_t i = 0; i < static_cast(total_size1); i++) + { + CAPTURE(i); + REQUIRE(tensor1.data[i] == (-i + 5)); + } +} + +TEST_CASE("Test 
interface tensor fill_lambda", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4}; + + size_t total_size1 = shape1[0] * shape1[1]; + float *data1 = new float[total_size1]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = std::nanf("1"); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::fill_lambda(tensor1, [](const mlc::Tensor &, size_t index) { return index; }); + + for (size_t i = 0; i < total_size1; i++) + { + CAPTURE(i); + REQUIRE(tensor1.data[i] == i); + } +} + +TEST_CASE("Test interface tensor einsum reference", "[tensor][correctness]") { std::vector shape1 = {3, 4}; std::vector shape2 = {4, 5}; @@ -59,5 +147,416 @@ TEST_CASE("Test tensor einsum", "[tensor][correctness]") mlc::Tensor tensor2(data2, shape2); mlc::Tensor tensor3(data2, shape3); - mlc::einsum({tensor1, tensor2}, tensor3, "[0,1],[1,2]->[0,2]"); + REQUIRE(tensor1.strides.size() == 2); + REQUIRE(tensor1.strides[0] == 4); + REQUIRE(tensor1.strides[1] == 1); + REQUIRE(tensor2.strides.size() == 2); + REQUIRE(tensor2.strides[0] == 5); + REQUIRE(tensor2.strides[1] == 1); + REQUIRE(tensor3.strides.size() == 2); + REQUIRE(tensor3.strides[0] == 5); + REQUIRE(tensor3.strides[1] == 1); + + mlc::Error err = mlc::einsum({tensor1, tensor2}, tensor3, "[0,1],[1,2]->[0,2]"); + REQUIRE(err.type == mlc::ErrorType::None); + + delete[] data1; + delete[] data2; + delete[] data3; +} + +TEST_CASE("Test interface tensor einsum pointer", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4}; + std::vector shape2 = {4, 5}; + std::vector shape3 = {3, 5}; + + size_t total_size1 = shape1[0] * shape1[1]; + size_t total_size2 = shape2[0] * shape2[1]; + size_t total_size3 = shape3[0] * shape3[1]; + + float *data1 = new float[total_size1]; + float *data2 = new float[total_size2]; + float *data3 = new float[total_size3]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = static_cast(i); + } + for (size_t i = 0; i < total_size2; ++i) + { + data2[i] = static_cast(2 * i); + } + for (size_t i = 0; i 
< total_size3; ++i) + { + data3[i] = static_cast(3 * i); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::Tensor tensor2(data2, shape2); + mlc::Tensor tensor3(data2, shape3); + std::vector inputs{&tensor1, &tensor2}; + + REQUIRE(tensor1.strides.size() == 2); + REQUIRE(tensor1.strides[0] == 4); + REQUIRE(tensor1.strides[1] == 1); + REQUIRE(tensor2.strides.size() == 2); + REQUIRE(tensor2.strides[0] == 5); + REQUIRE(tensor2.strides[1] == 1); + REQUIRE(tensor3.strides.size() == 2); + REQUIRE(tensor3.strides[0] == 5); + REQUIRE(tensor3.strides[1] == 1); + + CAPTURE(inputs); + mlc::Error err = mlc::einsum(inputs, tensor3, "[0,1],[1,2]->[0,2]"); + REQUIRE(err.type == mlc::ErrorType::None); + + delete[] data1; + delete[] data2; + delete[] data3; +} + +TEST_CASE("Test interface tensor contraction", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4}; + std::vector shape2 = {4, 5}; + std::vector shape3 = {3, 5}; + + size_t total_size1 = shape1[0] * shape1[1]; + size_t total_size2 = shape2[0] * shape2[1]; + size_t total_size3 = shape3[0] * shape3[1]; + + float *data1 = new float[total_size1]; + float *data2 = new float[total_size2]; + float *data3 = new float[total_size3]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = static_cast(i); + } + for (size_t i = 0; i < total_size2; ++i) + { + data2[i] = static_cast(2 * i); + } + for (size_t i = 0; i < total_size3; ++i) + { + data3[i] = static_cast(3 * i); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::Tensor tensor2(data2, shape2); + mlc::Tensor tensor3(data2, shape3); + + REQUIRE(tensor1.strides.size() == 2); + REQUIRE(tensor1.strides[0] == 4); + REQUIRE(tensor1.strides[1] == 1); + REQUIRE(tensor2.strides.size() == 2); + REQUIRE(tensor2.strides[0] == 5); + REQUIRE(tensor2.strides[1] == 1); + REQUIRE(tensor3.strides.size() == 2); + REQUIRE(tensor3.strides[0] == 5); + REQUIRE(tensor3.strides[1] == 1); + + mlc::Error err = mlc::contraction(tensor1, tensor2, tensor3, "[0,1],[1,2]->[0,2]"); + REQUIRE(err.type 
== mlc::ErrorType::None); + + delete[] data1; + delete[] data2; + delete[] data3; +} + +TEST_CASE("Test interface tensor gemm", "[tensor][correctness]") +{ + std::vector shape1 = {4, 3}; // k, m + std::vector shape2 = {5, 4}; // n, k + std::vector shape3 = {5, 3}; // n, m + + size_t total_size1 = shape1[0] * shape1[1]; + size_t total_size2 = shape2[0] * shape2[1]; + size_t total_size3 = shape3[0] * shape3[1]; + + float *data1 = new float[total_size1]; + float *data2 = new float[total_size2]; + float *data3 = new float[total_size3]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = static_cast(i); + } + for (size_t i = 0; i < total_size2; ++i) + { + data2[i] = static_cast(2 * i); + } + for (size_t i = 0; i < total_size3; ++i) + { + data3[i] = static_cast(3 * i); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::Tensor tensor2(data2, shape2); + mlc::Tensor tensor3(data2, shape3); + + REQUIRE(tensor1.strides.size() == 2); + REQUIRE(tensor1.strides[0] == 3); + REQUIRE(tensor1.strides[1] == 1); + REQUIRE(tensor2.strides.size() == 2); + REQUIRE(tensor2.strides[0] == 4); + REQUIRE(tensor2.strides[1] == 1); + REQUIRE(tensor3.strides.size() == 2); + REQUIRE(tensor3.strides[0] == 3); + REQUIRE(tensor3.strides[1] == 1); + + mlc::Error err = mlc::gemm(tensor1, tensor2, tensor3); + REQUIRE(err.type == mlc::ErrorType::None); + + delete[] data1; + delete[] data2; + delete[] data3; +} + +TEST_CASE("Test interface tensor gemm failure", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4, 5}; // Invalid shape for GEMM, should be 2D + std::vector shape2 = {4, 5}; + std::vector shape3 = {3, 5}; + + size_t total_size1 = shape1[0] * shape1[1]; + size_t total_size2 = shape2[0] * shape2[1]; + size_t total_size3 = shape3[0] * shape3[1]; + + float *data1 = new float[total_size1]; + float *data2 = new float[total_size2]; + float *data3 = new float[total_size3]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = static_cast(i); + } + for (size_t i = 0; i < 
total_size2; ++i) + { + data2[i] = static_cast(2 * i); + } + for (size_t i = 0; i < total_size3; ++i) + { + data3[i] = static_cast(3 * i); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::Tensor tensor2(data2, shape2); + mlc::Tensor tensor3(data2, shape3); + + REQUIRE(tensor1.strides.size() == 3); + REQUIRE(tensor1.strides[0] == 20); + REQUIRE(tensor1.strides[1] == 5); + REQUIRE(tensor1.strides[2] == 1); + REQUIRE(tensor2.strides.size() == 2); + REQUIRE(tensor2.strides[0] == 5); + REQUIRE(tensor2.strides[1] == 1); + REQUIRE(tensor3.strides.size() == 2); + REQUIRE(tensor3.strides[0] == 5); + REQUIRE(tensor3.strides[1] == 1); + + mlc::Error err = mlc::gemm(tensor1, tensor2, tensor3); + REQUIRE(err.type == mlc::ErrorType::TensorExpected2DTensor); + + delete[] data1; + delete[] data2; + delete[] data3; +} + +TEST_CASE("Test interface tensor unary zero", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4, 5}; + + size_t total_size1 = shape1[0] * shape1[1] * shape1[2]; + + float *data1 = new float[total_size1]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = static_cast(i); + } + + mlc::Tensor tensor1(data1, shape1); + + REQUIRE(tensor1.strides.size() == 3); + REQUIRE(tensor1.strides[0] == 20); + REQUIRE(tensor1.strides[1] == 5); + REQUIRE(tensor1.strides[2] == 1); + + mlc::Error err = mlc::unary_zero(tensor1); + REQUIRE(err.type == mlc::ErrorType::None); + + for (size_t i = 0; i < total_size1; i++) + { + CAPTURE(i); + REQUIRE(tensor1.data[i] == 0); + } + + delete[] data1; +} + +TEST_CASE("Test interface tensor unary relu", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4, 5}; + std::vector shape2 = {3, 4, 5}; + + size_t total_size1 = shape1[0] * shape1[1] * shape1[2]; + size_t total_size2 = shape2[0] * shape2[1] * shape2[2]; + + float *data1 = new float[total_size1]; + float *data2 = new float[total_size2]; + + for (int64_t i = 0; i < static_cast(total_size1); ++i) + { + data1[i] = static_cast(i * (2 * (i % 2) - 1)); + } + for (size_t i = 
0; i < total_size2; ++i) + { + data2[i] = 0; + } + + mlc::Tensor tensor1(data1, shape1); + mlc::Tensor tensor2(data2, shape2); + + REQUIRE(tensor1.strides.size() == 3); + REQUIRE(tensor1.strides[0] == 20); + REQUIRE(tensor1.strides[1] == 5); + REQUIRE(tensor1.strides[2] == 1); + REQUIRE(tensor2.strides.size() == 3); + REQUIRE(tensor2.strides[0] == 20); + REQUIRE(tensor2.strides[1] == 5); + REQUIRE(tensor2.strides[2] == 1); + + mlc::Error err = mlc::unary_relu(tensor1, tensor2); + REQUIRE(err.type == mlc::ErrorType::None); + + for (size_t i = 0; i < total_size1; i++) + { + CAPTURE(i); + REQUIRE(tensor2.data[i] == std::max(0.0f, tensor1.data[i])); + } + + delete[] data1; + delete[] data2; +} + +TEST_CASE("Test interface tensor unary identity", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4, 5}; + std::vector shape2 = {3, 4, 5}; + + size_t total_size1 = shape1[0] * shape1[1] * shape1[2]; + size_t total_size2 = shape2[0] * shape2[1] * shape2[2]; + + float *data1 = new float[total_size1]; + float *data2 = new float[total_size2]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = static_cast(i); + } + for (size_t i = 0; i < total_size2; ++i) + { + data2[i] = 0; + } + + mlc::Tensor tensor1(data1, shape1); + mlc::Tensor tensor2(data2, shape2); + + REQUIRE(tensor1.strides.size() == 3); + REQUIRE(tensor1.strides[0] == 20); + REQUIRE(tensor1.strides[1] == 5); + REQUIRE(tensor1.strides[2] == 1); + REQUIRE(tensor2.strides.size() == 3); + REQUIRE(tensor2.strides[0] == 20); + REQUIRE(tensor2.strides[1] == 5); + REQUIRE(tensor2.strides[2] == 1); + + mlc::Error err = mlc::unary_identity(tensor1, tensor2); + REQUIRE(err.type == mlc::ErrorType::None); + + for (size_t i = 0; i < total_size1; i++) + { + CAPTURE(i); + REQUIRE(tensor1.data[i] == tensor2.data[i]); + } + + delete[] data1; + delete[] data2; +} + +TEST_CASE("Test interface tensor contraction first+last", "[tensor][correctness]") +{ + std::vector shape1 = {3, 4}; + std::vector shape2 = {4, 5}; + 
std::vector shape3 = {3, 5}; + + size_t total_size1 = shape1[0] * shape1[1]; + size_t total_size2 = shape2[0] * shape2[1]; + size_t total_size3 = shape3[0] * shape3[1]; + + float *data1 = new float[total_size1]; + float *data2 = new float[total_size2]; + float *data3 = new float[total_size3]; + + for (size_t i = 0; i < total_size1; ++i) + { + data1[i] = static_cast(i); + } + for (size_t i = 0; i < total_size2; ++i) + { + data2[i] = static_cast(2 * i); + } + for (size_t i = 0; i < total_size3; ++i) + { + data3[i] = static_cast(3 * i); + } + + mlc::Tensor tensor1(data1, shape1); + mlc::Tensor tensor2(data2, shape2); + mlc::Tensor tensor3(data2, shape3); + + REQUIRE(tensor1.strides.size() == 2); + REQUIRE(tensor1.strides[0] == 4); + REQUIRE(tensor1.strides[1] == 1); + REQUIRE(tensor2.strides.size() == 2); + REQUIRE(tensor2.strides[0] == 5); + REQUIRE(tensor2.strides[1] == 1); + REQUIRE(tensor3.strides.size() == 2); + REQUIRE(tensor3.strides[0] == 5); + REQUIRE(tensor3.strides[1] == 1); + + mlc::Error err = mlc::contraction(tensor1, tensor2, tensor3, "[0,1],[1,2]->[0,2]", mlc::UnaryType::None, mlc::UnaryType::None); + REQUIRE(err.type == mlc::ErrorType::None); + + delete[] data1; + delete[] data2; + delete[] data3; +} + +TEST_CASE("Test interface tensor einsum operation", "[setup][correctness]") +{ + std::vector shape1 = {3, 4}; + std::vector shape2 = {4, 5}; + std::vector shape3 = {3, 5}; + + mlc::Tensor tensor1(shape1); + mlc::Tensor tensor2(shape2); + mlc::Tensor tensor3(shape3); + + mlc::TensorOperation *setup = mlc::einsum_operation({shape1, shape2}, shape3, "[0,1],[1,2]->[0,2]"); + + mlc::Error error = setup->execute({tensor1, tensor2}, tensor3); + INFO(error.message); + REQUIRE(error.type == mlc::ErrorType::None); + + error = setup->execute({tensor1, tensor2}, tensor3); + INFO(error.message); + REQUIRE(error.type == mlc::ErrorType::None); + + error = setup->execute({tensor1, tensor2}, tensor3); + INFO(error.message); + REQUIRE(error.type == 
mlc::ErrorType::None); + delete setup; } \ No newline at end of file diff --git a/src/test/interface/TensorUtils.test.cpp b/src/test/interface/TensorUtils.test.cpp index 9e75ea2..9748cdc 100644 --- a/src/test/interface/TensorUtils.test.cpp +++ b/src/test/interface/TensorUtils.test.cpp @@ -5,7 +5,7 @@ #include #include -TEST_CASE("Test tensor utils get_sorted_dimensions_sizes", "[tensor][correctness]") +TEST_CASE("Test interface tensor utils get_sorted_dimensions_sizes", "[tensor][correctness]") { std::vector shape1 = {3, 4}; std::vector shape2 = {4, 5}; @@ -40,7 +40,7 @@ TEST_CASE("Test tensor utils get_sorted_dimensions_sizes", "[tensor][correctness tree.parse_tree(); std::vector sorted_dimensions_sizes; - get_sorted_dimensions_sizes(tree.get_root(), {tensor1, tensor2}, sorted_dimensions_sizes); + mlc::internal::get_sorted_dimensions_sizes(tree.get_root(), {tensor1, tensor2}, sorted_dimensions_sizes); std::vector expected = {3, 4, 5};