diff --git a/.gitmodules b/.gitmodules
index 2a53974a..62ea8cff 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "libs/cnkalman"]
-	path = libs/cnkalman
-	url = ../cnkalman.git
+[submodule "libs/cnkalman/libs/pybind11"]
+	path = libs/cnkalman/libs/pybind11
+	url = git@github.com:pybind/pybind11.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f40af1f4..71f0e937 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -188,12 +188,7 @@ SET(CMAKE_SKIP_BUILD_RPATH  FALSE)
 SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 
-if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/libs/cnkalman/CMakeLists.txt")
-  find_program(GIT git)
-  message("Submodule seems empty; populating...")
-  execute_process(COMMAND ${GIT} submodule update --init --recursive WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-endif()
-
+add_subdirectory(libs/cnmatrix)
 add_subdirectory(libs/cnkalman)
 
 add_subdirectory(redist)
diff --git a/libs/cnkalman b/libs/cnkalman
deleted file mode 160000
index a3aa1cec..00000000
--- a/libs/cnkalman
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a3aa1cec8ec2f6a1db8ed2c0e4836da329b32659
diff --git a/libs/cnkalman/.dockerignore b/libs/cnkalman/.dockerignore
new file mode 100644
index 00000000..8eee68fe
--- /dev/null
+++ b/libs/cnkalman/.dockerignore
@@ -0,0 +1 @@
+cmake-build-*
diff --git a/libs/cnkalman/.github/workflows/build_wheels.yml b/libs/cnkalman/.github/workflows/build_wheels.yml
new file mode 100644
index 00000000..2b5df81b
--- /dev/null
+++ b/libs/cnkalman/.github/workflows/build_wheels.yml
@@ -0,0 +1,52 @@
+# This is a basic workflow to help you get started with Actions
+
+name: Build Wheels
+
+on:
+  push:
+  create:
+    tags: [ master ]
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, windows-2019]
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+          submodules: recursive
+
+      - uses: actions/setup-python@v2
+        name: Install Python
+        with:
+          python-version: '3.7'
+
+      - name: Get Dependencies
+        run: pip install twine build
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.11.2
+        env:
+          CIBW_SKIP: "*-win32 *-manylinux_i686 cp36-*"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          path: ./wheelhouse/*.whl
+
+      - name: Upload with twine
+        run: |
+          python -m pip install twine
+          python -m twine upload wheelhouse/*.whl --skip-existing
+        continue-on-error: true
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.TWINE_TOKEN }}
diff --git a/libs/cnkalman/.github/workflows/cmake.yml b/libs/cnkalman/.github/workflows/cmake.yml
new file mode 100644
index 00000000..4b37f3c9
--- /dev/null
+++ b/libs/cnkalman/.github/workflows/cmake.yml
@@ -0,0 +1,112 @@
+name: Build and Test
+
+on:
+  pull_request:
+  push:
+  release:
+    types:
+      - created
+
+env:
+  BUILD_TYPE: Release
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [Release]
+        os: [ubuntu-18.04, windows-latest]
+        include:
+        - os: ubuntu-18.04
+          build_type: Debug
+        - os: windows-latest
+          build_type: Debug
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+        submodules: recursive
+    - if: contains(matrix.os, 'ubuntu')
+      name: Get Dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install p7zip-full build-essential zlib1g-dev liblapacke-dev libopenblas-dev libatlas-base-dev python3 python3-pip apt-transport-https python-virtualenv ninja-build libeigen3-dev googletest ${{ matrix.deps }} python3-setuptools
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Get python Dependencies
+      run: pip3 install filterpy numpy sympy symengine
+    - name: Setup windows flags
+      if: contains(matrix.os, 'windows') && matrix.build_type == 'Release'
+      id: flags
+      run: echo "::set-output name=SETUP_PY_FLAGS::-G 'Visual Studio 16 2019'"
+    - name: Get mac dependencies
+      if: contains(matrix.os, 'macos')
+      run: |
+        brew install freeglut lapack openblas
+    - name: Create Build Environment
+      # Some projects don't allow in-source building, so create a separate build directory
+      # We'll use this as our working directory for all subsequent commands
+      run: cmake -E make_directory ${{runner.workspace}}/build
+
+    - name: Configure CMake
+      shell: bash
+      working-directory: ${{runner.workspace}}/build
+      run: cmake $GITHUB_WORKSPACE -DUSE_EIGEN=On -DDOWNLOAD_EIGEN=On -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_TESTS=ON -DCMAKE_INSTALL_PREFIX=install_root ${{matrix.config_flags}}
+
+    - name: Build
+      working-directory: ${{runner.workspace}}/build
+      shell: bash
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build . --config $BUILD_TYPE -v
+    
+    - name: Install
+      working-directory: ${{runner.workspace}}/build
+      shell: bash
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build . --config $BUILD_TYPE --target install
+
+    - name: Set bundle name
+      id: bundle
+      run: echo "::set-output name=BUNDLE_FILE_NAME::cnkalman-$(git describe --tags)-${{ matrix.os }}.zip"
+
+    - name: Bundle
+      if: matrix.build_type == 'Release'
+      working-directory: ${{runner.workspace}}/cnkalman
+      run: 7z a ${{runner.workspace}}/build/${{ steps.bundle.outputs.BUNDLE_FILE_NAME }} ${{runner.workspace}}/build/install_root/*
+
+    - uses: actions/upload-artifact@v2
+      name: Upload
+      if: matrix.build_type == 'Release'
+      with:
+        name: cnkalman-${{ matrix.os }}-${{ matrix.build_type }}
+        path: | 
+          ${{runner.workspace}}/build/${{steps.bundle.outputs.BUNDLE_FILE_NAME}}
+    - name: Get release
+      id: get_release
+      if: github.event_name == 'release'
+      uses: bruceadams/get-release@v1.2.2
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Upload Release Asset
+      id: upload-release-asset
+      continue-on-error: true
+      if: github.event_name == 'release'
+      uses: actions/upload-release-asset@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      with:
+        upload_url: ${{ steps.get_release.outputs.upload_url }}
+        asset_path: ${{runner.workspace}}/build/${{steps.bundle.outputs.BUNDLE_FILE_NAME}}
+        asset_name: ${{steps.bundle.outputs.BUNDLE_FILE_NAME}}
+        asset_content_type: application/zip
+
+    - name: Test
+      working-directory: ${{runner.workspace}}/build
+      shell: bash
+      # Execute tests defined by the CMake configuration.
+      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
+      run: ctest -C $BUILD_TYPE --output-on-failure -j30
diff --git a/libs/cnkalman/.gitignore b/libs/cnkalman/.gitignore
new file mode 100644
index 00000000..bcd03e76
--- /dev/null
+++ b/libs/cnkalman/.gitignore
@@ -0,0 +1,43 @@
+# Linux
+.idea
+*.o
+*.so
+lib
+cmake-build-*
+build*
+
+# Temp files
+*~
+*/\#*\#
+\#*\#
+
+
+# Windows specific
+*.dll
+*.exe
+obj/
+bin/
+x64/
+packages
+.vs/
+Debug/
+Release/
+*.VC.db
+*.user
+*.ipch
+*.tlog
+*.d
+build
+
+.options
+.zip
+.avi
+
+*/__pycache__
+*.pyc
+
+venv
+.eggs/
+dist/
+
+**/*.egg-info/
\ No newline at end of file
diff --git a/libs/cnkalman/Android.mk b/libs/cnkalman/Android.mk
new file mode 100644
index 00000000..4fbd807d
--- /dev/null
+++ b/libs/cnkalman/Android.mk
@@ -0,0 +1,13 @@
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE                := libcnkalman
+LOCAL_SRC_FILES             := src/iekf.c src/kalman.c src/model.cc src/numerical_diff.c
+LOCAL_CPP_EXTENSION         := .cc
+LOCAL_CFLAGS                := -Wno-error=unused-parameter -Wno-error=unused-variable
+LOCAL_MODULE_CLASS          := STATIC_LIBRARIES
+LOCAL_C_INCLUDES            := $(LOCAL_PATH)/include $(LOCAL_PATH)/src
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include
+LOCAL_STATIC_LIBRARIES      := libcnmatrix
+LOCAL_PROPRIETARY_MODULE    := true
+include $(BUILD_STATIC_LIBRARY)
diff --git a/libs/cnkalman/CMakeLists.txt b/libs/cnkalman/CMakeLists.txt
new file mode 100644
index 00000000..5b489ca8
--- /dev/null
+++ b/libs/cnkalman/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.12)
+
+project(cnkalman LANGUAGES C CXX)
+include(CTest)
+
+include(CheckIncludeFile)
+include (CheckSymbolExists)
+
+set(CMAKE_CXX_STANDARD 17)
+
+option(USE_ASAN "Use address sanitizer" OFF)
+
+if(NOT PYTHON_EXECUTABLE)
+  find_package (Python 3.8 COMPONENTS Interpreter Development REQUIRED)
+endif()
+
+if(UNIX)
+  add_compile_options(-fPIC -Wall -Wno-unused-variable -Wno-switch -Wno-parentheses -Wno-missing-braces -Werror=return-type -fvisibility=hidden -Werror=vla -fno-math-errno -Werror=pointer-arith)
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SHARED_FLAGS} -std=gnu99 -Werror=incompatible-pointer-types -Werror=implicit-function-declaration -Werror=missing-field-initializers ")
+
+  if(USE_ASAN)
+    add_compile_options(-fsanitize=address)
+    add_link_options(-fsanitize=address)
+  endif()
+endif()
+
+find_package(sciplot)
+if(sciplot_FOUND)
+  message("Using sciplot...")
+  add_compile_definitions(HAS_SCIPLOT)
+else()
+  message("Can't use sciplot...")
+endif()
+
+add_subdirectory(libs/pybind11)
+
+add_subdirectory(src)
+
+set(cnkalman_root_source_dir ${CMAKE_CURRENT_SOURCE_DIR} CACHE INTERNAL "")
+execute_process(COMMAND  ${CMAKE_COMMAND} -E env PYTHONPATH=${cnkalman_root_source_dir} python3 -c "import cnkalman.codegen as cg"
+        RESULT_VARIABLE INVALID_PYTHON3 ERROR_QUIET)
+set(INVALID_PYTHON3 ${INVALID_PYTHON3} CACHE BOOL "Whether the version of python supports generation or not" FORCE)
+function(cnkalman_generate_code FILE)
+  get_filename_component(DIR ${CMAKE_CURRENT_SOURCE_DIR}/${FILE} DIRECTORY)
+  get_filename_component(NAME ${CMAKE_CURRENT_SOURCE_DIR}/${FILE} NAME_WE)
+
+  if(NOT INVALID_PYTHON3)
+    add_custom_command(OUTPUT ${DIR}/${NAME}.gen.h COMMAND
+            ${CMAKE_COMMAND} -E env PYTHONPATH=${cnkalman_root_source_dir} python3 ${CMAKE_CURRENT_SOURCE_DIR}/${FILE} --cnkalman-generate-source
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${FILE} ${cnkalman_root_source_dir}/cnkalman/codegen.py
+            COMMENT "Generating ${DIR}/${NAME}.gen.h...")
+  else()
+    message("Using provided ${DIR}/${NAME}.gen.h")
+  endif()
+endfunction()
+
+if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
+if(ENABLE_TESTS)
+  add_subdirectory(tests)
+endif()
+endif()
+
+
+install(DIRECTORY include/cnkalman DESTINATION include)
+include(GNUInstallDirs)
+configure_file(cnkalman.pc.in cnkalman.pc @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cnkalman.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+
diff --git a/libs/cnkalman/LICENSE b/libs/cnkalman/LICENSE
new file mode 100644
index 00000000..356654ec
--- /dev/null
+++ b/libs/cnkalman/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016-2018 <>< Charles Lohr, J. David Berger, Michael W. Turvey
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/libs/cnkalman/README.md b/libs/cnkalman/README.md
new file mode 100644
index 00000000..976bac61
--- /dev/null
+++ b/libs/cnkalman/README.md
@@ -0,0 +1,119 @@
+# CNKalman [![Build and Test](https://github.com/cntools/cnkalman/actions/workflows/cmake.yml/badge.svg)](https://github.com/cntools/cnkalman/actions/workflows/cmake.yml)
+
+This is a relatively low level implementation of a kalman filter; with support for extended and iterative extended
+kalman filters. The goals of the project are to provide a numerically stable, robust EKF implementation which is both
+fast and portable. 
+
+The main logic is written in C and only needs the associated matrix library to work; and there are C++ wrappers provided 
+for convenience.
+
+## Tutorial on Kalman Filter Theory
+
+There was originally going to be a more in depth discussion of the theoretical side but there isn't much one could do to 
+improve on the very in depth [tutorial by Roger Labbe](https://github.com/rlabbe/Kalman-and-Bayesian-Filters-in-Python/).
+
+## Features
+
+- Support for [extended kalman filter](https://en.wikipedia.org/wiki/Extended_Kalman_filter), [linear kalman filters](https://en.wikipedia.org/wiki/Kalman_filter), and [Iterate Extended Kalman Filter](https://en.wikipedia.org/wiki/Extended_Kalman_filter#Iterated_extended_Kalman_filter) ([paper](https://www.diva-portal.org/smash/get/diva2:844060/FULLTEXT01.pdf))
+- Support for [adaptive measurement covariance](https://arxiv.org/pdf/1702.00884.pdf)
+- Support for [error-state kalman filter](http://www.iri.upc.edu/people/jsola/JoanSola/objectes/notes/kinematics.pdf)  
+- Built-in support for numerical-based jacobians, and an option to debug user provided jacobians by using 
+  the numerical results
+- Minimal heap allocations  
+- Supports multiple measurement models per filter, which can be integrated at varying frequencies
+- C++ bindings for objected oriented applications
+- Automatic code generation for analytical jacobians
+
+## Quick start
+
+[kalman_example.c](https://github.com/cntools/cnkalman/blob/develop/tests/kalman_example.c)
+```C
+#include <cnkalman/kalman.h>
+#include <stdio.h>
+
+static inline void kalman_transition_model_fn(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0,
+struct CnMat *x1, struct CnMat *F) {
+    // Logic to fill in the next state x1 and the associated transition matrix F
+}
+
+static inline void kalman_process_noise_fn(void *user, FLT dt, const struct CnMat *x, struct CnMat *Q) {
+    // Logic to fill in the process covariance Q
+}
+
+static inline bool kalman_measurement_model_fn(void *user, const struct CnMat *Z, const struct CnMat *x_t,
+struct CnMat *y, struct CnMat *H_k) {
+    // Logic to fill in the residuals `y`, and the jacobian of the predicted measurement function `h`
+    return false; // This should return true if the jacobian and evaluation were valid.
+}
+
+int main() {
+    int state_cnt = 1;
+    cnkalman_state_t kalman_state = { 0 };
+    cnkalman_state_init(&kalman_state, state_cnt, kalman_transition_model_fn, kalman_process_noise_fn, 0, 0);
+    // Uncomment the next line if you want to use numerical jacobians for the transition matrix
+    //kalman_state.transition_jacobian_mode = cnkalman_jacobian_mode_two_sided; 
+    
+    cnkalman_meas_model_t kalman_meas_model = { 0 };
+    cnkalman_meas_model_init(&kalman_state, "Example Measurement", &kalman_meas_model, kalman_measurement_model_fn);
+    // Uncomment the next line if you want to use numerical jacobians for this measurement
+    // kalman_meas_model.meas_jacobian_mode = cnkalman_jacobian_mode_two_sided; 
+    
+    CnMat Z, R;
+    // Logic to fill in measurement matrix Z and measurement covariance matrix R
+    cnkalman_meas_model_predict_update(1, &kalman_meas_model, 0, &Z, &R);
+    
+    printf("Output:%f\n", cn_as_vector(&kalman_state.state)[0]);
+    return 0;
+}
+```
+
+# Code Generation
+
+One of the more difficult things about extended kalman filters is the fact that calculating the jacobian for even a simple
+measurement or prediction function can be tedious and error prone. An optional portion of cnkalman is easy integration 
+of symengine in such a way that you can write the objective function in python and it'll generate the C implementation 
+of both the function itself as well as it's jacobian with each of it's inputs. 
+
+[BikeLandmarks.py](https://github.com/cntools/cnkalman/blob/develop/tests/models/BikeLandmarks.py)
+```python
+from symengine import atan2, asin, cos, sin, tan, sqrt
+import cnkalman.codegen as cg
+
+@cg.generate_code(state = 3, u = 2)
+def predict_function(dt, wheelbase, state, u):
+    x, y, theta = state
+    v, alpha = u
+    d = v * dt
+    R = wheelbase/tan(alpha)
+    beta = d / wheelbase * tan(alpha)
+
+    return [x + -R * sin(theta) + R * sin(theta + beta),
+            y + R * cos(theta) - R * cos(theta + beta),
+            theta + beta]
+
+@cg.generate_code(state = 3, landmark = 2)
+def meas_function(state, landmark):
+    x, y, theta = state
+    px, py = landmark
+
+    hyp = (px-x)**2 + (py-y)**2
+    dist = sqrt(hyp)
+
+    return [dist, atan2(py - y, px - x) - theta]
+```
+
+There are limitations in what type of logic is permissible here -- it must be something that is analytically tractable --
+but most objective functions themselves are not too complicated to write. 
+
+Notice that for array-type input like `state` and `u` above, you must specify a size hint.
+
+When ran, this python script generates the companion `BikeLandmarks.gen.h` which has the generated code, and callouts
+such as:
+```c
+static inline void gen_predict_function(CnMat* out, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u);
+static inline void gen_predict_function_jac_state(CnMat* Hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u);
+```
+
+If you include this project in as a cmake project, a cmake function `cnkalman_generate_code` is available that makes this
+an optional part of your build process. 
+
diff --git a/libs/cnkalman/cnkalman.pc.in b/libs/cnkalman/cnkalman.pc.in
new file mode 100644
index 00000000..95c1b9fd
--- /dev/null
+++ b/libs/cnkalman/cnkalman.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/include/cnkalman
+
+Name: @CMAKE_PROJECT_NAME@
+Description: A C and C++ extended kalman filter library
+Version: 0
+Libs: -L${libdir} -lcnkalman @EXTRA_LIBS@
+Cflags: -I${includedir} -I${includedir}/redist
diff --git a/libs/cnkalman/cnkalman/__init__.py b/libs/cnkalman/cnkalman/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/libs/cnkalman/cnkalman/codegen.py b/libs/cnkalman/cnkalman/codegen.py
new file mode 100644
index 00000000..9054c56b
--- /dev/null
+++ b/libs/cnkalman/cnkalman/codegen.py
@@ -0,0 +1,1144 @@
+import dataclasses
+import logging
+import math
+import sys
+import types
+from collections.abc import Iterable
+
+import symengine.lib.symengine_wrapper
+import sympy
+import sys
+
+from symengine import cse
+#from symengine import atan2, sqrt, cos, sin, Matrix, Pow, Mul, asin, Symbol, symbols, Abs, tan, acos
+from symengine import Matrix, Mul, Symbol, symbols
+
+import logging
+import os
+
+use_symbolic_eval = False
+dirty_python_hack = False
+
+function_expose_list = [(symengine.atan2, math.atan2),
+                        (symengine.sqrt, math.sqrt),
+                        (symengine.cos, math.cos),
+                        (symengine.sin, math.sin),
+                        (symengine.Pow, math.pow),
+                        (symengine.asin, math.asin),
+                        (symengine.Abs, abs),
+                        (symengine.tan, math.tan),
+                        (symengine.acos, math.acos),
+                        ]
+
+for sym_f, f in function_expose_list:
+    current_module = sys.modules[__name__]
+
+    def capture(sym_f, f):
+        def eval_f(*args, **kwargs):
+            global use_symbolic_eval
+            return sym_f(*args, **kwargs) if use_symbolic_eval else f(*args, **kwargs)
+        return eval_f
+    setattr(current_module, f.__name__, capture(sym_f, f))
+
+def isinstance_namedtuple(obj) -> bool:
+    return (
+            isinstance(obj, tuple) and
+            hasattr(obj, '_asdict') and
+            hasattr(obj, '_fields')
+    )
+
+def flatten_args(bla, prefix=''):
+    output = []
+    if hasattr(bla, "__iter__"):
+        for i, item in enumerate(bla):
+            output += flatten_args(item, prefix=prefix+'['+str(i)+']') if hasattr(item, "__iter__") or isinstance_namedtuple(item) else [(prefix+"["+str(i)+"]", item)]
+    elif hasattr(bla, "__dict__"):
+        for k,vall in bla.__dict__.items():
+            if hasattr(vall, "__iter__") or isinstance_namedtuple(vall):
+                for ik, iv in flatten_args(vall, prefix=prefix+'.'+k):
+                    output.append((ik, iv))
+            else:
+                output.append((prefix + "." + k, vall))
+
+    return output
+
+
+def make_sympy(expressions):
+    flatten = []
+
+    if type(expressions) == symengine.MutableDenseMatrix:
+        return expressions
+    if type(expressions) == sympy.Matrix:
+        return expressions
+
+    if hasattr(expressions, "atoms"):
+        return [expressions]
+
+    if isinstance(expressions, list) or isinstance(expressions, np.ndarray):
+        return symengine.MutableDenseMatrix([make_sympy(a) for a in expressions])
+    if hasattr(expressions, 'symengine_type'):
+        return expressions.symengine_type()
+
+    if not hasattr(expressions, "_sympy_"):
+        if isinstance(expressions, Iterable):
+            for col in expressions:
+                if hasattr(col, '_sympy_') or hasattr(col, 'free_symbols'):
+                    flatten.append(col)
+                else:
+                    for cell in col:
+                        flatten.append(cell)
+        else:
+            return [expressions]
+    else:
+        flatten.append(expressions)
+    return flatten
+
+def expand_pow(x):
+    pass
+
+def number(x):
+    if x.is_Number:
+        return float(x)
+    return None
+
+def clean_parens(txt):
+    if txt[0] == '(' and txt[-1] == ')':
+        cnt = 0
+        for c in txt[1:-1]:
+            if c == '(':
+                cnt += 1
+            if c == ')':
+                cnt -= 1
+            if cnt < 0:
+                return txt
+        return clean_parens(txt[1:-1])
+    return txt
+
+class c_formatter:
+    @staticmethod
+    def format_ternary(test, true_path, false_path, note = ""):
+        return "(%s ? %s : %s)%s" % (test, true_path, false_path, f" /* {note} */" if note else "")
+
+class pyx_formatter:
+    @staticmethod
+    def format_ternary(test, true_path, false_path, note = ""):
+        return f"({true_path} if {test} else {false_path})"
+
+def code_wrapper(formatter, item, depth = 0):
+    #return symengine.ccode(item)
+
+    if item.is_Atom:
+        if item == True:
+            return "true"
+        if item == False:
+            return "false"
+        return symengine.ccode(item)
+
+    newargs = list(map(lambda x: code_wrapper(formatter, x, depth+1), item.args))
+
+    infixes = {
+        Mul: '*',
+        symengine.Add: '+',
+        symengine.GreaterThan: '>=',
+        symengine.StrictGreaterThan: '>',
+        symengine.LessThan: '<=',
+        symengine.StrictLessThan: '<'
+    }
+
+    if item.__class__ == symengine.Pow:
+        # Basically it's always faster to never call pow
+        if item.args[1].is_Number and abs(item.args[1]) < 20:
+            invert = item.args[1] < 0
+            num, den = abs(item.args[1]).get_num_den()
+
+            if den == 1 or den == 2:
+                mul_cnt = num if den == 1 else (num - 1) / 2
+                muls = [newargs[0]] * int(mul_cnt)
+                if den == 2:
+                    muls.append("sqrt(" + clean_parens(newargs[0]) + ")")
+                v = " * ".join(muls)
+                if len(muls) > 1:
+                    v = "(" + v + ")"
+                if invert:
+                    v = "(1. / " + v + ")"
+                return v
+        return "pow(%s, %s)" % tuple(newargs)
+    elif item.__class__ is symengine.Abs:
+        return f"fabs({newargs[0]})"
+    elif item.__class__ in infixes:
+        return "(" + (" " + infixes[item.__class__] + " ").join(newargs) + ")"
+    elif item.__class__ == symengine.Piecewise:
+        if item.args[1] == True:
+            return newargs[0]
+        if item.args[1] == False:
+            return newargs[2]
+        return formatter.format_ternary(newargs[1], newargs[0], newargs[2])
+    elif isinstance(item, symengine.Derivative):
+        if item.args[0] == symengine.Abs(item.args[1]):
+            return formatter.format_ternary(item.args[1] > 0, 1, -1, note="Note: Maybe not valid for == 0")
+            #return "((%s) > 0 ? 1 : -1) /* Note: Maybe not valid for == 0 */" % (item.args[1])
+
+    known_c_funcs = [ 'asin', 'cos', 'sin', 'atan2', 'tan', 'atan', 'acos']
+    if item.__class__.__name__ not in known_c_funcs:
+        print("Warning: Unhandled type " + item.__class__.__name__, file=sys.stderr)
+    return item.__class__.__name__ + "(" + ", ".join(map(clean_parens, newargs)) + ")"
+    #raise Exception("Unhandled type " + item.__class__.__name__)
+
+def ccode(item):
+    return clean_parens(code_wrapper(c_formatter, item))
+
+def pyxcode(item):
+    return clean_parens(code_wrapper(pyx_formatter, item))
+
+def sanitize_name(n):
+    if len(n) > 1 and n[0] == 'x' and str(n[1:]).isdigit():
+        return "_" + n
+    return n
+
+class WrapIterable:
+    def __init__(self):
+        pass
+
+    def apply_operator(self, op, other):
+        if isinstance(other, Iterable):
+            return np.array([op(x[0], x[1]) for x in zip(self, other)])
+        return np.array([op(x, other) for x in self])
+
+    def __add__(self, other): return self.apply_operator(lambda x, y: x + y, other)
+    def __radd__(self, other): return self.apply_operator(lambda x, y: y + x, other)
+    def __sub__(self, other): return self.apply_operator(lambda  x, y: x - y, other)
+    def __rsub__(self, other): return self.apply_operator(lambda  x, y: y - x, other)
+    def __sub__(self, other): return self.apply_operator(lambda x, y: x - y, other)
+    def __mul__(self, other): return self.apply_operator(lambda x, y: x * y, other)
+    def __div__(self, other): return self.apply_operator(lambda x, y: x / y, other)
+    def __truediv__(self, other): return self.apply_operator(lambda x, y: x / y, other)
+
+class WrapTuple(WrapIterable):
+    def __init__(self, n, t):
+        super().__init__()
+        self.n = sanitize_name(n)
+        self.t = t
+
+    def __getitem__(self, item): return self.t[item]
+    def __iter__(self): yield from self.t
+    def __str__(self): return self.n
+    def __repr__(self): return self.n
+
+class WrapBase:
+    def __init__(self, parent):
+        self._parent = parent
+    def root(self):
+        if self._parent is None:
+            return self
+        return self._parent.root()
+
+    def accessor(self, stopat = None):
+        if self._parent is None:
+            global dirty_python_hack
+            return f"(*{self._name})" if not dirty_python_hack else self._name
+        if self._parent is stopat:
+            return self._name
+        return self._parent.accessor(stopat) + "." + self._name
+    def offsetof(self):
+        r = self.root()
+        if self == r:
+            return None
+        if hasattr(r, '_type'):
+            if dirty_python_hack:
+                return 0
+            return f"offsetof({r._type.__name__}, {self.accessor(r)})/sizeof(FLT)"
+        return None
+    def symengine_type(self):
+        return Symbol(self.accessor())
+    @staticmethod
+    def as_symengine(x):
+        if hasattr(x, 'symengine_type'):
+            return x.symengine_type()
+        return x
+    def __add__(self, other): return self.symengine_type() + WrapBase.as_symengine(other)
+    def __radd__(self, other): return WrapBase.as_symengine(other) + self.symengine_type()
+    def __sub__(self, other): return symengine.Add(self.symengine_type(), -WrapBase.as_symengine(other))
+    def __rsub__(self, other): return symengine.Add(WrapBase.as_symengine(other), -self.symengine_type())
+    def __mul__(self, other): return self.symengine_type() * WrapBase.as_symengine(other)
+    def __div__(self, other): return self.symengine_type() / WrapBase.as_symengine(other)
+    def __truediv__(self, other): return self.symengine_type() / WrapBase.as_symengine(other)
+    def __lt__(self, other):
+        if self._parent == other._parent:
+            return self.id() < other.id()
+        return self._parent < other._parent
+
+    def __str__(self):
+        return self._name
+
+class WrapIndex(WrapBase):
+    def __init__(self, parent, item):
+        super().__init__(parent)
+        self._item = item
+
+    def accessor(self, stopat = None):
+        return f"{self._parent.accessor(stopat)}[{self._item}]"
+    def __str__(self):
+        return f"{self._parent.accessor()}[{self._item}]"
+    def __repr__(self): return self.__str__()
+
+    def id(self): return self._item
+
+    def __eq__(self, other):
+        return self.__hash__() == other.__hash__()
+
+    def __hash__(self):
+        return str(self).__hash__()
+
+class WrapArray(WrapIterable, WrapBase, Iterable):
+    def __init__(self, name, parent, default):
+        WrapIterable.__init__(self)
+        WrapBase.__init__(self, parent)
+        self._default = None
+        self._length = -1
+        self._name = sanitize_name(name)
+        self._array = list()
+        if default is not dataclasses.MISSING and default is not None:
+            self._default = default
+            self._length = len(default)
+            [self[x] for x in range(self._length)]
+
+    def accessor(self, stopat = None):
+        if self._parent is None:
+            return f"{self._name}"
+        return WrapBase.accessor(self, stopat)
+
+    def ensure_size(self, idx):
+        while len(self._array) <= idx:
+            self._array.append(None)
+
+    def __getitem__(self, item):
+        if isinstance(item, slice):
+            return self._array[item]
+
+        name = f"{self.accessor()}[{item}]"
+        rtn = Symbol(name)
+        self.ensure_size(item)
+        if self._array[item] == None:
+            self._array[item] = WrapIndex(self, item)
+        return rtn
+    def id(self): return self._name
+
+    def __iter__(self):
+        yield from self._array
+    def __len__(self):
+        if self._length == -1:
+            raise Exception(f"Need length annotation for {self._name}")
+        return self._length
+
+class WrapMember(WrapBase):
+    def __init__(self, name, type, parent):
+        super().__init__(parent)
+        self._name = sanitize_name(name)
+        self._type = type
+    def _sympy_(self):
+        return self.symengine_type()
+    def __str__(self):
+        return self._name
+    def id(self): return self._name
+    def accessor(self, stopat = None):
+        if self._parent is None:
+            return f"{self._name}"
+        if self._parent is stopat:
+            return self._name
+        return self._parent.accessor(stopat) + "." + self._name
+    def __repr__(self): return self.__str__()
+
+class WrapObject(WrapBase):
+    def __init__(self, name, type, parent):
+        super().__init__(parent)
+        self._name = sanitize_name(name)
+        self._type = type
+        for k,f in type.__dataclass_fields__.items():
+            setattr(self, k, get_argument(k, None, f.type, parent=self, default=f.default))
+
+    def id(self): return self._name
+
+    def __lt__(self, other):
+        if self._parent == other._parent:
+            return self._name < other._name
+        if self._parent is None:
+            return True
+        if other._parent is None:
+            return False
+        return self._parent < other._parent
+
+    def __str__(self):
+        return self._name
+
+def parse_type(n, type, parent, default):
+    if type is list or type is np.array:
+        return WrapArray(n, parent, default)
+    if hasattr(type, '__dataclass_fields__'):
+        return WrapObject(n, type, parent)
+    return WrapMember(n, type, parent)
+
+def get_argument(n, argument_specs, annotation, parent=None, default = None):
+    if argument_specs is not None and n in argument_specs:
+        a = argument_specs[n]
+        digits = math.floor(math.log(a))
+        if isinstance(a, tuple):
+            return WrapTuple(n, a)
+        if isinstance(a, int):
+            return WrapTuple(n, [ symbols(f"{sanitize_name(n)}{str(i).zfill(digits)}") for i in range(a)])
+        return a
+    if annotation is not None:
+        return parse_type(n, annotation, parent, default)
+    if n in globals():
+        return globals()[n]
+    return symbols(n)
+
+def SymbolizeType(type):
+    return parse_type(type.__name__, type, None, None)
+
+def get_name(a):
+    if type(a) == list:
+        return "_".join(map(get_name, a))
+    if hasattr(a, '__name__'):
+        return a.__name__
+    return str(a)
+
+import inspect
+
+
+def flatten_func(func, name=None, args=None, suffix = None, argument_specs ={}):
+    if callable(func):
+        name = func.__qualname__.replace(".", "_")
+        annotations = inspect.getfullargspec(func).annotations
+        args = [get_argument(n, argument_specs, annotations.get(n)) for n in inspect.getfullargspec(func).args]
+
+    if suffix is not None:
+        name = name + "_" + suffix
+
+    if isinstance(func, types.FunctionType):
+        #try:
+        func = func(*map_arg(args))
+        #except Exception as e:
+        #    sys.stderr.write(f"Error evaluating {name}. Likely a variable needs a length annotation: {e}\n")
+        #    traceback.print_exception(*sys.exc_info(), file=sys.stderr)
+        #    return None, None
+
+    if hasattr(func, '__dataclass_fields__'):
+        return dataclass2dictionary(func), args
+
+    return make_sympy(func), args
+
+def dataclass2dictionary(func):
+    dict = {}
+    def process(prefix, obj, root = None):
+        if hasattr(obj, '__dataclass_fields__'):
+            for k,f in obj.__dataclass_fields__.items():
+                process(prefix + "." + f.name, getattr(obj, f.name), obj if root is None else root)
+        else:
+            if isinstance(obj, Iterable):
+                for idx, item in enumerate(obj):
+                    dict[(root.__class__.__name__, f"{prefix.strip('.')}[{idx}]")] = item
+            elif isinstance(obj, (symengine.MutableDenseMatrix, sympy.MutableDenseMatrix, np.ndarray)):
+                for idx, item in enumerate(list(obj)):
+                    dict[(root.__class__.__name__, f"{prefix.strip('.')}[{idx}]")] = item
+            else:
+                dict[(root.__class__.__name__, prefix.strip('.'))] = obj
+    process('', func)
+    dict['$original'] = func
+    return dict
+
+def get_type(a):
+    if callable(a):
+        return get_type(a())
+    if hasattr(a, "__iter__"):
+        ty = get_type(a[0])
+        if ty[-1] != "*":
+            ty += "*"
+        return ty
+    if isinstance(a, WrapObject):
+        return a._type.__name__ + "*"
+    if isinstance_namedtuple(a):
+        return a.__class__.__name__ + "*"
+    return "FLT"
+
+def arg_str(arg):
+    a = arg[1]
+    return "const %s %s" % (get_type(a), get_name(a))
+
+def arg_str_py(arg):
+    a = arg[1]
+    t = get_type(a)
+    t = t.replace("FLT*", "floating[:]")
+    t = t.replace("FLT", "floating")
+    return "const %s %s" % (t, get_name(a))
+
+def generate_args_string(args, as_call = False):
+    return ", ".join(map(lambda x: get_name(x[1]) if as_call else arg_str, enumerate(args)))
+
+def generate_pyxcode(func, name=None, args=None, suffix = None, argument_specs ={}, outputs = None, preamble = "", file=None, input_keys = None, prefix = ""):    
+    def emit_code(*args, **kwargs):
+        if file is not None:
+            print(*args, **kwargs, file=file[0])
+            file[0].flush()
+
+    def emit_header(*args, **kwargs):
+        if file is not None:
+            print(*args, **kwargs, file=file[1])
+            file[1].flush()
+    print(f"\tGenerating {name}...", file=sys.stderr)
+    flatten, args = flatten_func(func, name, args, suffix, argument_specs)
+    if flatten is None:
+        return None
+
+    if outputs is None:
+        if hasattr(flatten, "shape"):
+            outputs = [("out", flatten.shape)]
+        else:
+            outputs = [("out", -1)]
+
+    if callable(func):
+        name = func.__qualname__.replace(".", "_")
+        annotations = inspect.getfullargspec(func).annotations
+        args = [get_argument(n, argument_specs, annotations.get(n)) for n in inspect.getfullargspec(func).args]
+
+    if suffix is not None:
+        name = name + "_" + suffix
+
+    keys = None
+    free_symbols = set()
+    def update_free_symbols(v):
+        if hasattr(v, 'free_symbols'):
+            free_symbols.update({k.__str__() for k in v.free_symbols})
+            return
+
+        if isinstance(v, Iterable):
+            for v1 in v:
+                update_free_symbols(v1)
+
+    type_name = "floating[:,:]"
+    type = None
+    if isinstance(flatten, dict):
+        type_name = flatten["$original"].__class__.__name__
+        type = flatten["$original"].__class__
+        flatten.pop("$original")
+        keys = list(flatten.keys())
+        values = [flatten[k] for k in keys]
+        keys = [k[1] for k in keys]
+        values = [ a.symengine_type() if hasattr(a, 'symengine_type') else a for a in values ]
+        print(f"\tCSE for {len(values)} dict values", file=sys.stderr)
+        cse_output = cse(symengine.Matrix(values))
+        update_free_symbols(values)
+
+    else:
+        print(f"\tCSE for {len(flatten)} values", file=sys.stderr)
+        cse_output = cse(symengine.Matrix(flatten))
+        update_free_symbols(flatten)
+
+    print(f"\t{len(free_symbols)} free symbols...", file=sys.stderr)
+    func_name = name
+    emit_header("cdef void %s%s_nogil(%s, %s) nogil " % (prefix,
+                                                  name,
+                                                  ", ".join([f"{type_name} " + s[0] for s in outputs]),
+                                                  ", ".join(map(arg_str_py, enumerate(args)))
+                                                  ))
+
+    emit_code("cdef void %s%s_nogil(%s, %s) nogil: " % (prefix,
+                                        name,
+                                        ", ".join([f"{type_name} " + s[0] for s in outputs]),
+                                        ", ".join(map(arg_str_py, enumerate(args)))
+                                        ))
+
+    if preamble:
+        emit_code(preamble.strip("\r\n"))
+
+    # Unroll struct types
+    for idx, a in enumerate(args):
+        if callable(a):
+            name = get_name(a)
+            for k, v in flatten_args(a()):
+                if f"{name}{k.strip('[]')}" in free_symbols:
+                    emit_code("\tcdef floating %s = %s%s" % (str(v), "("+name+")" if isinstance_namedtuple(a()) else name, k))
+        elif isinstance(a, WrapTuple):
+            name = get_name(a)
+            digits = math.floor(math.log(len(a.t)))
+            for k, v in flatten_args(a.t):
+                idx = k.strip('[]')
+                if f"{name}{str(idx).zfill(digits)}" in free_symbols:
+                    emit_code("\tcdef floating %s = %s%s" % (str(v), name, k))
+
+    for item in cse_output[0]:
+        stripped_line = pyxcode(item[1]).replace("\n", " ").replace("\t", " ")
+        emit_code(f"\tcdef floating {symengine.ccode(item[0])} = {stripped_line}")
+
+    output_idx = 0
+    outputs_idx = 0
+
+    count_zeros = 0
+    for item_idx, item in enumerate(cse_output[1]):
+        if item == 0:
+            count_zeros += 1
+    needs_set_zero = False#count_zeros > len(cse_output[1]) / 4
+
+    current_shape = None
+    if keys is None:
+        current_shape = outputs[outputs_idx][1] if isinstance(outputs[outputs_idx][1], tuple) else [outputs[outputs_idx][1], 1]
+        var = outputs[outputs_idx][0]
+        emit_code(
+            f"\t### cdef np.ndarray[float, ndim=2] {outputs[outputs_idx][0]} = np.zeros(({current_shape[0]},{current_shape[1]}), dtype=np.float32)")
+    else:
+        pass
+    
+    for item_idx, item in enumerate(cse_output[1]):
+        if keys is None:
+            current_shape = outputs[outputs_idx][1] if isinstance(outputs[outputs_idx][1], tuple) else [outputs[outputs_idx][1], 1]
+            current_row = output_idx // current_shape[1]
+            current_col = output_idx % current_shape[1]
+
+            def get_col_str():
+                if len(outputs[outputs_idx]) > 2 and hasattr(outputs[outputs_idx][2][current_col], 'offsetof'):
+                    offset_of = outputs[outputs_idx][2][current_col].offsetof()
+                    if offset_of is not None:
+                        return offset_of
+                return str(current_col)
+            def get_row_str():
+                if input_keys is not None:
+                    root, path = input_keys[current_row]
+                    return 0
+                    return f"offsetof({root}, {path})/sizeof(FLT)"
+                return str(current_row)
+            if hasattr(item, "tolist"):
+                for item1 in sum(item.tolist(), []):
+                    emit_code("\t%s[%s,%s] = %s" % (outputs[outputs_idx][0], get_row_str(), get_col_str(), output_idx, pyxcode(item1).replace("\n", " ").replace("\t", " ")))
+                    output_idx += 1
+                    current_row = output_idx / current_shape[1]
+                    current_col = output_idx % current_shape[1]
+            else:
+                if item != 0 or not needs_set_zero:
+                    emit_code("\t%s[%s,%s] = %s" % (outputs[outputs_idx][0], get_row_str(), get_col_str(), pyxcode(item).replace("\n", " ").replace("\t", " ")))
+                output_idx += 1
+            if output_idx >= math.prod(current_shape) > 0:
+                #emit_code(f"\treturn {outputs[outputs_idx][0]}")
+                outputs_idx += 1
+                output_idx = 0
+        else:
+            nl = "\n"
+            emit_code(f"\tout.{keys[item_idx]}={pyxcode(item).replace(nl, '')}")
+
+    emit_code("")
+
+    release_gil = len(cse_output[1]) > 200
+    emit_code("cpdef void %s%s(%s, %s): " % (prefix,
+                                                  func_name,
+                                             ", ".join([f"{type_name} " + s[0] for s in outputs]),
+                                                  ", ".join(map(arg_str_py, enumerate(args)))
+                                                  ))
+    call_args = ",".join([s[0] for s in outputs]) + ", " + generate_args_string(args, as_call=True)
+    emit_code(f"\t{prefix}{func_name}_nogil({call_args})")
+    emit_code("")
+
+    if current_shape:
+        emit_header("cpdef np.ndarray %s%s_allocate(%s) " % (prefix,
+                                                  func_name,
+                                                  ", ".join(map(arg_str_py, enumerate(args)))
+                                                  ))
+
+        emit_code("cpdef np.ndarray %s%s_allocate(%s): " % (prefix,
+                                                 func_name,
+                                                 ", ".join(map(arg_str_py, enumerate(args)))
+                                                 ))
+        emit_code(f"\tcdef np.ndarray[double, ndim=2] {outputs[0][0]} = np.zeros(({abs(current_shape[0])},{current_shape[1]}), dtype=np.float64)")
+        emit_code(f"\tcdef floating[:,:] _{outputs[0][0]} = {outputs[0][0]}")
+        call_args = ",".join([f"_{s[0]}" for s in outputs]) + ", " + generate_args_string(args, as_call=True)
+        emit_code(f"\t{'with nogil: ' if release_gil else ''}{prefix}{func_name}_nogil({call_args})")
+        if current_shape[1] == 1:
+            emit_code(f"\treturn {', '.join([s[0] for s in outputs])}.reshape(-1)")
+        else:
+            emit_code(f"\treturn {', '.join([s[0] for s in outputs])}")
+
+        for vectorize_over in argument_specs.get('_vectorize', []):
+            def get_vec_arg(a):
+                t = get_type(a)
+                name = get_name(a)
+                if name == vectorize_over:
+                    t += "*"
+                t = t.replace("FLT**", "floating[:,:]")
+                t = t.replace("FLT*", "floating[:]")
+                t = t.replace("FLT", "floating")
+                return "const %s %s" % (t, name)
+
+            output_shape = f"{vectorize_over}.shape[0]", abs(current_shape[0]), current_shape[1]
+            if output_shape[-1] == 1:
+                output_shape = output_shape[:-1]
+
+            dtype = "np.float32 if floating is float else np.float64"
+            vec_args = ', '.join(map(get_vec_arg, args))
+            call_args = ",".join([f"_{s[0]}[idx,:,:]" for s in outputs] + [f"{get_name(a)}[idx]" if vectorize_over == get_name(a) else get_name(a) for a in args])
+            emit_header(f"cpdef np.ndarray {prefix}{func_name}_vectorize_{vectorize_over}({vec_args}) ")
+            emit_code(f"""
+cpdef np.ndarray {prefix}{func_name}_vectorize_{vectorize_over}({vec_args}):            
+    cdef np.ndarray[floating, ndim=3] {outputs[0][0]} = np.zeros(({vectorize_over}.shape[0],{abs(current_shape[0])},{current_shape[1]}), dtype={dtype})
+    cdef floating[:,:,:] _{outputs[0][0]} = {outputs[0][0]}
+    cdef int idx
+    with nogil:
+        for idx in range({vectorize_over}.shape[0]):
+            {prefix}{func_name}_nogil({call_args})
+    return {outputs[0][0]}.reshape({', '.join(map(str,output_shape))})
+""".replace("    ", "\t"))
+
+    emit_header("cpdef void %s%s(%s, %s)" % (prefix,
+                                             func_name,
+                                             ", ".join([f"{type_name} " + s[0] for s in outputs]),
+                                             ", ".join(map(arg_str_py, enumerate(args)))
+                                             ))
+    emit_code("")
+
+    return flatten
+
+def generate_ccode(func, name=None, args=None, suffix = None, argument_specs ={}, outputs = None, preamble = "", file=None, input_keys = None, prefix = ""):
+    def emit_code(*args, **kwargs):
+        if file is not None:
+            print(*args, **kwargs, file=file)
+
+    flatten, args = flatten_func(func, name, args, suffix, argument_specs)
+    if flatten is None:
+        return None
+
+    if outputs is None:
+        if hasattr(flatten, "shape"):
+            outputs = [("out", flatten.shape)]
+        else:
+            outputs = [("out", -1)]
+
+    if callable(func):
+        name = func.__qualname__.replace(".", "_")
+        annotations = inspect.getfullargspec(func).annotations
+        args = [get_argument(n, argument_specs, annotations.get(n)) for n in inspect.getfullargspec(func).args]
+
+    if suffix is not None:
+        name = name + "_" + suffix
+
+    singular_return = len(flatten) == 1
+
+    keys = None
+    free_symbols = set()
+    def update_free_symbols(v):
+        if hasattr(v, 'free_symbols'):
+            free_symbols.update({k.__str__() for k in v.free_symbols})
+            return
+
+        if isinstance(v, Iterable):
+            for v1 in v:
+                update_free_symbols(v1)
+
+    type = "CnMat"
+    if isinstance(flatten, dict):
+        type = flatten["$original"].__class__.__name__
+        flatten.pop("$original")
+        keys = list(flatten.keys())
+        values = [flatten[k] for k in keys]
+        keys = [k[1] for k in keys]
+        values = [ a.symengine_type() if hasattr(a, 'symengine_type') else a for a in values ]
+        cse_output = cse(symengine.Matrix(values))
+        update_free_symbols(values)
+    else:
+        cse_output = cse(symengine.Matrix(flatten))
+        update_free_symbols(flatten)
+
+    if singular_return:
+        emit_code("static inline FLT %s%s(%s) {" % (prefix, name, ", ".join(map(arg_str, enumerate(args)))))
+    else:
+        emit_code("static inline void %s%s(%s, %s) {" % (prefix, name, ", ".join([type + "* " + s[0] for s in outputs]), ", ".join(map(arg_str, enumerate(args)))))
+
+    if preamble:
+        emit_code(preamble.strip("\r\n"))
+
+    # Unroll struct types
+    for idx, a in enumerate(args):
+        if callable(a):
+            name = get_name(a)
+            for k, v in flatten_args(a()):
+                if f"{name}{k.strip('[]')}" in free_symbols:
+                    emit_code("\tconst FLT %s = %s%s;" % (str(v), "(*"+name+")" if isinstance_namedtuple(a()) else name, k))
+        elif isinstance(a, WrapTuple):
+            name = get_name(a)
+            for k, v in flatten_args(a.t):
+                if f"{name}{k.strip('[]')}" in free_symbols:
+                    emit_code("\tconst FLT %s = %s%s;" % (str(v), name, k))
+
+    for item in cse_output[0]:
+        stripped_line = ccode(item[1]).replace("\n", " ").replace("\t", " ")
+        emit_code(f"\tconst FLT {symengine.ccode(item[0])} = {stripped_line};")
+
+    output_idx = 0
+    outputs_idx = 0
+
+    count_zeros = 0
+    for item_idx, item in enumerate(cse_output[1]):
+        if item == 0:
+            count_zeros += 1
+    needs_set_zero = count_zeros > len(cse_output[1]) / 4
+
+    if keys is None and not singular_return:
+        current_shape = outputs[outputs_idx][1] if isinstance(outputs[outputs_idx][1], tuple) else [outputs[outputs_idx][1], 1]
+        var = outputs[outputs_idx][0]
+        if needs_set_zero:
+            emit_code(f"\tcnSetZero({var});")
+    for item_idx, item in enumerate(cse_output[1]):
+        if keys is None:
+            current_shape = outputs[outputs_idx][1] if isinstance(outputs[outputs_idx][1], tuple) else [outputs[outputs_idx][1], 1]
+            current_row = output_idx // current_shape[1]
+            current_col = output_idx % current_shape[1]
+
+            def get_col_str():
+                if len(outputs[outputs_idx]) > 2 and hasattr(outputs[outputs_idx][2][current_col], 'offsetof'):
+                    offset_of = outputs[outputs_idx][2][current_col].offsetof()
+                    if offset_of is not None:
+                        return offset_of
+                return str(current_col)
+            def get_row_str():
+                if input_keys is not None:
+                    root, path = input_keys[current_row]
+                    return f"offsetof({root}, {path})/sizeof(FLT)"
+                return str(current_row)
+            if hasattr(item, "tolist"):
+                for item1 in sum(item.tolist(), []):
+                    emit_code("\tcnMatrixOptionalSet(%s, %s, %s, %s);" % (outputs[outputs_idx][0], get_row_str(), get_col_str(), output_idx, ccode(item1).replace("\n", " ").replace("\t", " ")))
+                    output_idx += 1
+                    current_row = output_idx / current_shape[1]
+                    current_col = output_idx % current_shape[1]
+            else:
+                if singular_return:
+                    emit_code("\treturn %s;" % (ccode(item).replace("\n", " ").replace("\t", " ")))
+                else:
+                    if item != 0 or not needs_set_zero:
+                        emit_code("\tcnMatrixOptionalSet(%s, %s, %s, %s);" % (outputs[outputs_idx][0], get_row_str(), get_col_str(), ccode(item).replace("\n", " ").replace("\t", " ")))
+                output_idx += 1
+            if output_idx >= math.prod(current_shape) > 0:
+                outputs_idx += 1
+                output_idx = 0
+        else:
+            nl = "\n"
+            emit_code(f"\tout->{keys[item_idx]}={ccode(item).replace(nl, '')};")
+
+    emit_code("}")
+    emit_code("")
+    return flatten
+
+
+def jacobian(v, of):
+    of = [ a.symengine_type() if hasattr(a, 'symengine_type') else a for a in of]
+    if hasattr(v, 'jacobian'):
+        return v.jacobian(symengine.Matrix(of))
+    if isinstance(v, np.ndarray):
+        v = list(v)
+    return Matrix([v]).jacobian(symengine.Matrix(of))
+
+def map_arg(arg):
+    if callable(arg):
+        return map_arg(arg())
+    elif isinstance(arg, list):
+        return list(map(map_arg, arg))
+    elif isinstance(arg, tuple):
+        return tuple(map(map_arg, arg))
+    elif isinstance(arg, WrapTuple):
+        return np.array([*arg])
+    return arg
+
+def flat_values(a):
+    if isinstance(a, str):
+        return []
+    if isinstance(a, WrapArray):
+        return [b for b in a]
+    if isinstance(a, WrapMember):
+        return [a]
+    if isinstance(a, Iterable):
+        return sum([flat_values(it) for it in a], [])
+    if hasattr(a, '__dict__'):
+        return flat_values([v for k,v in a.__dict__.items() if not k.startswith("_")])
+    return [a]
+
+
+def generate_jacobians(func, suffix=None,transpose=False,jac_all=False, jac_over=None, argument_specs={}, file=None, prefix="", codegen=generate_ccode):
+    def emit_code(*args, **kwargs):
+        if file is not None:
+            print(*args, **kwargs, file=file)
+
+    rtn = {}
+
+    fx, func_args = flatten_func(func, argument_specs=argument_specs)
+
+    #annotations = inspect.getfullargspec(func).annotations
+    #func_args = [get_argument(n, argument_specs, annotations.get(n)) for n in inspect.getfullargspec(func).args]
+    jac_of = {}
+    if jac_over is not None:
+        jac_of[get_name(jac_over)] = flat_values(map_arg(jac_over))
+    else:
+        jac_args = {get_name(arg): sorted(flat_values(map_arg(arg)), key=str) for arg in func_args}
+        jac_of.update(jac_args)
+
+    if jac_all:
+        jac_of['all'] = sum(list(jac_of.values()), [])
+
+    feval = (func(*map_arg(func_args)))
+
+    keys = None
+    if hasattr(feval, '__dataclass_fields__'):
+        dict = dataclass2dictionary(feval)
+        #type = dict["$original"].__class__.__name__
+        dict.pop("$original")
+        keys = list(dict.keys())
+        values = [dict[k] for k in keys]
+        feval = [ a.symengine_type() if hasattr(a, 'symengine_type') else a for a in values ]
+
+    for name, jac_value in jac_of.items():
+        fname = func.__qualname__.replace(".", "_")  + '_jac_' + name.strip('_')
+        if type(feval) == list or type(feval) == tuple or type(feval) == np.ndarray:
+            feval = make_sympy(feval)
+        this_jac = jacobian(feval, jac_value)
+        if transpose:
+            this_jac = this_jac.transpose()
+
+        jac_shape = this_jac.shape
+        jac_size = this_jac.shape[0] * this_jac.shape[1]
+        # 
+        # if jac_size == 1:
+        #     continue
+
+        #emit_code("// Jacobian of", func.__qualname__.replace(".", "_"), "wrt", jac_value)
+        codegen(this_jac, fname, func_args, argument_specs=argument_specs, suffix=suffix, outputs=[('Hx', jac_shape, jac_value)], input_keys=keys,file=file, prefix=prefix)
+
+        #jac_with_hx = this_jac.reshape(jac_size, 1).col_join(fxm.reshape(fx_size, 1))
+
+        #emit_code("// Full version Jacobian of", func.__qualname__.replace(".", "_"), "wrt", jac_value)
+
+        fn_suffix = ""
+        if suffix is not None:
+            fn_suffix = "_" + suffix
+
+        if keys is not None:
+            continue
+
+        gen_call = f"{prefix}{func.__qualname__.replace('.', '_')}{fn_suffix}(hx, {generate_args_string(func_args, True)});"
+        if len(fx) == 1:
+            gen_call = f"hx->data[0] = {prefix}{func.__qualname__.replace('.', '_')}{fn_suffix}({generate_args_string(func_args, True)});"
+
+    #     preamble = f"""
+    # if(Hx == 0) {{
+    #     {gen_call}
+    #     return;
+    # }}
+    # if(hx == 0) {{
+    #     gen_{fname}{fn_suffix}(Hx, {generate_args_string(func_args, True)});
+    #     return;
+    # }}"""
+    #     generate_ccode(jac_with_hx, fname + "_with_hx", func_args, suffix=suffix, outputs=[('Hx', jac_shape, jac_value), ('hx', this_jac.shape[0] - jac_size)], preamble=preamble, file)
+        outputs = [('Hx', jac_shape, jac_value), ('hx', this_jac.shape[0] - jac_size)]
+        if codegen == generate_ccode:
+            emit_code(f"""
+    static inline void {prefix}{fname}_with_hx({", ".join(["CnMat* " + s[0] for s in outputs])}, {", ".join(map(arg_str, enumerate(func_args)))}) {{
+        if(hx != 0) {{ 
+            {gen_call}
+        }}
+        if(Hx != 0) {{ 
+            {prefix}{fname}{fn_suffix}(Hx, {generate_args_string(func_args, True)});
+        }}
+    }}""")
+
+        rtn[name] = this_jac.reshape(*jac_shape)
+    return rtn, func_args
+
+def can_generate_jacobian(f):
+    if hasattr(f, 'shape'):
+        return f.shape[0] == 1 or f.shape[1] == 1
+    if isinstance(f, dict):
+        return True
+    if isinstance(f, Iterable):
+        return all(map(can_generate_jacobian, f))
+    return True
+
+def generate_code_and_jacobians(f,transpose=False, jac_all=False, jac_over=None, argument_specs = {}, file=None, prefix="", codegen=generate_ccode):
+    f_eval = codegen(f, argument_specs = argument_specs, file=file, prefix=prefix)
+    if can_generate_jacobian(f_eval):
+        return generate_jacobians(f, argument_specs = argument_specs, transpose=transpose, jac_all=jac_all, jac_over=jac_over, file=file, prefix=prefix, codegen=codegen)
+    return None, None
+
+from pathlib import Path
+
+codegen_enable_generate_pyx = False
+def enable_generate_pyx():
+    global codegen_enable_generate_pyx
+    codegen_enable_generate_pyx = True
+
+generate_code_files = {}
+def get_pyx_file(fn):
+    global codegen_enable_generate_pyx
+    force_generate = '--cnkalman-generate-source-pyx' in sys.argv
+    if not codegen_enable_generate_pyx and not force_generate:
+        return None
+    global dirty_python_hack
+    dirty_python_hack = True
+    if fn != sys.argv[0] and not codegen_enable_generate_pyx:
+        return None
+    if fn in generate_code_files:
+        return generate_code_files[fn]
+    path = Path(fn)
+    new_stem = f"{path.stem}_gen"
+    if path.stem == "__init__":
+        new_stem = f"gen"
+
+    full_path = f'{path.parent.as_posix()}/{new_stem}.pyx'
+    if os.path.exists(full_path):
+        #os.path.getmtime(full_path) > os.path.getmtime(__file__) and \
+        if not force_generate and \
+                os.path.getmtime(full_path) > os.path.getmtime(fn) and \
+                os.path.getmtime(f'{path.parent.as_posix()}/{new_stem}.pxd') > os.path.getmtime(fn):
+            return None
+
+    print(f"Generating {path.parent.as_posix()}/{new_stem}.pyx...", file=sys.stderr)
+    f = open(f"{path.parent.as_posix()}/{new_stem}.pyx", 'w')
+    f.write(
+        """# NOTE: This is a generated file; do not edit.
+# cython: language_level=3
+# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True, nonecheck=False, overflowcheck=False
+# clang-format off
+import cython
+import numpy as np
+cimport numpy as np
+from libc.math cimport *
+from libc.stdint cimport *
+from libc cimport *
+from cython cimport floating
+""")
+
+    g = open(f"{path.parent.as_posix()}/{new_stem}.pxd", 'w')
+    g.write(
+        """# NOTE: This is a generated file; do not edit.
+# cython: language_level=3
+import numpy as np
+cimport numpy as np
+
+from cython cimport floating
+
+""")
+    generate_code_files[fn] = (f, g)
+    return generate_code_files[fn]
+
+def get_file(fn):
+    if not '--cnkalman-generate-source' in sys.argv:
+        return get_pyx_file(fn)
+    if fn != sys.argv[0]:
+        return None
+    if fn in generate_code_files:
+        return generate_code_files[fn]
+    path = Path(fn)
+    logging.info(f"Generating {path.parent.as_posix()}/{path.stem}.gen.h...", file=sys.stderr)
+    f = generate_code_files[fn] = open(f"{path.parent.as_posix()}/{path.stem}.gen.h", 'w')
+    f.write(
+    """/// NOTE: This is a generated file; do not edit.
+#pragma once
+#include <cnkalman/generated_header.h>
+
+// clang-format off
+""")
+    return generate_code_files[fn]
+
+import numpy as np
+def functionify(args_info, jac):
+    def f(*args, force_resolve=True):
+        subset = {}
+        for i, info in enumerate(args_info):
+            if isinstance(info, WrapTuple):
+                for j, s in enumerate(info.t):
+                    v = args[i][j]
+                    subset[s.__str__()] = v
+            else:
+                subset[info.__str__()] = args[i]
+        rtn = np.array(jac.subs(subset))
+        if force_resolve:
+            return rtn.astype(np.float64)
+        return rtn
+    return f
+
+def expand_hint(v, length):
+    return [v[a] for a in range(length)]
+
+def nan_if_complex(x):
+    if hasattr(x , 'is_real') and x.is_real == False:
+        return np.nan
+    return float(x)
+
+def has_free_symbols(x):
+    if isinstance(x, Iterable):
+        return any([has_free_symbols(y) for y in x])
+    if hasattr(x, 'free_symbols'):
+        return len(x.free_symbols) > 0
+    return False
+
+
+class LazyObject:
+    def __init__(self, f):
+        self.v = None
+        self.eval = False
+        self.f = f
+
+    def __call__(self):
+        if not self.eval:
+            self.v = self.f()
+        return self.v
+
+class Jacobians(object):
+    def __init__(self, func, prefix, kwargs):
+        self.func = func
+        self._jacs = None
+        self._args = None
+        self._evaled = False
+        self.prefix = prefix
+        self.kwargs = kwargs        
+        self.f = get_file(inspect.getfile(func))
+        if self.f is not None:
+            self.eval()
+            
+    def eval(self, reeval=False):
+        if self._evaled and not reeval:
+            return
+        self._evaled = True
+        func = self.func
+        f = self.f
+
+        global use_symbolic_eval
+        old_use_symbolic_eval = use_symbolic_eval
+        use_symbolic_eval = True
+        is_pyx = isinstance(f, tuple)
+        jacs, args = generate_code_and_jacobians(func, argument_specs=self.kwargs, jac_all=self.kwargs.get('_all', False),
+                                                 file=f, prefix=self.prefix, codegen = generate_pyxcode if is_pyx else generate_ccode)
+        use_symbolic_eval = old_use_symbolic_eval
+        self._jacs = jacs
+        self._args = args
+
+        if jacs is not None:
+            for k, v in jacs.items():
+                setattr(self, k, functionify(args, v))
+
+    def __getattr__(self, item):
+        self.eval()
+        return super().__getattribute__(item)
+
+def generate_code(prefix="", **kwargs):
+    def f(func):
+        try:            
+            def g(*args, **g_kwargs):
+                grtn = func(*args, **g_kwargs)
+                if type(grtn) == symengine.MutableDenseMatrix:
+                    return grtn
+                if isinstance(grtn, Iterable) and not has_free_symbols(grtn):
+                    return np.array([nan_if_complex(x) for x in grtn], dtype=np.float64)
+                return grtn
+            try:
+                g.jacobians = Jacobians(func, prefix, kwargs)
+            except TypeError as e:
+                # This happens if it is compiled code
+                pass
+            return g        
+        except Exception as e:
+            logging.warning(f"Could not generate source level jacobian for {func.__qualname__}: {e}")
+            return func
+    return f
diff --git a/libs/cnkalman/include/cnkalman/ModelPlot.h b/libs/cnkalman/include/cnkalman/ModelPlot.h
new file mode 100644
index 00000000..0d229e9b
--- /dev/null
+++ b/libs/cnkalman/include/cnkalman/ModelPlot.h
@@ -0,0 +1,31 @@
+#pragma once
+#include <string>
+#include "cnmatrix/cn_matrix.h"
+#include "cnkalman/model.h"
+#if HAS_SCIPLOT
+#include <sciplot/sciplot.hpp>
+#endif
+namespace cnkalman {
+    struct ModelPlot {
+        FLT range[4] = {INFINITY, -INFINITY, INFINITY, -INFINITY};
+        bool show = false;
+        bool lock_range = false;
+        std::string name;
+        int cnt = 0;
+#ifdef HAS_SCIPLOT
+        sciplot::Plot plot;
+        sciplot::Plot map;
+#endif
+        ModelPlot(const std::string &name = "plot", bool show = false);
+
+        void plot_cov(const cnkalman::KalmanModel &model, FLT deviations, const std::string &color = "red");
+
+        void include_point_in_range(const FLT *X);
+
+        void get_view(FLT &x, FLT &y, FLT &w, FLT &h) const;
+
+        void include_point_in_range(FLT x, FLT y);
+
+        ~ModelPlot();
+    };
+}
\ No newline at end of file
diff --git a/libs/cnkalman/include/cnkalman/generated_header.h b/libs/cnkalman/include/cnkalman/generated_header.h
new file mode 100644
index 00000000..d47062e3
--- /dev/null
+++ b/libs/cnkalman/include/cnkalman/generated_header.h
@@ -0,0 +1,15 @@
+#pragma once
+#include <stddef.h>
+#include <math.h>
+static inline double __cnkalman_safe_sqrt(double x) { return x > 0 ? sqrt(x) : 0; }
+#define sqrt __cnkalman_safe_sqrt
+static inline double __cnkalman_safe_asin(double x) { return asin( (x > 1) ? 1 : ((x < -1) ? -1 : x)); }
+#define asin __cnkalman_safe_asin
+#ifndef WIN32
+#ifndef ANDROID
+#include <complex.h>
+static inline double __cnkalman_safe_pow(double x, double y) { return x >= 0 ? pow(x, y) : creal(cpow(x, y)); }
+#define pow __cnkalman_safe_pow
+#endif
+#endif
+#define GEN_FLT FLT
diff --git a/libs/cnkalman/include/cnkalman/kalman.h b/libs/cnkalman/include/cnkalman/kalman.h
new file mode 100644
index 00000000..2e1cc55c
--- /dev/null
+++ b/libs/cnkalman/include/cnkalman/kalman.h
@@ -0,0 +1,268 @@
+#pragma once
+
+#include "cnmatrix/cn_matrix.h"
+#include "numerical_diff.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This file contains a generic kalman implementation.
+ *
+ * This implementation tries to use the same nomenclature as:
+ * https://en.wikipedia.org/wiki/Kalman_filter#Underlying_dynamical_system_model and
+ * https://en.wikipedia.org/wiki/Extended_Kalman_filter.
+ *
+ * This implementation supports both nonlinear prediction models and nonlinear measurement models. Each phase
+ * incorporates a time delta to approximate a continous model.
+ *
+ * Adaptive functionality:
+ *
+ * https://arxiv.org/pdf/1702.00884.pdf
+ *
+ * The R matrix should be initialized to reasonable values on the first all and then is updated based on the residual
+ * error -- higher error generates higher variance values:
+ *
+ * R_k = a * R_k-1 + (1 - a) * (e*e^t + H * P_k-1 * H^t)
+ *
+ * a is set to .3 for this implementation.
+
+ */
+
+struct cnkalman_state_s;
+
+typedef void (*kalman_normalize_fn_t)(void *user, struct CnMat *x);
+
+// Given state x0 and time delta; gives the new state x1. For a linear model, this is just x1 = F * x0 and f_out is a
+// constant / time dependent
+typedef void (*kalman_transition_model_fn_t)(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0,
+                                             struct CnMat *x1, struct CnMat *f_out);
+
+// Given time and current state, generate the process noise Q_k.
+typedef void (*kalman_process_noise_fn_t)(void *user, FLT dt, const struct CnMat *x, struct CnMat *Q_out);
+
+// Given a measurement Z, and state X_t, generates both the y difference term and the H jacobian term.
+typedef bool (*kalman_measurement_model_fn_t)(void *user, const struct CnMat *Z, const struct CnMat *x_t,
+											  struct CnMat *y, struct CnMat *H_k);
+
+/***
+ * Given a state x0 and a model update Ky, generate the new state. Typically this is ~ x1 = x0 + error_state. Also
+ * optionally generate the jacobian of the operation; d(x1)/d(error_state).
+ */
+typedef void (*kalman_integrate_update_fn_t)(void *user, const struct CnMat *x0, const struct CnMat *error_state,
+										 struct CnMat *x1, struct CnMat * dX_wrt_error_state);
+
+//typedef void (*kalman_error_state_model_fn_t)(void *user, const struct CnMat *x_t, struct CnMat *X_jac_E);
+/***
+ * Given states x0 and x1, generate the error state. Typically this is ~ error_state = x1 - x0. Also optionally generate
+ * the jacobian of the operation; d(error_state) / d(x1)
+ * @param x1
+ * @param x0 Only provided if error_state is required; otherwise both are null
+ * @param Fill in or the parameter state
+ * @param X_jac_e Jacobian of the model state with respect to the error state
+ */
+typedef void (*kalman_error_state_model_fn_t)(void *user, const struct CnMat *x0, const struct CnMat *x1,
+											  struct CnMat *error_state, struct CnMat *E_jac_x1);
+
+typedef struct term_criteria_t {
+	int max_iterations;
+
+	// Absolute step size tolerance
+	FLT minimum_step;
+
+	// Minimum difference in errors
+	FLT xtol;
+
+	FLT mtol;
+
+	FLT max_error;
+} term_criteria_t;
+
+enum cnkalman_update_extended_termination_reason {
+	cnkalman_update_extended_termination_reason_none = 0,
+	cnkalman_update_extended_termination_reason_invalid_jacobian,
+	cnkalman_update_extended_termination_reason_too_high_error,
+	cnkalman_update_extended_termination_reason_maxiter,
+	cnkalman_update_extended_termination_reason_xtol,
+	cnkalman_update_extended_termination_reason_step,
+	cnkalman_update_extended_termination_reason_mtol,
+	cnkalman_update_extended_termination_reason_MAX
+};
+CN_EXPORT_FUNCTION const char * cnkalman_update_extended_termination_reason_to_str(enum cnkalman_update_extended_termination_reason reason);
+
+typedef struct cnkalman_update_extended_total_stats_t {
+	FLT bestnorm_acc, orignorm_acc, bestnorm_meas_acc, bestnorm_delta_acc, orignorm_meas_acc;
+	int total_iterations, total_fevals, total_hevals;
+	int total_runs;
+	int total_failures;
+	FLT step_acc;
+	int step_cnt;
+	size_t stop_reason_counts[cnkalman_update_extended_termination_reason_MAX];
+} cnkalman_update_extended_total_stats_t;
+
+struct cnkalman_update_extended_stats_t {
+	FLT bestnorm, bestnorm_meas, bestnorm_delta;
+	FLT orignorm, orignorm_meas;
+	FLT origerror, besterror;
+	int iterations;
+	int fevals, hevals;
+	enum cnkalman_update_extended_termination_reason stop_reason;
+
+	cnkalman_update_extended_total_stats_t *total_stats;
+};
+
+/**
+ * This scheme heavily borrowed from mpfit
+ */
+enum cnkalman_jacobian_mode {
+    cnkalman_jacobian_mode_user_fn = 0,
+    cnkalman_jacobian_mode_two_sided = cnkalman_numerical_differentiate_mode_two_sided,
+    cnkalman_jacobian_mode_one_sided_plus = cnkalman_numerical_differentiate_mode_one_sided_plus,
+    cnkalman_jacobian_mode_one_sided_minus = cnkalman_numerical_differentiate_mode_one_sided_minus,
+    cnkalman_jacobian_mode_debug = -1,
+};
+
+typedef struct cnkalman_state_s {
+	// The number of states stored. For instance, something that tracked position and velocity would have 6 states --
+	// [x, y, z, vx, vy, vz]
+	int state_cnt;
+
+	void *user;
+
+    enum cnkalman_jacobian_mode transition_jacobian_mode;
+
+	bool error_state_transition;
+	kalman_transition_model_fn_t Transition_fn;
+
+	struct CnMat state_variance_per_second;
+
+	kalman_process_noise_fn_t Q_fn;
+	kalman_normalize_fn_t normalize_fn;
+	kalman_integrate_update_fn_t Update_fn;
+	kalman_error_state_model_fn_t ErrorState_fn;
+	int error_state_size;
+
+	// Store the current covariance matrix (state_cnt x state_cnt)
+	struct CnMat P;
+
+	// Actual state matrix and whether its stored on the heap. Make no assumptions about how this matrix is organized.
+	// it is always size of state_cnt*sizeof(FLT) though.
+	bool State_is_heap;
+	struct CnMat state;
+
+	// Current time
+	FLT t;
+
+	int log_level;
+	void *datalog_user;
+	void (*datalog)(const struct cnkalman_state_s *state, const char *name, const FLT *v, size_t length);
+} cnkalman_state_t;
+
+#define CNKALMAN_STATES_PER_MODEL 8
+typedef struct cnkalman_meas_model {
+	cnkalman_state_t* ks[CNKALMAN_STATES_PER_MODEL];
+	size_t ks_cnt;
+	//cnkalman_state_t *k;
+    enum cnkalman_jacobian_mode meas_jacobian_mode;
+	size_t numeric_misses, numeric_calcs;
+	FLT numeric_step_size;
+
+	const char *name;
+	kalman_measurement_model_fn_t Hfn;
+	bool error_state_model;
+	bool adaptive;
+
+	struct term_criteria_t term_criteria;
+	cnkalman_update_extended_total_stats_t stats;
+} cnkalman_meas_model_t;
+
+CN_EXPORT_FUNCTION FLT cnkalman_meas_model_predict_update_stats(FLT t, cnkalman_meas_model_t *mk, void *user,
+																  const struct CnMat *Z, CnMat *R,
+																  struct cnkalman_update_extended_stats_t *stats);
+CN_EXPORT_FUNCTION FLT cnkalman_meas_model_predict_update(FLT t, cnkalman_meas_model_t *mk, void *user,
+															const struct CnMat *Z, CnMat *R);
+
+/**
+ * Predict the state at a given delta; doesn't update the covariance matrix
+ * @param t delta time
+ * @param k kalman state info
+ * @param index Which state vector to pull out
+ * @param out Pre allocated output buffer.
+ */
+CN_EXPORT_FUNCTION void cnkalman_extrapolate_state(FLT t, const cnkalman_state_t *k, CnMat*x1, CnMat* P);
+
+CN_EXPORT_FUNCTION void cnkalman_predict_state(FLT t, cnkalman_state_t *k);
+
+/**
+ * Run predict and update, updating the state matrix. This is for purely linear measurement models.
+ *
+ * @param t absolute time
+ * @param k kalman state info
+ * @param z measurement -- CnMat of n x 1
+ * @param H Input observation model -- CnMat of n x state_cnt
+ * @param R Observation noise -- The diagonal of the measurement covariance matrix; length n
+ * @param adapative Whether or not R is an adaptive matrix. When true, R should be a full n x n matrix.
+ *
+ */
+CN_EXPORT_FUNCTION FLT cnkalman_predict_update_state(FLT t, cnkalman_state_t *k, const struct CnMat *Z,
+													   const struct CnMat *H, CnMat *R, bool adaptive);
+
+/**
+ * Run predict and update, updating the state matrix. This is for non-linear measurement models.
+ *
+ * @param t absolute time
+ * @param k kalman state info
+ * @param z measurement -- CnMat of n x 1
+ * @param R Observation noise -- The diagonal of the measurement covariance matrix; length n
+ * @param extended_params parameters for the non linear update
+ * @param stats store stats if requested
+ *
+ * @returns Returns the average residual error
+ */
+/*
+CN_EXPORT_FUNCTION FLT
+cnkalman_predict_update_state_extended(FLT t, cnkalman_state_t *k, const struct CnMat *Z, CnMat* R,
+											 const cnkalman_update_extended_params_t *extended_params,
+											 struct cnkalman_update_extended_stats_t *stats);
+*/
+/**
+ * Initialize a kalman state object
+ * @param k object to initialize
+ * @param state_cnt Length of state vector
+ * @param F Transition function
+ * @param q_fn Noise function
+ * @param user pointer to give to user functions
+ * @param state Optional state. Pass 0 to malloc one. Otherwise should point to a vector of at least state_cnt FLTs.
+ *
+ * @returns Returns the average residual error
+ */
+CN_EXPORT_FUNCTION void cnkalman_state_init(cnkalman_state_t *k, size_t state_cnt,
+											kalman_transition_model_fn_t F, kalman_process_noise_fn_t q_fn,
+											void *user, FLT *state);
+
+CN_EXPORT_FUNCTION void cnkalman_error_state_init(cnkalman_state_t *k, size_t state_cnt, size_t error_state_cnt,
+												  kalman_transition_model_fn_t F,
+												  kalman_process_noise_fn_t q_fn,
+												  kalman_error_state_model_fn_t Err_F,
+												  void *user, FLT *state);
+
+CN_EXPORT_FUNCTION void cnkalman_meas_model_init(cnkalman_state_t *k, const char *name,
+												   cnkalman_meas_model_t *mk, kalman_measurement_model_fn_t Hfn);
+CN_EXPORT_FUNCTION void cnkalman_meas_model_multi_init(cnkalman_state_t **k, size_t k_cnt, const char *name,
+                                                 cnkalman_meas_model_t *mk, kalman_measurement_model_fn_t Hfn);
+
+CN_EXPORT_FUNCTION void cnkalman_state_reset(cnkalman_state_t *k);
+
+CN_EXPORT_FUNCTION void cnkalman_state_free(cnkalman_state_t *k);
+CN_EXPORT_FUNCTION void cnkalman_set_P(cnkalman_state_t *k, const FLT *d);
+CN_EXPORT_FUNCTION void cnkalman_set_logging_level(cnkalman_state_t *k, int verbosity);
+
+CN_EXPORT_FUNCTION void cnkalman_linear_update(struct CnMat *F, const struct CnMat *x0, struct CnMat *x1);
+
+CN_EXPORT_FUNCTION void cnkalman_linear_transition_fn(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0, struct CnMat *x1, struct CnMat *f_out);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/libs/cnkalman/include/cnkalman/model.h b/libs/cnkalman/include/cnkalman/model.h
new file mode 100644
index 00000000..5b5ad91d
--- /dev/null
+++ b/libs/cnkalman/include/cnkalman/model.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include "cnkalman/kalman.h"
+#ifdef __cplusplus
+#include <vector>
+#include <memory>
+#include <string>
+
+namespace cnkalman {
+    struct ModelPlot;
+
+    struct KalmanModel;
+    struct CN_EXPORT_CLASS KalmanMeasurementModel {
+        size_t meas_cnt;
+        cnkalman_meas_model meas_mdl = {};
+
+        KalmanMeasurementModel(KalmanModel* kalmanModel, const std::string& name, size_t meas_cnt);
+        KalmanMeasurementModel(KalmanModel* kalmanModel, size_t meas_cnt) : KalmanMeasurementModel(kalmanModel, "meas", meas_cnt) {}
+        virtual ~KalmanMeasurementModel() = default;
+
+        /***
+         * @param x current state
+         * @param z measurement prediction
+         * @param h measurement jacobian wrt x
+         * @return Whether the residual / jacobian is valid
+         */
+        virtual bool predict_measurement(const CnMat& x, CnMat* z, CnMat* h) = 0;
+
+        /***
+         * @param Z observed measurement
+         * @param x current state
+         * @param z measurement prediction
+         * @param h measurement jacobian wrt x
+         * @return Whether the residual / jacobian is valid
+         */
+        virtual bool residual(const CnMat& Z, const CnMat& x, CnMat* y, CnMat* h);
+
+        cnkalman_update_extended_stats_t update(FLT t, const struct CnMat& Z, CnMat& R);
+
+        virtual std::ostream& write(std::ostream&) const;
+        /***
+         * Given an assumed state x and measurement variance R, generate a plausible Z
+         */
+        virtual void sample_measurement(const CnMat& x, struct CnMat& Z, const CnMat& R);
+        virtual cnmatrix::Matrix default_R() {
+            auto rtn = cnmatrix::Matrix(meas_cnt, meas_cnt);
+            cn_set_diag_val(rtn, .1 * .1);
+            return rtn;
+        }
+
+        virtual void draw(ModelPlot& p) {}
+    };
+
+    struct CN_EXPORT_CLASS KalmanModel {
+        std::string name;
+
+        cnkalman_state_t kalman_state = {};
+        size_t state_cnt;
+        FLT* state;
+        CnMat* stateM;
+        std::vector<std::shared_ptr<struct KalmanMeasurementModel>> measurementModels;
+
+        virtual std::ostream& write(std::ostream&) const;
+
+        KalmanModel(const std::string& name, size_t state_cnt);
+        KalmanModel(size_t state_cnt) : KalmanModel("mdl", state_cnt) {}
+        virtual void reset();
+        virtual ~KalmanModel();
+
+		virtual void predict(FLT dt, const CnMat& x0, CnMat* x1, CnMat* cF) = 0;
+		void predict(FLT dt, const CnMat& x0, CnMat* x1) { return predict(dt, x0, x1, 0); }
+
+        virtual void process_noise(FLT dt, const struct CnMat &x, struct CnMat &Q_out) = 0;
+
+        virtual void sample_state(FLT dt, const struct CnMat &x0, struct CnMat &x1, const struct CnMat* Q = 0);
+
+        void update(FLT t);
+
+        void bulk_update(FLT t, const std::vector<CnMat>& Zs, const std::vector<cnmatrix::Matrix>& Rs);
+        virtual void draw(ModelPlot& p) {
+            for(auto& m : measurementModels) m->draw(p);
+        }
+    };
+
+    struct CN_EXPORT_CLASS KalmanLinearPredictionModel : public KalmanModel {
+        virtual const CnMat& F() const = 0;
+        KalmanLinearPredictionModel(const std::string &name, size_t stateCnt);
+        KalmanLinearPredictionModel(size_t stateCnt) : KalmanLinearPredictionModel("mdl", stateCnt) {}
+        void predict(FLT dt, const CnMat& x0, CnMat* x1, CnMat* cF) override;
+    };
+
+    struct CN_EXPORT_CLASS KalmanLinearMeasurementModel : public KalmanMeasurementModel {
+        CnMat H;
+
+        KalmanLinearMeasurementModel(KalmanModel* kalmanModel, const std::string& name, const CnMat& H);
+        KalmanLinearMeasurementModel(KalmanModel* kalmanModel, const CnMat& H) : KalmanLinearMeasurementModel(kalmanModel, "meas", H) {}
+        ~KalmanLinearMeasurementModel() override;
+        bool predict_measurement(const CnMat &x, CnMat *z, CnMat *h) override;
+    };
+}
+
+#endif
\ No newline at end of file
diff --git a/libs/cnkalman/include/cnkalman/numerical_diff.h b/libs/cnkalman/include/cnkalman/numerical_diff.h
new file mode 100644
index 00000000..ee5caabd
--- /dev/null
+++ b/libs/cnkalman/include/cnkalman/numerical_diff.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <cnmatrix/cn_matrix.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    enum cnkalman_numerical_differentiate_mode {
+        cnkalman_numerical_differentiate_mode_two_sided = 1,
+        cnkalman_numerical_differentiate_mode_one_sided_plus = 2,
+        cnkalman_numerical_differentiate_mode_one_sided_minus = 3
+    };
+
+typedef bool (*cnkalman_eval_fn_t)(void * user, const struct CnMat *x, struct CnMat *y);
+
+bool cnkalman_numerical_differentiate(void * user, enum cnkalman_numerical_differentiate_mode m, cnkalman_eval_fn_t fn, const CnMat* x, CnMat* H);
+bool cnkalman_numerical_differentiate_step_size(void * user, enum cnkalman_numerical_differentiate_mode m, FLT step_size, cnkalman_eval_fn_t fn, const CnMat* x, CnMat* H);
+#ifdef __cplusplus
+}
+#endif
diff --git a/libs/cnkalman/libs/pybind11 b/libs/cnkalman/libs/pybind11
new file mode 160000
index 00000000..b1bd7f26
--- /dev/null
+++ b/libs/cnkalman/libs/pybind11
@@ -0,0 +1 @@
+Subproject commit b1bd7f2600d650bf59c88800c637703dd89317f3
diff --git a/libs/cnkalman/pyproject.toml b/libs/cnkalman/pyproject.toml
new file mode 100644
index 00000000..131fd0c7
--- /dev/null
+++ b/libs/cnkalman/pyproject.toml
@@ -0,0 +1,20 @@
+[dependencies]
+opencv-contrib-python = "^4.5.1.48"
+scipy="^1.7.0"
+numpy="^1.19.5"
+opencv-python="^4.5.5.62"
+
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel",
+    "setuptools-git-versioning",
+    "cmake",
+]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools-git-versioning]
+enabled = true
+dev_template = "{tag}.{ccount}"
+dirty_template = "{tag}.{ccount}"
+template = "{tag}.{ccount}"
diff --git a/libs/cnkalman/setup.cfg b/libs/cnkalman/setup.cfg
new file mode 100644
index 00000000..cc6a9221
--- /dev/null
+++ b/libs/cnkalman/setup.cfg
@@ -0,0 +1,24 @@
+[metadata]
+name = cnkalman
+version = 0.1.0
+author = Justin Berger
+author_email = jdavidberger@gmail.com
+description = Support tools for cnkalman
+long_description = file: README.md
+long_description_content_type = text/markdown
+classifiers =
+    Programming Language :: Python :: 3
+    Operating System :: OS Independent
+
+[options]
+package_dir =
+    = .
+packages = find:
+python_requires = >=3.6
+install_requires =
+    sympy
+    symengine
+    cmake
+
+[options.packages.find]
+where = .
diff --git a/libs/cnkalman/setup.py b/libs/cnkalman/setup.py
new file mode 100644
index 00000000..6385ea07
--- /dev/null
+++ b/libs/cnkalman/setup.py
@@ -0,0 +1,72 @@
+import glob
+import os
+import pathlib
+import shutil
+import sys
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext as build_ext_orig
+
+cmake_path = os.environ.get('CMAKE_PATH', "cmake")
+class CMakeExtension(Extension):
+
+    def __init__(self, name):
+        # don't invoke the original build_ext for this special extension
+        super().__init__(name, sources=[])
+
+
+class build_ext(build_ext_orig):
+
+    def run(self):
+        for ext in self.extensions:
+            self.build_cmake(ext)
+        super().run()
+
+    def build_cmake(self, ext):
+        cwd = pathlib.Path().absolute()
+
+        # these dirs will be created in build_py, so if you don't have
+        # any python sources to bundle, the dirs will be missing
+        build_temp = pathlib.Path(self.build_temp)
+        build_temp.mkdir(parents=True, exist_ok=True)
+        extdir = pathlib.Path(self.get_ext_fullpath(ext.name))
+        extdir.mkdir(parents=True, exist_ok=True)
+
+        config = 'Release'
+        cmake_args = [
+            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + str(extdir.parent.absolute()),
+            '-DCMAKE_BUILD_TYPE=' + config,
+            '-DCMAKE_INSTALL_PREFIX=' + str(build_temp.absolute()),
+            '-DPYTHON_EXECUTABLE=' + sys.executable
+        ]
+
+        # example of build args
+        build_args = [
+            '--config', config,
+        ]
+
+        build_temp_dir = str(build_temp.absolute())
+        ext_dir_parent = str(extdir.parent.absolute())
+        os.chdir(str(build_temp.absolute()))
+        self.spawn([cmake_path, str(cwd)] + cmake_args)
+        if not self.dry_run:
+            self.spawn([cmake_path, '--build', '.', '--target', 'install'] + build_args)
+            print(build_temp_dir + "/lib/*.so")
+            for file in glob.glob(build_temp_dir + "/lib/*.so"):
+                print(file)
+                shutil.copy(file, ext_dir_parent + "/cnkalman")
+
+        # Troubleshooting: if fail on line above then delete all possible
+        # temporary CMake files including "CMakeCache.txt" in top level dir.
+        os.chdir(str(cwd))
+
+
+setup(
+    name='cnkalman',
+    packages=['cnkalman'],
+    ext_modules=[CMakeExtension('.')],
+    cmdclass={
+        'build_ext': build_ext,
+    },
+    setup_requires=["setuptools-git-versioning"],
+)
\ No newline at end of file
diff --git a/libs/cnkalman/src/CMakeLists.txt b/libs/cnkalman/src/CMakeLists.txt
new file mode 100644
index 00000000..48db30fc
--- /dev/null
+++ b/libs/cnkalman/src/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_library(cnkalman STATIC kalman.c ./model.cc ./numerical_diff.c ./iekf.c ./ModelPlot.cc
+        ../include/cnkalman/kalman.h)
+
+target_include_directories(cnkalman PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include/>
+  $<INSTALL_INTERFACE:include/>
+  )
+
+target_link_libraries(cnkalman cnmatrix)
+
+install(TARGETS cnkalman DESTINATION lib)
+
+pybind11_add_module(filter cnkalman_python_bindings.cpp)
+target_link_libraries(filter PUBLIC cnkalman)
+install(TARGETS filter DESTINATION lib)
diff --git a/libs/cnkalman/src/ModelPlot.cc b/libs/cnkalman/src/ModelPlot.cc
new file mode 100644
index 00000000..b342c4b1
--- /dev/null
+++ b/libs/cnkalman/src/ModelPlot.cc
@@ -0,0 +1,95 @@
+#include "cnkalman/ModelPlot.h"
+#include <algorithm>
+
+namespace cnkalman {
+  ModelPlot::ModelPlot(const std::string &name, bool show) : show(show), name(name) {
+#ifdef HAS_SCIPLOT
+        plot.gnuplot("set title \"" + name + "\"");
+        map.gnuplot("set title \"" + name + "\"");
+        map.palette("paired");
+
+        map.size(1600, 1600);
+        map.gnuplot("set size square");
+        map.border().none();
+
+        plot.size(1600, 1200);
+#endif
+    }
+
+    void ModelPlot::include_point_in_range(const FLT *X) {
+        if(lock_range) return;
+
+        for (int x = 0; x < 2; x++) {
+            range[x * 2] = std::min(range[x * 2], X[x]);
+            range[x * 2 + 1] = std::max(range[x * 2 + 1], X[x]);
+        }
+    }
+
+    void ModelPlot::include_point_in_range(FLT x, FLT y) {
+        FLT Xs[] = {x, y};
+        include_point_in_range(Xs);
+    }
+
+    void ModelPlot::get_view(FLT &x, FLT &y, FLT &w, FLT &h) const {
+        FLT dx = fmax(1, range[1] - range[0]);
+        FLT dy = fmax(1, range[3] - range[2]);
+        FLT R0 = range[0] - .1 * dx;
+        FLT R1 = range[1] + .1 * dx;
+        FLT R2 = range[2] - .1 * dy;
+        FLT R3 = range[3] + .1 * dy;
+        w = R1 - R0, h = R3 - R2;
+        x = R1 - w / 2, y = R3 - h / 2;
+        w = fmax(w, h);
+        h = w;
+    }
+
+    ModelPlot::~ModelPlot() {
+
+        FLT dx = fmax(1e-3, range[1] - range[0]);
+        FLT dy = fmax(1e-3, range[3] - range[2]);
+        range[0] -= .1 * dx;
+        range[1] += .1 * dx;
+        range[2] -= .1 * dy;
+        range[3] += .1 * dy;
+        FLT w = range[1] - range[0], h = range[3] - range[2];
+        FLT x = range[1] - w / 2, y = range[3] - h / 2;
+        w = fmax(w, h);
+        range[0] = x - w / 2;
+        range[1] = x + w / 2.;
+        range[2] = y - w / 2;
+        range[3] = y + w / 2.;
+#ifdef HAS_SCIPLOT
+        map.xrange(range[0], range[1]);
+        map.yrange(range[2], range[3]);
+
+        if (show) {
+            plot.show();
+            map.show();
+        }
+        plot.save(name + "-plot.svg");
+        map.save(name + ".svg");
+        map.save(name + ".png");
+#endif
+    }
+
+    void ModelPlot::plot_cov(const cnkalman::KalmanModel &model, FLT deviations, const std::string &color) {
+#ifdef HAS_SCIPLOT
+        CN_CREATE_STACK_MAT(Pp, 2, 2);
+        CN_CREATE_STACK_MAT(Evec, 2, 2);
+        CN_CREATE_STACK_VEC(Eval, 2);
+        static int idx = 1;
+
+        cnCopy(&model.kalman_state.P, &Pp, 0);
+        cnSVD(&Pp, &Eval, &Evec, 0, (enum cnSVDFlags)0);
+        FLT angle = atan2(_Evec[2], _Evec[0]) *57.2958;
+        FLT v1 = deviations*2*sqrt(_Eval[0]), v2 = deviations*2*sqrt(_Eval[1]);
+        std::stringstream ss;
+        ss << "set obj " << idx++ << " ellipse fc rgb \"" << color << "\" fs transparent solid .5 center "
+           << (model.state[0]) << "," <<
+           (model.state[1]) << " size "
+           << v1 << "," << v2 <<
+           " angle " << angle << " front\n";
+        map.gnuplot(ss.str());
+#endif
+    }
+}
diff --git a/libs/cnkalman/src/cnkalman_internal.h b/libs/cnkalman/src/cnkalman_internal.h
new file mode 100644
index 00000000..17fc5689
--- /dev/null
+++ b/libs/cnkalman/src/cnkalman_internal.h
@@ -0,0 +1,30 @@
+#include <cnkalman/kalman.h>
+
+typedef struct CnMat cnkalman_gain_matrix;
+
+CnMat *cnkalman_find_residual(cnkalman_meas_model_t *mk, void *user, const struct CnMat *Z,
+                                    const struct CnMat *x, CnMat *y, CnMat *H);
+
+FLT cnkalman_run_iterations(cnkalman_meas_model_t *mk, const struct CnMat *Z, const struct CnMat *R,
+                                  void *user, const CnMat *x_k_k1, CnMat *K,
+                                  CnMat *H, CnMat *x_k_k, struct cnkalman_update_extended_stats_t *stats);
+void cnkalman_update_state(void* user, cnkalman_state_t *k, const CnMat* x0, FLT scale, const CnMat* error_state, CnMat* x1);
+void cnkalman_find_k(const struct cnkalman_meas_model *mk, cnkalman_gain_matrix *K, const struct CnMat *H,
+                                  const CnMat *R);
+
+void kalman_print_mat_v(const cnkalman_state_t *k, int ll, const char *name, const CnMat *M, bool newlines);
+int cnkalman_model_state_count(const cnkalman_meas_model_t *mk);
+int cnkalman_model_filter_count(const cnkalman_meas_model_t *mk);
+
+static inline bool sane_covariance(const CnMat *P) {
+#ifndef NDEBUG
+    for (int i = 0; i < P->rows; i++) {
+        if (cnMatrixGet(P, i, i) < 0)
+            return false;
+    }
+#ifdef USE_EIGEN
+    return cnDet(P) > -1e-10;
+#endif
+#endif
+    return true;
+}
diff --git a/libs/cnkalman/src/cnkalman_python_bindings.cpp b/libs/cnkalman/src/cnkalman_python_bindings.cpp
new file mode 100644
index 00000000..55292932
--- /dev/null
+++ b/libs/cnkalman/src/cnkalman_python_bindings.cpp
@@ -0,0 +1,275 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+#include "cnkalman/model.h"
+namespace py = pybind11;
+using namespace pybind11::literals;
+
+static inline CnMat from_py_array(py::array_t<double> b) {
+    if (b.is_none()) {
+        return CnMat {};
+    }
+    /* Request a buffer descriptor from Python */
+    py::buffer_info info = b.request();
+
+    /* Some basic validation checks ... */
+    if (info.format != py::format_descriptor<double>::format())
+        throw std::runtime_error("Incompatible format: expected a double array!");
+
+    return CnMat {
+            static_cast<int>(info.strides[0] / sizeof(double)),
+            static_cast<double *>(info.ptr),
+            static_cast<int>(info.shape[0]),
+            static_cast<int>(info.ndim == 2 ? info.shape[1] : 1)
+    };
+}
+
+static inline py::buffer_info to_py_array_buffer(CnMat& m) {
+    if(m.cols == 1) {
+        return py::buffer_info(
+                m.data,                               /* Pointer to buffer */
+                sizeof(double),                          /* Size of one scalar */
+                py::format_descriptor<double>::format(), /* Python struct-style format descriptor */
+                1,                                      /* Number of dimensions */
+                { (Py_ssize_t)m.rows },                 /* Buffer dimensions */
+                { (Py_ssize_t)(m.step * sizeof(double)) }
+        );
+    }
+    return py::buffer_info(
+            m.data,                               /* Pointer to buffer */
+            sizeof(double),                          /* Size of one scalar */
+            py::format_descriptor<double>::format(), /* Python struct-style format descriptor */
+            2,                                      /* Number of dimensions */
+            { (Py_ssize_t)m.rows, (Py_ssize_t)m.cols },                 /* Buffer dimensions */
+            { (Py_ssize_t)(m.step * sizeof(double)),             /* Strides (in bytes) for each index */
+              (Py_ssize_t)sizeof(double ) }
+    );
+}
+
+static inline py::buffer_info to_py_array_buffer(const CnMat& m) {
+    if(m.cols == 1) {
+        return py::buffer_info(
+                m.data,                               /* Pointer to buffer */
+                sizeof(double),                          /* Size of one scalar */
+                py::format_descriptor<double>::format(), /* Python struct-style format descriptor */
+                1,                                      /* Number of dimensions */
+                { (Py_ssize_t)m.rows },                 /* Buffer dimensions */
+                { (Py_ssize_t)(m.step * sizeof(double)) },
+                true
+        );
+    }
+    return py::buffer_info(
+            m.data,                               /* Pointer to buffer */
+            sizeof(double),                          /* Size of one scalar */
+            py::format_descriptor<double>::format(), /* Python struct-style format descriptor */
+            2,                                      /* Number of dimensions */
+            { (Py_ssize_t)m.rows, (Py_ssize_t)m.cols },                 /* Buffer dimensions */
+            { (Py_ssize_t)(m.step * sizeof(double)),             /* Strides (in bytes) for each index */
+              (Py_ssize_t)sizeof(double ) },
+              true
+    );
+}
+
+static inline py::array_t<double> to_py_array(CnMat& m) {
+    return py::array_t<double>(to_py_array_buffer(m));
+}
+
+static inline py::array_t<double> to_py_array(const CnMat& m) {
+    return py::array_t<double>(to_py_array_buffer(m));
+}
+
+namespace cnkalman {
+    class PyKalmanMeasurementModel : public KalmanMeasurementModel {
+    public:
+        using KalmanMeasurementModel::KalmanMeasurementModel;
+        virtual py::array_t<double> predict_measurement(py::array_t<double> x) {
+            PYBIND11_OVERRIDE_PURE(
+                    py::array_t<double>, /* Return type */
+                    PyKalmanMeasurementModel,      /* Parent class */
+                    predict_measurement,          /* Name of function in C++ (must match Python name) */
+                    x     /* Argument(s) */
+            );
+        }
+
+        virtual py::array_t<double> predict_measurement_jac(py::array_t<double> x) {
+            PYBIND11_OVERRIDE_PURE(
+                    py::array_t<double>, /* Return type */
+                    PyKalmanMeasurementModel,      /* Parent class */
+                    predict_measurement_jac,          /* Name of function in C++ (must match Python name) */
+                    x     /* Argument(s) */
+            );
+        }
+
+        bool predict_measurement(const CnMat &x, CnMat *z, CnMat *h) override {
+            if(z) {
+                auto _z = from_py_array(predict_measurement(to_py_array(x)));
+                cn_matrix_copy(z, &_z);
+            }
+            if(h) {
+                auto _h = from_py_array(predict_measurement_jac(to_py_array(x)));
+                cn_matrix_copy(h, &_h);
+            }
+            return true;
+        }
+    };
+
+    class PyKalmanModel : public KalmanModel {
+    public:
+        using KalmanModel::KalmanModel;
+
+        virtual py::array_t<double> predict(double dt, py::array_t<double> x0) {
+            PYBIND11_OVERRIDE_PURE(
+                    py::array_t<double>, /* Return type */
+                    PyKalmanModel,      /* Parent class */
+                    predict,          /* Name of function in C++ (must match Python name) */
+                    dt, x0      /* Argument(s) */
+            );
+        }
+        virtual py::array_t<double> predict_jac(double dt, py::array_t<double> x0) {
+            PYBIND11_OVERRIDE_PURE(
+                    py::array_t<double>, /* Return type */
+                    PyKalmanModel,      /* Parent class */
+                    predict_jac,          /* Name of function in C++ (must match Python name) */
+                    dt, x0      /* Argument(s) */
+            );
+        }
+
+        void predict(double dt, const CnMat &x0, CnMat *x1, CnMat *cF) override {
+            if(x1) {
+                auto _x1 = from_py_array(predict(dt, to_py_array(x0)));
+                cn_matrix_copy(x1, &_x1);
+            }
+            if(cF) {
+                auto _f1 = from_py_array(predict_jac(dt, to_py_array(x0)));
+                cn_matrix_copy(cF, &_f1);
+            }
+        }
+
+        virtual py::array_t<double> process_noise(double dt, py::array_t<double> x0) {
+            PYBIND11_OVERRIDE_PURE(
+                    py::array_t<double>, /* Return type */
+                    PyKalmanModel,      /* Parent class */
+                    process_noise,          /* Name of function in C++ (must match Python name) */
+                    dt, x0      /* Argument(s) */
+            );
+        }
+
+        void process_noise(double dt, const CnMat &x, CnMat &Q_out) override {
+            auto q = from_py_array(process_noise(dt, to_py_array(x)));
+            cn_matrix_copy(&Q_out, &q);
+        }
+    };
+}
+
+PYBIND11_MODULE(filter, m) {
+    using namespace cnkalman;
+
+    py::class_<CnMat>(m, "CnMat", py::buffer_protocol())
+            .def(py::init(&from_py_array))
+            .def_buffer([](CnMat &m) -> py::buffer_info {
+                return to_py_array_buffer(m);
+            })
+            .def_readonly("rows", &CnMat::rows)
+            .def_readonly("cols", &CnMat::cols)
+            ;
+
+    py::class_<cnkalman_state_s>(m, "filter_state")
+            .def_property("P", [](cnkalman_state_s& self){
+                return to_py_array(self.P);
+            }, [](cnkalman_state_s& self, py::array_t<double> v) {
+                CnMat m = from_py_array(v);
+                cn_matrix_copy(&self.P, &m);
+            })
+            .def_property("X", [](cnkalman_state_s& self){
+                return to_py_array(self.state);
+            }, [](cnkalman_state_s& self, py::array_t<double> v) {
+                CnMat m = from_py_array(v);
+                //cn_print_mat(&m);
+                cn_matrix_copy(&self.state, &m);
+            })
+            .def_readwrite("state_variance_per_second", &cnkalman_state_s::state_variance_per_second)
+            .def_readwrite("time", &cnkalman_state_s::t)
+            ;
+
+    py::class_<cnkalman_update_extended_total_stats_t>(m, "update_extended_total_stats")
+            .def_readwrite("total_runs", &cnkalman_update_extended_total_stats_t::total_runs)
+            .def_readwrite("total_failures", &cnkalman_update_extended_total_stats_t::total_failures)
+            .def_readwrite("total_fevals", &cnkalman_update_extended_total_stats_t::total_fevals)
+;
+
+    py::class_<cnkalman_update_extended_stats_t>(m, "update_extended_stats")
+            .def_readwrite("besterror", &cnkalman_update_extended_stats_t::besterror)
+            .def_readwrite("bestnorm", &cnkalman_update_extended_stats_t::bestnorm)
+            .def_readwrite("origerror", &cnkalman_update_extended_stats_t::origerror)
+            .def_readwrite("stop_reason", &cnkalman_update_extended_stats_t::stop_reason)
+            .def_readwrite("iterations", &cnkalman_update_extended_stats_t::iterations)
+            ;
+
+    py::class_<term_criteria_t>(m, "term_criteria_t")
+            .def(py::init([](int max_iterations, double minimum_step, double xtol, double mtol, double max_error){
+                return term_criteria_t {
+                    max_iterations,
+                    minimum_step,
+                    xtol,
+                    mtol,
+                    max_error
+                };
+            }), "max_iterations"_a = 0, "minimum_step"_a=0, "xtol"_a=0, "mtol"_a=0, "max_error"_a=0)
+            .def_readwrite("max_error", &term_criteria_t::max_error)
+            .def_readwrite("max_iterations", &term_criteria_t::max_iterations)
+            .def_readwrite("minimum_step", &term_criteria_t::minimum_step)
+            .def_readwrite("mtol", &term_criteria_t::mtol)
+            .def_readwrite("xtol", &term_criteria_t::xtol)
+            ;
+
+    py::enum_<enum cnkalman_jacobian_mode>(m, "jacobian_mode")
+            .value("user_fn", cnkalman_jacobian_mode::cnkalman_jacobian_mode_user_fn)
+            .value("debug", cnkalman_jacobian_mode::cnkalman_jacobian_mode_debug)
+            .value("one_sided_minus", cnkalman_jacobian_mode::cnkalman_jacobian_mode_one_sided_minus)
+            .value("one_sided_plus", cnkalman_jacobian_mode::cnkalman_jacobian_mode_one_sided_plus)
+            .value("two_sided", cnkalman_jacobian_mode::cnkalman_jacobian_mode_two_sided)
+            ;
+    py::class_<cnkalman_meas_model>(m, "meas_state")
+            .def_readwrite("adaptive", &cnkalman_meas_model::adaptive)
+            .def_readwrite("meas_jacobian_mode", &cnkalman_meas_model::meas_jacobian_mode)
+            .def_readwrite("term_criteria", &cnkalman_meas_model::term_criteria)
+            ;
+
+    py::class_<PyKalmanModel>(m, "Model")
+            .def(py::init<const std::string&, size_t>(), "name"_a, "state_cnt"_a)
+            .def(py::init<size_t>(), "state_cnt"_a)
+            .def("predict", py::overload_cast<double, py::array_t<double>>(&PyKalmanModel::predict))
+            .def("predict_jac", py::overload_cast<double, py::array_t<double>>(&PyKalmanModel::predict_jac))
+            .def("process_noise", py::overload_cast<double, py::array_t<double>>(&PyKalmanModel::process_noise))
+            .def("update", &KalmanModel::update, "time"_a)
+            .def_readonly("state_cnt", &KalmanModel::state_cnt)
+            .def_readonly("name", &KalmanModel::name)
+            .def_readwrite("state", &KalmanModel::stateM)
+            .def_readwrite("kalman_state", &KalmanModel::kalman_state)
+;
+
+    py::class_<PyKalmanMeasurementModel>(m, "MeasurementModel")
+            .def(py::init<PyKalmanModel*, const std::string&, size_t>())
+            .def(py::init<PyKalmanModel*, size_t>())
+            .def("update", [](PyKalmanMeasurementModel& self, double t, py::array_t<double> Z,
+                              py::array_t<double> R) {
+                auto _Z = from_py_array(Z);
+                auto _R = from_py_array(R);
+
+                return self.update(t, _Z, _R);
+            }, "time"_a, "Z"_a, "R"_a)
+            .def("predict_measurement", py::overload_cast<py::array_t<double>>(&PyKalmanMeasurementModel::predict_measurement))
+            .def("predict_measurement_jac", py::overload_cast<py::array_t<double>>(&PyKalmanMeasurementModel::predict_measurement_jac))
+            .def("residual", &KalmanMeasurementModel::residual)
+            .def("default_R", [](PyKalmanMeasurementModel& self) {
+                return to_py_array(self.default_R().mat);
+            })
+            .def("jacobian_debug_misses", [](PyKalmanMeasurementModel& self) {
+                if(self.meas_mdl.numeric_calcs == 0)
+                    return 0.;
+                return self.meas_mdl.numeric_misses / (double)self.meas_mdl.numeric_calcs;
+            })
+            .def_readwrite("meas_mdl", &KalmanMeasurementModel::meas_mdl)
+;
+
+}
\ No newline at end of file
diff --git a/libs/cnkalman/src/iekf.c b/libs/cnkalman/src/iekf.c
new file mode 100644
index 00000000..f74a0474
--- /dev/null
+++ b/libs/cnkalman/src/iekf.c
@@ -0,0 +1,398 @@
+#include <cnkalman/kalman.h>
+#include <stdio.h>
+#include "cnkalman_internal.h"
+
+static inline FLT mul_at_b_a(const struct CnMat *A, const struct CnMat *B) {
+    CN_CREATE_STACK_MAT(V, 1, 1);
+    assert(A->cols == 1);
+    if (B->cols > 1) {
+		CN_CREATE_STACK_MAT(AtiB, 1, B->rows);
+        cnGEMM(A, B, 1, 0, 0, &AtiB, CN_GEMM_FLAG_A_T);
+        cnGEMM(&AtiB, A, 1, 0, 0, &V, 0);
+    } else {
+		CN_CREATE_STACK_VEC(AtiB, B->rows);
+        cnElementwiseMultiply(&AtiB, A, B);
+        V.data[0] = cnDot(&AtiB, A);
+    }
+
+    return V.data[0];
+}
+
+CN_EXPORT_FUNCTION FLT calculate_v_meas(const struct CnMat *y, const struct CnMat *iR) {
+        return .5 * mul_at_b_a(y, iR);
+}
+CN_EXPORT_FUNCTION FLT calculate_v_delta(const struct CnMat *xDiff, const struct CnMat *iP) {
+    return .5 * mul_at_b_a(xDiff, iP);
+}
+CN_EXPORT_FUNCTION FLT calculate_v(const struct CnMat *y, const struct CnMat *xDiff, const struct CnMat *iR,
+                                   const struct CnMat *iP, FLT *meas_part, FLT *delta_part) {
+    if (delta_part == 0) {
+        return *meas_part = calculate_v_meas(y, iR);
+    }
+    *meas_part = calculate_v_meas(y, iR);
+    *delta_part = calculate_v_delta(xDiff, iP);
+    return  *meas_part + *delta_part;
+}
+
+const char *cnkalman_update_extended_termination_reason_to_str(
+        enum cnkalman_update_extended_termination_reason reason) {
+    switch (reason) {
+        case cnkalman_update_extended_termination_reason_none:
+            return "none";
+        case cnkalman_update_extended_termination_reason_maxiter:
+            return "maxiter";
+        case cnkalman_update_extended_termination_reason_invalid_jacobian:
+            return "invalid_jac";
+        case cnkalman_update_extended_termination_reason_xtol:
+            return "xtol";
+        case cnkalman_update_extended_termination_reason_MAX:
+            return "MAX";
+        case cnkalman_update_extended_termination_reason_step:
+            return "step";
+        case cnkalman_update_extended_termination_reason_mtol:
+            return "mtol";
+		case cnkalman_update_extended_termination_reason_too_high_error:
+			return "maxerr";
+        default:
+            return "";
+    }
+}
+
+static inline enum cnkalman_update_extended_termination_reason
+cnkalman_termination_criteria(const struct term_criteria_t *term_criteria,
+                                    FLT initial_error, FLT error, FLT alpha, FLT last_error) {
+    FLT minimum_step = term_criteria->minimum_step > 0 ? term_criteria->minimum_step : .01;
+    if (alpha == 0 || alpha < minimum_step) {
+        return cnkalman_update_extended_termination_reason_step;
+    }
+    if (error == 0) {
+        return cnkalman_update_extended_termination_reason_xtol;
+    }
+
+    if (isfinite(last_error) && fabs(last_error - error) < term_criteria->xtol * error) {
+        return cnkalman_update_extended_termination_reason_xtol;
+    }
+    return cnkalman_update_extended_termination_reason_none;
+}
+
+static void cnkalman_find_error_state(void* user, cnkalman_state_t *k, const CnMat* x1, const CnMat* x0, CnMat* error_state) {
+	CN_CREATE_STACK_VEC(verify_x, k->state_cnt);
+	if (k->ErrorState_fn) {
+		k->ErrorState_fn(user, x0, x1, error_state, 0);
+		k->Update_fn(user, x0, error_state, &verify_x, 0);
+		//FLT err = cnDistance(&verify_x, x1);
+		//assert(err < 1e-4);
+	} else {
+		cnSub(error_state, x1, x0);
+	}
+}
+
+void cnkalman_update_state(void* user, cnkalman_state_t *k, const CnMat* x0, FLT scale, const CnMat* error_state, CnMat* x1) {
+	if(k->Update_fn) {
+		CN_CREATE_STACK_MAT(error_state_delta_scaled, k->error_state_size, 1);
+		cnScale(&error_state_delta_scaled, error_state, scale);
+		k->Update_fn(user, x0, &error_state_delta_scaled, x1, 0);
+	} else {
+		cnAddScaled(x1, error_state, scale, x0, 1);
+	}
+}
+
+#define FOR_EACH_STATE(mk) \
+for(int i = 0, filter_idx = 0, state_idx = 0;i < mk->ks_cnt;i++) {\
+cnkalman_state_t * k = mk->ks[i];                                 \
+CnMat error_state_delta_view = cnMatView(k->error_state_size, 1, &error_state_delta, filter_idx, 0);\
+CnMat xn_view = cnMatView(k->state_cnt, 1, &xn, state_idx, 0);    \
+CnMat iPdx_view = cnMatView(k->error_state_size, 1, &iPdx, filter_idx, 0);\
+CnMat x_i_view = cnMatConstView(k->state_cnt, 1, &x_i, state_idx, 0);                         \
+CnMat error_state_view = cnMatView(k->error_state_size, 1, &error_state, filter_idx, 0);\
+const CnMat x_k_k1_view = cnMatConstView(k->state_cnt, 1, x_k_k1, state_idx, 0);\
+
+
+#define END_FOR_EACH_STATE()\
+filter_idx += k->error_state_size;\
+state_idx += k->state_cnt;\
+}
+
+// Extended Kalman Filter Modifications Based on an Optimization View Point
+// https://www.diva-portal.org/smash/get/diva2:844060/FULLTEXT01.pdf
+// Note that in this document, 'y' is the measurement; which we refer to as 'Z'
+// throughout this code and so 'y - h(x)' from the paper is 'y' in code.
+// The main driver in this document is V(x) which is defined as:
+// r(x) = [ R^-.5 * y; P^-.5 * (x_t-1 * x) ]
+// V(X) = r'(x) * r(x) / 2
+// V(X) = 1/2 * (R^-.5 * y)' * R^-.5 * y + (P^-.5 * (x_t-1 * x))'*P^-.5 * (x_t-1 * x)
+// Then owing to the identity (AB)' = B'A', and also to the fact that R and P are symmetric and so R' = R; P' = P:
+// V(X) = 1/2 * (y' * (R^-.5)' * R^-.5 * y + (x_t-1 * x)' * (P^-.5)'* P^-.5 * (x_t-1 * x))
+// V(X) = 1/2 * (y' * (R^-.5) * R^-.5 * y + (x_t-1 * x)' * (P^-.5)* P^-.5 * (x_t-1 * x))
+// V(X) = 1/2 * (y' * (R^-1) * y + (x_t-1 * x)' * (P^-1) * (x_t-1 * x))
+
+// Similarly, we need dV(X)/dX -- ΔV(X) --
+// ΔV(X) = -[ R^-.5 * H; P^-.5]' * r(x)
+// ΔV(X) = -[ R^-.5 * H; P^-.5]' * [ R^-.5 * y; P^-.5 * (x_t-1 * x) ]
+// ΔV(X) =  -(R^-.5 * H) * (R^-.5 * y) - P^-.5 * (P^-.5 * (x_t-1 * x))
+// ΔV(X) =  H' * R^-1 * y - P^-1 * (x_t-1 * x)
+// The point of all of this is that we don't need to ever explicitly calculate R^-.5 / P^-.5; just the inverses
+FLT cnkalman_run_iterations(cnkalman_meas_model_t *mk, const struct CnMat *Z, const struct CnMat *R,
+                                         void *user, const CnMat *x_k_k1, CnMat *K,
+                                         CnMat *H, CnMat *x_k_k, struct cnkalman_update_extended_stats_t *stats) {
+    int state_cnt = cnkalman_model_state_count(mk);
+    int filter_cnt = cnkalman_model_filter_count(mk);
+
+    int meas_cnt = Z->rows;
+
+    CN_CREATE_STACK_MAT(y, meas_cnt, 1);
+    CN_CREATE_STACK_MAT(x_i, state_cnt, 1);
+    CN_CREATE_STACK_MAT(x_i_best, state_cnt, 1);
+
+    CN_CREATE_STACK_MAT(iR, meas_cnt, R->cols > 1 ? meas_cnt : 1);
+    if (R->cols > 1) {
+        cnInvert(R, &iR, CN_INVERT_METHOD_LU);
+    } else {
+        for (int i = 0; i < meas_cnt; i++) {
+            cn_as_vector(&iR)[i] = cn_as_const_vector(R)[i] == 0 ? 0 : 1. / cn_as_const_vector(R)[i];
+        }
+    }
+    // kalman_print_mat_v(k, 100, "iR", &iR, true);
+
+    CnMat iPS[CNKALMAN_STATES_PER_MODEL] = { 0 };
+    for(int i = 0;i < mk->ks_cnt;i++) {
+        CN_CREATE_STACK_MAT(iP, mk->ks[i]->error_state_size, mk->ks[i]->error_state_size);
+        cnInvert(&mk->ks[i]->P, &iP, CN_INVERT_METHOD_SVD);
+        iPS[i] = iP;
+
+        assert(sane_covariance(&mk->ks[i]->P));
+        assert(cn_is_finite(&iP));
+    }
+
+    assert(cn_is_finite(&iR));
+
+    enum cnkalman_update_extended_termination_reason stop_reason =
+            cnkalman_update_extended_termination_reason_none;
+    FLT current_norm = INFINITY, last_norm = INFINITY;
+    FLT initial_norm = 0;
+
+    cn_matrix_copy(&x_i, x_k_k1);
+    int iter;
+    int max_iter = mk->term_criteria.max_iterations;
+    FLT meas_part = 0, delta_part = 0;
+    CN_CREATE_STACK_MAT(Hxdiff, Z->rows, 1);
+	CN_CREATE_STACK_MAT(error_state_delta, filter_cnt, 1);
+	CN_CREATE_STACK_MAT(error_state_delta_scaled, filter_cnt, 1);
+    CN_CREATE_STACK_MAT(xn, x_i.rows, x_i.cols);
+    CN_CREATE_STACK_MAT(error_state, filter_cnt, 1);
+    CN_CREATE_STACK_MAT(iRy, meas_cnt, 1);
+    CN_CREATE_STACK_MAT(iPdx, filter_cnt, 1);
+    CN_CREATE_STACK_MAT(dVt, filter_cnt, 1);
+
+	CN_CREATE_STACK_VEC(verify_x, state_cnt);
+
+	// `x_k_k1` is prior state
+	// `xi` is current iteration state; initializes as `x_k_k1`
+	// `error_state` is the difference in xi and x_k_k1
+	// `error_state_delta` is the max gain change for this iteration step; which is scaled down if its
+	//        too expensive
+    for (iter = 0; iter < max_iter && stop_reason == cnkalman_update_extended_termination_reason_none; iter++) {
+        // Find the residual y and possibly also the jacobian H. The user could have passed one in as 'user', or given
+        // us a map function which will calculate it.
+        struct CnMat *HR = cnkalman_find_residual(mk, user, Z, &x_i, &y, H);
+        if (stats) {
+            stats->fevals++;
+            stats->hevals++;
+        }
+
+        // If the measurement jacobian isn't calculable, the best we can do is just bail.
+        if (HR == 0) {
+            stop_reason = cnkalman_update_extended_termination_reason_invalid_jacobian;
+            current_norm = INFINITY;
+            break;
+        }
+        last_norm = current_norm;
+
+		// error_state = x_k_k1 - x_i
+        delta_part = 0;
+		if(iter != 0) {
+		    FOR_EACH_STATE(mk)
+                cnkalman_find_error_state(user, k, &x_k_k1_view, &x_i_view, &error_state_view);
+                delta_part += calculate_v_delta(&error_state_view, &iPS[i]);
+		    END_FOR_EACH_STATE()
+		}
+        meas_part = calculate_v_meas(&y, &iR);
+
+        current_norm = meas_part + delta_part;
+        assert(current_norm >= 0);
+
+		if (iter == 0) {
+			initial_norm = current_norm;
+			if (stats) {
+				stats->orignorm_meas += meas_part;
+				stats->origerror = cnNorm(&y);
+			}
+
+			if(initial_norm > mk->term_criteria.max_error && mk->term_criteria.max_error > 0) {
+				stop_reason = cnkalman_update_extended_termination_reason_too_high_error;
+				break;
+			}
+		}
+		//printf("%s iteration %2d: %7.7f/%7.7f %7.7f %7.7f\n", mk->name, iter, current_norm, initial_norm, meas_part, delta_part);
+
+        if (R->cols > 1) {
+            cnGEMM(&iR, &y, 1, 0, 0, &iRy, 0);
+        } else {
+            cnElementwiseMultiply(&iRy, &iR, &y);
+        }
+
+		// dVt = H * (R^-1 * y)' - iP * error_state
+		if(iter != 0) {
+            FOR_EACH_STATE(mk)
+                cnGEMM(&iPS[i], &error_state_view, 1, 0, 0, &iPdx_view, 0);
+            END_FOR_EACH_STATE()
+		}
+		cnGEMM(H, &iRy, -1, &iPdx, -1, &dVt, CN_GEMM_FLAG_A_T);
+
+        // Run update; filling in K
+        cnkalman_find_k(mk, K, H, R);
+
+        if ((stop_reason =
+                     cnkalman_termination_criteria(&mk->term_criteria, initial_norm, current_norm, 1, last_norm)) >
+            cnkalman_update_extended_termination_reason_none) {
+            goto end_of_loop;
+        }
+
+        // error_state_delta = K * (z - y - H * Δx)
+		if(iter != 0) {
+			cnGEMM(H, &error_state, 1, 0, 0, &Hxdiff, 0);
+			cnSub(&y, &y, &Hxdiff);
+		}
+        cnGEMM(K, &y, 1, &error_state, 1, &error_state_delta, 0);
+
+        FLT scale = 1.;
+        FLT m = cnDot(&dVt, &error_state_delta);
+        if (fabs(m) < mk->term_criteria.mtol) {
+            stop_reason = cnkalman_update_extended_termination_reason_mtol;
+            break;
+        }
+        FLT c = .5, tau = .5;
+        FLT fa = 0, fa_best = current_norm, a_best = 0;
+
+        FLT min_step = mk->term_criteria.minimum_step == 0 ? .05 : mk->term_criteria.minimum_step;
+
+        bool exit_condition = false;
+        while (!exit_condition) {
+            exit_condition = true;
+
+			// xn = x_i + scale * error_state
+            FOR_EACH_STATE(mk)
+                cnkalman_update_state(user, k, &x_i_view, scale, &error_state_delta_view, &xn_view);
+            END_FOR_EACH_STATE()
+
+			if(mk->Hfn) {
+				mk->Hfn(user, Z, &xn, &y, 0);
+			} else {
+				cnGEMM((struct CnMat *)user, &xn, -1, Z, 1, &y, 0);
+			}
+
+            if (stats) {
+                stats->fevals++;
+            }
+
+            fa = 0;
+			// error_state = x_k_k1 - xn
+			// xn = x_i_view * scale * error_state_delta
+            FOR_EACH_STATE(mk)
+                cnkalman_find_error_state(user, k, &x_k_k1_view, &xn_view, &error_state_view);
+                cnkalman_update_state(user, k, &x_i_view, scale, &error_state_delta_view, &xn_view);
+                fa += calculate_v_delta(&error_state_view, &iPS[i]);
+            END_FOR_EACH_STATE()
+            meas_part = calculate_v_meas(&y, &iR);
+			//printf("\tscale  %f: %7.7f/%7.7f %7.7f %7.7f\n", scale, current_norm, initial_norm, meas_part, fa);
+			fa += meas_part;
+
+			if (fa_best > fa) {
+				fa_best = fa;
+				a_best = scale;
+			}
+
+            if (fa >= current_norm + scale * m * c) {
+                exit_condition = false;
+
+                scale = tau * scale;
+
+                if (scale <= min_step) {
+                    current_norm = fa_best;
+                    scale = a_best;
+                    break;
+                }
+            }
+        }
+
+        if (stats && stats->total_stats) {
+            stats->total_stats->step_cnt++;
+            stats->total_stats->step_acc += scale;
+        }
+
+		// x_i = x_i + scale * error_state_delta
+		//printf("\tupdate %f: %7.7f/%7.7f\n", scale, fa_best, initial_norm);
+        FOR_EACH_STATE(mk)
+            cnkalman_update_state(user, k, &x_i_view, scale, &error_state_delta_view, &x_i_view);
+
+            if (k->normalize_fn) {
+                k->normalize_fn(k->user, &x_i_view);
+            }
+            assert(cn_is_finite(&x_i));
+        END_FOR_EACH_STATE()
+
+        end_of_loop:
+        if (stop_reason == cnkalman_update_extended_termination_reason_none)
+            stop_reason =
+                    cnkalman_termination_criteria(&mk->term_criteria, initial_norm, current_norm, scale, last_norm);
+    }
+    if (stop_reason == cnkalman_update_extended_termination_reason_none)
+        stop_reason = cnkalman_update_extended_termination_reason_maxiter;
+    bool isFailure = current_norm > initial_norm || isinf(current_norm);
+    if (stats) {
+        stats->besterror = cnNorm(&y);
+        stats->iterations = iter;
+        stats->orignorm = initial_norm;
+        stats->bestnorm = current_norm;
+        stats->stop_reason = stop_reason;
+        stats->bestnorm_meas = meas_part;
+        stats->bestnorm_delta = delta_part;
+        if (stats->total_stats) {
+            stats->total_stats->total_runs++;
+            stats->total_stats->total_failures += isFailure;
+            stats->total_stats->orignorm_acc += initial_norm;
+            stats->total_stats->bestnorm_acc += current_norm;
+            stats->total_stats->stop_reason_counts[stop_reason]++;
+            stats->total_stats->total_fevals += stats->fevals;
+            stats->total_stats->total_hevals += stats->hevals;
+            stats->total_stats->total_iterations += stats->iterations;
+            stats->total_stats->bestnorm_meas_acc += stats->bestnorm_meas;
+            stats->total_stats->bestnorm_delta_acc += stats->bestnorm_delta;
+            stats->total_stats->orignorm_meas_acc += stats->orignorm_meas;
+        }
+    }
+
+    if (isFailure) {
+        initial_norm = -1;
+    } else {
+        assert(cn_is_finite(H));
+        assert(cn_is_finite(K));
+        cn_matrix_copy(x_k_k, &x_i);
+    }
+
+    CN_FREE_STACK_MAT(Hxdiff);
+    CN_FREE_STACK_MAT(error_state_delta);
+    CN_FREE_STACK_MAT(xn);
+    CN_FREE_STACK_MAT(x_diff);
+    CN_FREE_STACK_MAT(iRy);
+    CN_FREE_STACK_MAT(iPdx);
+    CN_FREE_STACK_MAT(dVt);
+    CN_FREE_STACK_MAT(y);
+    CN_FREE_STACK_MAT(x_i);
+    CN_FREE_STACK_MAT(x_i_best);
+
+    CN_FREE_STACK_MAT(iR);
+    CN_FREE_STACK_MAT(iP);
+
+    return initial_norm;
+}
diff --git a/libs/cnkalman/src/kalman.c b/libs/cnkalman/src/kalman.c
new file mode 100644
index 00000000..6b49e481
--- /dev/null
+++ b/libs/cnkalman/src/kalman.c
@@ -0,0 +1,855 @@
+#include "cnkalman/kalman.h"
+#if !defined(__FreeBSD__) && !defined(__APPLE__)
+#include <malloc.h>
+#endif
+#include <cnkalman/kalman.h>
+#include <cnkalman/numerical_diff.h>
+#include <cnmatrix/cn_matrix.h>
+
+#include <memory.h>
+#include <stdio.h>
+
+#include "cnkalman_internal.h"
+
+#include "math.h"
+
+#define KALMAN_LOG_LEVEL 1000
+
+#define CN_KALMAN_VERBOSE(lvl, fmt, ...)                                                                               \
+	{                                                                                                                  \
+		if (k->log_level >= lvl) {                                                                                     \
+			fprintf(stdout, fmt "\n", __VA_ARGS__);                                                                    \
+		}                                                                                                              \
+	}
+
+void cnkalman_set_logging_level(cnkalman_state_t *k, int v) { k->log_level = v; }
+
+void cnkalman_linear_update(struct CnMat *F, const struct CnMat *x0, struct CnMat *x1) {
+	cnGEMM(F, x0, 1, 0, 0, x1, 0);
+}
+
+CN_EXPORT_FUNCTION void cnkalman_linear_transition_fn(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0, struct CnMat *x1, struct CnMat *f_out) {
+    const CnMat* given_F = k->user;
+    if(f_out) cnCopy(given_F, f_out, 0);
+    if(x1) cnGEMM(given_F, x0, 1, 0, 0, x1, (enum cnGEMMFlags)0);
+}
+void kalman_print_mat_v(const cnkalman_state_t *k, int ll, const char *name, const CnMat *M, bool newlines) {
+	if (k->log_level < ll) {
+		return;
+	}
+	char term = newlines ? '\n' : ' ';
+	if (!M) {
+		fprintf(stdout, "null%c", term);
+		return;
+	}
+	fprintf(stdout, "%8s %2d x %2d:%c", name, M->rows, M->cols, term);
+	FLT scale = cn_sum(M);
+	for (unsigned i = 0; i < M->rows; i++) {
+		for (unsigned j = 0; j < M->cols; j++) {
+			FLT v = cnMatrixGet(M, i, j);
+			if (v == 0)
+				fprintf(stdout, "             0, ");
+			else
+				fprintf(stdout, "%+7.7e,", v);
+		}
+		if (newlines && M->cols > 1)
+			fprintf(stdout, "\n");
+	}
+	fprintf(stdout, "\n");
+}
+static void kalman_print_mat(const cnkalman_state_t *k, const char *name, const CnMat *M, bool newlines) {
+	kalman_print_mat_v(k, KALMAN_LOG_LEVEL, name, M, newlines);
+}
+
+void user_is_q(void *user, FLT t, const struct CnMat *x, CnMat *Q_out) {
+	const CnMat *q = (const CnMat *)user;
+	cnScale(Q_out, q, t);
+}
+
+CN_EXPORT_FUNCTION void cnkalman_state_reset(cnkalman_state_t *k) {
+	k->t = 0;
+	cn_set_zero(&k->P);
+
+	cnkalman_predict_state(10, k);
+	k->t = 0;
+
+	kalman_print_mat(k, "initial Pk_k", &k->P, true);
+}
+
+CN_EXPORT_FUNCTION void cnkalman_meas_model_multi_init(cnkalman_state_t **k, size_t cnt, const char *name,
+                                                       cnkalman_meas_model_t *mk, kalman_measurement_model_fn_t Hfn) {
+    memset(mk, 0, sizeof(*mk));
+    assert(cnt < CNKALMAN_STATES_PER_MODEL);
+    for(int i = 0;i < cnt;i++) {
+        mk->ks[i] = k[i];
+    }
+    mk->ks_cnt = cnt;
+    mk->name = name;
+    mk->Hfn = Hfn;
+    mk->term_criteria = (struct term_criteria_t){.max_iterations = 0, .xtol = 1e-2, .mtol = 1e-8, .minimum_step = .05};
+}
+
+CN_EXPORT_FUNCTION void cnkalman_meas_model_init(cnkalman_state_t *k, const char *name,
+												   cnkalman_meas_model_t *mk, kalman_measurement_model_fn_t Hfn) {
+	memset(mk, 0, sizeof(*mk));
+	mk->ks[0] = k;
+	mk->ks_cnt++;
+	mk->name = name;
+	mk->Hfn = Hfn;
+	mk->term_criteria = (struct term_criteria_t){.max_iterations = 0, .xtol = 1e-2, .mtol = 1e-8, .minimum_step = .05};
+}
+
+void cnkalman_error_state_init(cnkalman_state_t *k, size_t state_cnt, size_t error_state_cnt,
+							   kalman_transition_model_fn_t F, kalman_process_noise_fn_t q_fn,
+							   kalman_error_state_model_fn_t Err_F, void *user, FLT *state) {
+	memset(k, 0, sizeof(*k));
+
+	k->state_cnt = (int)state_cnt;
+	k->error_state_size = error_state_cnt;
+	k->ErrorState_fn = Err_F;
+
+	k->Q_fn = q_fn;
+
+	size_t p_size = error_state_cnt;
+	k->P = cnMatCalloc(p_size, p_size);
+
+	k->Transition_fn = F;
+	k->user = user;
+
+	if (!state) {
+		k->State_is_heap = true;
+		state = (FLT*)calloc(1, sizeof(FLT) * k->state_cnt);
+	}
+
+	k->state = cnMat(k->state_cnt, 1, state);
+}
+
+void cnkalman_state_init(cnkalman_state_t *k, size_t state_cnt, kalman_transition_model_fn_t F,
+							   kalman_process_noise_fn_t q_fn, void *user, FLT *state) {
+	cnkalman_error_state_init(k, state_cnt, state_cnt, F, q_fn, 0, user, state);
+}
+
+void cnkalman_state_free(cnkalman_state_t *k) {
+	free(k->P.data);
+	k->P.data = 0;
+
+	if (k->State_is_heap)
+		free(CN_FLT_PTR(&k->state));
+	k->state.data = 0;
+}
+
+void cnkalman_predict_covariance1(FLT dt, cnkalman_state_t *k, const CnMat *F, const CnMat *x,  CnMat *Pk1_k11) {
+	int dims = k->state_cnt;
+
+	CnMat *Pk1_k1 = &k->P;
+	kalman_print_mat(k, "Pk-1_k-1", Pk1_k1, 1);
+
+	// k->P = F * k->P * F^T + Q
+	if(k->state_variance_per_second.rows > 0) {
+		cn_add_diag(Pk1_k1, &k->state_variance_per_second, dt);
+		assert(sane_covariance(Pk1_k1));
+	}
+
+	struct CnMat* Qp = 0;
+	CN_CREATE_STACK_MAT(Q, k->error_state_size, k->error_state_size);
+	if(k->Q_fn) {
+		k->Q_fn(k->user, dt, x, &Q);
+		Qp = &Q;
+		assert(sane_covariance(&Q));
+	}
+	cn_ABAt_add(Pk1_k1, F, Pk1_k1, Qp);
+	assert(sane_covariance(Pk1_k1));
+
+	// printf("!!!! %f\n", cnDet(Pk1_k1));
+	// assert(cnDet(Pk1_k1) >= 0);
+	if (k->log_level >= KALMAN_LOG_LEVEL) {
+		CN_KALMAN_VERBOSE(110, "T: %f", dt);
+		kalman_print_mat(k, "Q", Qp, 1);
+		kalman_print_mat(k, "F", F, 1);
+		kalman_print_mat(k, "Pk1_k-1", Pk1_k1, 1);
+	}
+	CN_FREE_STACK_MAT(Q);
+}
+
+static inline void cnkalman_predict_covariance(FLT dt, const CnMat *F, const CnMat *x, const cnkalman_state_t *k, CnMat *P) {
+
+
+	CnMat Pk1_k1;
+	if(P->rows == P->cols && P->cols == k->P.cols) {
+		Pk1_k1 = *P;
+	} else {
+		FLT *Pk1_k1_data = (FLT*)alloca((k->P.rows) * (k->P.cols) * sizeof(FLT));
+		Pk1_k1 = cnMat(k->P.rows, k->P.cols, Pk1_k1_data);
+	}
+
+	if(Pk1_k1.data != k->P.data) {
+		cnCopy(&k->P, &Pk1_k1, 0);
+	}
+
+	kalman_print_mat(k, "Pk-1_k-1", &Pk1_k1, 1);
+
+	if(dt != 0) {
+		// k->P = F * k->P * F^T + Q
+		if (k->state_variance_per_second.rows > 0) {
+			cn_add_diag(&Pk1_k1, &k->state_variance_per_second, fabs(dt));
+			assert(sane_covariance(&Pk1_k1));
+		}
+
+		struct CnMat *Qp = 0;
+		CN_CREATE_STACK_MAT(Q, k->error_state_size, k->error_state_size);
+		if (k->Q_fn) {
+			k->Q_fn(k->user, fabs(dt), x, &Q);
+			Qp = &Q;
+			assert(sane_covariance(&Q));
+		}
+		cn_ABAt_add(&Pk1_k1, F, &Pk1_k1, Qp);
+		assert(sane_covariance(&Pk1_k1));
+
+		// printf("!!!! %f\n", cnDet(Pk1_k1));
+		// assert(cnDet(Pk1_k1) >= 0);
+		if (k->log_level >= KALMAN_LOG_LEVEL) {
+			CN_KALMAN_VERBOSE(110, "T: %f", dt);
+			kalman_print_mat(k, "Q", Qp, 1);
+			kalman_print_mat(k, "F", F, 1);
+			kalman_print_mat(k, "Pk1_k-1", &Pk1_k1, 1);
+		}
+		CN_FREE_STACK_MAT(Q);
+	}
+
+	if(Pk1_k1.data != P->data) {
+		if(P->cols == 1) {
+			cn_get_diag(&Pk1_k1, cn_as_vector(P), P->rows);
+		} else {
+			CnMat Pk1_k1View = cnMatView(P->rows, P->cols, &Pk1_k1, 0, 0);
+			cnCopy(&Pk1_k1View, P, 0);
+		}
+	}
+}
+void cnkalman_find_s(const cnkalman_state_t *k, cnkalman_gain_matrix *S, const CnMat *Pk_k1Ht, const struct CnMat *H) {
+    kalman_print_mat(k, "H", H, 1);
+
+    cnGEMM(H, Pk_k1Ht, 1, S, 1, S, 0);
+    assert(cn_is_finite(S));
+
+    kalman_print_mat(k, "S", S, 1);
+}
+void cnkalman_find_k_from_s(cnkalman_gain_matrix *K, const CnMat *Pk_k1Ht,
+                            const CnMat *S) {
+
+    CN_CREATE_STACK_MAT(iS, S->rows, S->rows);
+    FLT diag = 0, non_diag = 0;
+#define CHECK_DIAG
+#ifdef CHECK_DIAG
+    FLT* _S = S->data;
+    for (int i = 0; i < S->rows; i++) {
+        for (int j = 0; j < S->rows; j++) {
+            if (i == j) {
+                diag += fabs(_S[i + j * S->rows]);
+                _iS[i + j * S->rows] = 1. / _S[i + j * S->rows];
+            } else {
+                non_diag += fabs(_S[i + j * S->rows]);
+                _iS[i + j * S->rows] = 0;
+            }
+        }
+    }
+#endif
+    if (diag == 0 || non_diag / diag > 1e-5) {
+        cnInvert(S, &iS, CN_INVERT_METHOD_SVD);
+    }
+    assert(cn_is_finite(&iS));
+
+    // K = Pk_k1Ht * iS
+    cnGEMM(Pk_k1Ht, &iS, 1, 0, 0, K, 0);
+}
+
+int cnkalman_model_state_count(const cnkalman_meas_model_t *mk) {
+    int rtn = 0;
+    for(int i = 0;i < mk->ks_cnt;i++) {
+        rtn += mk->ks[i]->state_cnt;
+    }
+    return rtn;
+}
+
+int cnkalman_model_filter_count(const cnkalman_meas_model_t *mk) {
+    int rtn = 0;
+    for(int i = 0;i < mk->ks_cnt;i++) {
+        rtn += mk->ks[i]->error_state_size;
+    }
+    return rtn;
+}
+
+void cnkalman_find_k(const cnkalman_meas_model_t *mk, cnkalman_gain_matrix *K, const struct CnMat *H,
+								  const CnMat *R) {
+    int state_cnt = cnkalman_model_state_count(mk);
+    int filter_cnt = cnkalman_model_filter_count(mk);
+
+	CN_CREATE_STACK_MAT(Pk_k1Ht, filter_cnt, H->rows);
+    CN_CREATE_STACK_MAT(S, H->rows, H->rows);
+    CN_CREATE_STACK_MAT(tmpS, H->rows, H->rows);
+
+    // S = H * P_k|k-1 * H^T + R
+    if (R->cols == 1) {
+        cn_set_diag(&S, cn_as_const_vector(R));
+    } else {
+        cnCopy(R, &S, 0);
+    }
+
+    for(int i = 0, state_idx = 0, filter_idx = 0;i < mk->ks_cnt;i++) {
+	    cnkalman_state_t* k = mk->ks[i];
+        const CnMat *Pk_k = &k->P;
+        CnMat Pk_k1HtView = cnMatView(Pk_k->rows, H->rows, &Pk_k1Ht, filter_idx, 0);
+        const CnMat HView = cnMatConstView(H->rows, k->error_state_size, H, 0, filter_idx);
+        // Pk_k1Ht = P_k|k-1 * H^T
+		if(Pk_k->rows != 0) {
+			cnGEMM(Pk_k, &HView, 1, 0, 0, &Pk_k1HtView, CN_GEMM_FLAG_B_T);
+			cnkalman_find_s(k, &S, &Pk_k1HtView, &HView);
+		}
+
+        state_idx += k->state_cnt;
+        filter_idx += k->error_state_size;
+    }
+
+    cnkalman_find_k_from_s(K, &Pk_k1Ht, &S);
+}
+
+static void cnkalman_update_covariance(cnkalman_state_t *k, const cnkalman_gain_matrix *K,
+											 const struct CnMat *H, const struct CnMat *R) {
+	int filter_cnt = k->P.rows;
+	CN_CREATE_STACK_MAT(eye, filter_cnt, filter_cnt);
+	cn_set_diag_val(&eye, 1);
+
+	CN_CREATE_STACK_MAT(ikh, filter_cnt, filter_cnt);
+
+	// ikh = (I - K * H)
+	cnGEMM(K, H, -1, &eye, 1, &ikh, 0);
+
+	// cvGEMM does not like the same addresses for src and destination...
+	CnMat *Pk_k = &k->P;
+	CN_CREATE_STACK_MAT(tmp, filter_cnt, filter_cnt);
+	cnCopy(Pk_k, &tmp, 0);
+
+	CN_CREATE_STACK_MAT(kRkt, filter_cnt, filter_cnt);
+	bool use_joseph_form = R->rows == R->cols;
+	if (use_joseph_form) {
+	  cn_ABAt_add(&kRkt, K, R, 0);
+		for (int i = 0; i < kRkt.rows; i++) {
+			FLT v = cnMatrixGet(&kRkt, i, i);
+			cnMatrixSet(&kRkt, i, i, v > 0 ? v : 0);
+		}
+		assert(sane_covariance(&kRkt));
+		cn_ABAt_add(Pk_k, &ikh, &tmp, &kRkt);
+	} else {
+		// P_k|k = (I - K * H) * P_k|k-1
+		cnGEMM(&ikh, &tmp, 1, 0, 0, Pk_k, 0);
+	}
+	for (int i = 0; i < k->P.rows; i++) {
+		cnMatrixSet(&k->P, i, i, fabs(cnMatrixGet(&k->P, i, i)));
+		for (int j = i + 1; j < k->P.rows; j++) {
+			FLT v1 = cnMatrixGet(&k->P, i, j);
+			FLT v2 = cnMatrixGet(&k->P, j, i);
+			FLT v = (v1 + v2) / 2.;
+			if (fabs(v) < 1e-10)
+				v = 0;
+			cnMatrixSet(&k->P, i, j, v);
+			cnMatrixSet(&k->P, j, i, v);
+		}
+	}
+	// printf("!!!? %7.7f\n", cnDet(Pk_k));
+	assert(sane_covariance(Pk_k));
+	assert(cn_is_finite(Pk_k));
+	if (k->log_level >= KALMAN_LOG_LEVEL) {
+		fprintf(stdout, "INFO gain\t");
+		kalman_print_mat(k, "K", K, true);
+
+		kalman_print_mat(k, "ikh", &ikh, true);
+
+		fprintf(stdout, "INFO new Pk_k\t");
+		kalman_print_mat(k, "Pk_k", Pk_k, true);
+	}
+	CN_FREE_STACK_MAT(tmp);
+	CN_FREE_STACK_MAT(ikh);
+	CN_FREE_STACK_MAT(eye);
+}
+
+static inline void cnkalman_predict(FLT t, const cnkalman_state_t *k, const CnMat *x_t0_t0, CnMat *x_t0_t1, CnMat* F) {
+	// X_k|k-1 = Predict(X_K-1|k-1)
+	if (k->log_level > KALMAN_LOG_LEVEL) {
+		fprintf(stdout, "INFO kalman_predict from ");
+		kalman_print_mat(k, "x_t0_t0", x_t0_t0, false);
+	}
+	assert(cn_as_const_vector(x_t0_t0) != cn_as_const_vector(x_t0_t1));
+	if (t == k->t || k->Transition_fn == 0) {
+		cnCopy(x_t0_t0, x_t0_t1, 0);
+		if(F) {
+			cn_eye(F, 0);
+		}
+	} else {
+		//assert(t > k->t);
+		k->Transition_fn(t - k->t, k, x_t0_t0, x_t0_t1, F);
+	}
+	if (k->log_level > KALMAN_LOG_LEVEL) {
+		fprintf(stdout, "INFO kalman_predict to   ");
+		kalman_print_mat(k, "x_t0_t1", x_t0_t1, false);
+	}
+	if (k->datalog) {
+		CN_CREATE_STACK_MAT(tmp, x_t0_t0->rows, x_t0_t1->cols);
+		cn_elementwise_subtract(&tmp, x_t0_t1, x_t0_t0);
+		k->datalog(k, "predict_diff", cn_as_const_vector(&tmp), tmp.rows * tmp.cols);
+	}
+}
+
+typedef struct numeric_jacobian_predict_fn_ctx {
+    FLT dt;
+    const cnkalman_state_t *k;
+} numeric_jacobian_predict_fn_ctx;
+static bool numeric_jacobian_predict_fn(void * user, const struct CnMat *x, struct CnMat *y) {
+    numeric_jacobian_predict_fn_ctx* ctx = user;
+    ctx->k->Transition_fn(ctx->dt, ctx->k, x, y, 0);
+    return true;
+}
+
+static bool numeric_jacobian_predict(const cnkalman_state_t *k, enum cnkalman_jacobian_mode mode, FLT dt, const struct CnMat *x, CnMat *H) {
+    numeric_jacobian_predict_fn_ctx ctx = {
+            .dt = dt,
+            .k = k
+    };
+    return cnkalman_numerical_differentiate(&ctx, mode == cnkalman_jacobian_mode_debug ?
+    (enum cnkalman_numerical_differentiate_mode) cnkalman_jacobian_mode_two_sided : mode, numeric_jacobian_predict_fn, x, H);
+}
+
+typedef struct numeric_jacobian_meas_fn_ctx {
+	cnkalman_meas_model_t *mk;
+    void *user;
+    const struct CnMat *Z, *x;
+} numeric_jacobian_meas_fn_ctx;
+
+static bool numeric_jacobian_meas_fn(void * user, const struct CnMat *x, struct CnMat *y) {
+    numeric_jacobian_meas_fn_ctx* ctx = user;
+
+	CN_CREATE_STACK_VEC(x1, cnkalman_model_state_count(ctx->mk));
+	for(int i = 0, state_idx = 0, filter_idx = 0;i < ctx->mk->ks_cnt;i++) {
+		cnkalman_state_t *k = ctx->mk->ks[i];
+
+		CnMat x1_view = cnMatView(k->state_cnt, 1, &x1, state_idx, 0);
+
+		if (k->ErrorState_fn && ctx->mk->error_state_model) {
+			const CnMat ctx_x_view = cnMatConstView(k->state_cnt, 1, ctx->x, state_idx, 0);
+			const CnMat x_view = cnMatConstView(k->error_state_size, 1, x, filter_idx, 0);
+			k->Update_fn(user, &ctx_x_view, &x_view, &x1_view, 0);
+		} else {
+			const CnMat x_view = cnMatConstView(k->state_cnt, 1, x, state_idx, 0);
+			cnCopy(&x_view, &x1_view, 0);
+		}
+
+		state_idx += k->state_cnt;
+		filter_idx += k->error_state_size;
+	}
+
+	if (ctx->mk->Hfn(ctx->user, ctx->Z, &x1, y, 0) == 0)
+		return false;
+
+    // Hfn gives jacobian of measurement estimation E, y returns the residual (Z - E). So we invert it and its the form
+    // we want going forward
+    cnScale(y, y, -1);
+    return true;
+}
+
+static inline bool compare_jacobs(const char* label, const CnMat *H, const CnMat *H_calc, const CnMat *y, const CnMat *Z) {
+	bool needsPrint = true;
+
+    for (int j = 0; j < H->cols; j++) {
+		if(!needsPrint) {
+			fprintf(stderr, "FJAC COLUMN %d\n", j);
+		}
+        for (int i = 0; i < H->rows; i++) {
+
+            FLT deriv_u = cnMatrixGet(H, i, j);
+            FLT deriv_n = cnMatrixGet(H_calc, i, j);
+            FLT diff_abs = fabs(deriv_n - deriv_u);
+            FLT diff_rel = diff_abs / (deriv_n + deriv_u + 1e-10);
+
+            if ((diff_abs > 1e-2 && diff_rel > 1e-2)) {
+				if(needsPrint) {
+					fprintf(stderr, "FJAC DEBUG BEGIN %s %2dx%2d\n", label, H->rows, H->cols);
+					fprintf(stderr, "FJAC COLUMN %d\n", j);
+
+					needsPrint = false;
+				}
+
+                fprintf(stderr, "%2d %+7.7f %+7.7f %+7.7f %+7.7f %+7.7f %+7.7f \n", i, cn_as_const_vector(Z)[i],
+						cn_as_const_vector(y)[i], deriv_u,
+                        deriv_n, diff_abs, diff_rel);
+            }
+        }
+    }
+	if(!needsPrint) {
+		fprintf(stderr, "FJAC DEBUG END\n");
+	}
+	return needsPrint;
+}
+
+CnMat *cnkalman_find_residual(cnkalman_meas_model_t *mk, void *user, const struct CnMat *Z,
+										   const struct CnMat *x, CnMat *y, CnMat *H) {
+	kalman_measurement_model_fn_t Hfn = mk->Hfn;
+	int state_cnt = cnkalman_model_state_count(mk);
+	int filter_cnt = cnkalman_model_filter_count(mk);
+
+	if (H) {
+		cn_set_constant(H, INFINITY);
+	}
+
+	CnMat *rtn = 0;
+	CN_CREATE_STACK_MAT(HFullState, Z->rows, state_cnt);
+
+    bool hasErrorState = state_cnt != filter_cnt;
+
+    if (Hfn) {
+		CnMat * Hfn_arg = (hasErrorState && !mk->error_state_model) ? &HFullState : H;
+		bool needsJacobian = H && (mk->meas_jacobian_mode == cnkalman_jacobian_mode_user_fn || mk->meas_jacobian_mode == cnkalman_jacobian_mode_debug);
+        bool okay = Hfn(user, Z, x, y, needsJacobian ? Hfn_arg : 0);
+		if (okay == false) {
+			return 0;
+		}
+		if(needsJacobian) {
+			assert(cn_is_finite(Hfn_arg));
+		}
+
+		if (mk->meas_jacobian_mode != cnkalman_jacobian_mode_user_fn && H) {
+			CN_CREATE_STACK_MAT(H_calc, Hfn_arg->rows, Hfn_arg->cols);
+
+			CN_CREATE_STACK_VEC(xe, mk->error_state_model ? filter_cnt : state_cnt);
+
+			if(!hasErrorState || !mk->error_state_model) {
+				cnCopy(x, &xe, 0);
+			}
+			numeric_jacobian_meas_fn_ctx ctx = {
+				.mk = mk,
+				.user = user,
+				.Z = Z,
+				.x = x
+			};
+			cnkalman_numerical_differentiate_step_size(&ctx,
+											 mk->meas_jacobian_mode == cnkalman_jacobian_mode_debug
+												 ? cnkalman_numerical_differentiate_mode_two_sided : mk->meas_jacobian_mode,
+													   mk->numeric_step_size, numeric_jacobian_meas_fn, &xe, &H_calc);
+
+			if(mk->meas_jacobian_mode == cnkalman_jacobian_mode_debug) {
+				mk->numeric_calcs++;
+                if(!compare_jacobs(mk->name, Hfn_arg, &H_calc, y, Z)) {
+					mk->numeric_misses++;
+					fprintf(stderr, "User H: \n");
+					cn_print_mat(Hfn_arg);
+					fprintf(stderr, "Calculated H: \n");
+					cn_print_mat(&H_calc);
+
+					fprintf(stderr, "For state: ");
+					cn_print_mat(x);
+					fprintf(stderr, "For Z:     ");
+					fprintf(stderr, "Numeric %f%% miss rate", 100. * (FLT)mk->numeric_misses / (FLT)mk->numeric_calcs++);
+					cn_print_mat(Z);
+				}
+            }
+
+            cn_matrix_copy(Hfn_arg, &H_calc);
+		}
+		rtn = H;
+	} else {
+		rtn = (struct CnMat *)user;
+		cnGEMM(rtn, x, -1, Z, 1, y, 0);
+	}
+
+    if(!mk->error_state_model && hasErrorState && H) {
+		for(int i = 0, state_idx = 0, filter_idx = 0;i < mk->ks_cnt;i++) {
+			cnkalman_state_t* k = mk->ks[i];
+
+			CN_CREATE_STACK_MAT(Hxsx, k->state_cnt, k->error_state_size);
+			CnMat *HFn = Hfn ? &HFullState : rtn;
+			CnMat HFn_view = cnMatView(HFn->rows, k->state_cnt, HFn, 0, state_idx);
+			CnMat H_view = cnMatView(H->rows, k->error_state_size, H, 0, filter_idx);
+			if (k->Update_fn && rtn && H && !mk->error_state_model) {
+				const CnMat x_view = cnMatConstView(k->state_cnt, 1, x, state_idx, 0);
+				k->Update_fn(user, &x_view, 0, 0, &Hxsx);
+				cnGEMM(&HFn_view, &Hxsx, 1, 0, 0, &H_view, 0);
+			} else {
+				cnCopy(&HFn_view, &H_view, 0);
+			}
+			state_idx += k->state_cnt;
+			filter_idx += k->error_state_size;
+		}
+		rtn = H;
+    }
+
+	assert(!rtn || cn_is_finite(rtn));
+	assert(cn_is_finite(y));
+
+	return rtn;
+}
+
+CN_EXPORT_FUNCTION void cnkalman_extrapolate_state(FLT t, const cnkalman_state_t *k, CnMat*x1, CnMat* P) {
+	FLT dt = t - k->t;
+	if(t == 0 || fabs(dt) < 1e-4){
+		dt = 0;
+	}
+
+	int state_cnt = k->state_cnt;
+
+	bool FInErrorState = k->error_state_transition || k->Transition_fn == 0;
+	size_t f_size = (FInErrorState) ? k->error_state_size : state_cnt;
+	CN_CREATE_STACK_MAT(F, f_size, f_size);
+	CN_CREATE_STACK_MAT(FHxsx, k->error_state_size, k->error_state_size);
+	cn_set_constant(&F, NAN);
+	CnMat* FP = &F;
+
+	CnMat x_k_k;
+	if(x1->rows == k->state.rows) {
+		x_k_k = *x1;
+	} else {
+		FLT *x_k_k_data = (FLT*)alloca((state_cnt) * sizeof(FLT));
+		x_k_k = cnVec(k->state_cnt, x_k_k_data);
+	}
+
+	if (dt != 0) {
+		CN_CREATE_STACK_MAT(x_k1_k1, state_cnt, 1);
+		cn_matrix_copy(&x_k1_k1, &k->state);
+
+		cnkalman_predict(t, k, &x_k1_k1, &x_k_k, &F);
+
+		assert(cn_is_finite(&F));
+
+		if(k->Update_fn && FInErrorState == false) {
+			CN_CREATE_STACK_MAT(OldXOlde, state_cnt, k->error_state_size);
+			k->Update_fn(k->user, &x_k1_k1, 0, 0, &OldXOlde);
+			CN_CREATE_STACK_MAT(NewENewX, k->error_state_size, state_cnt);
+			k->ErrorState_fn(k->user, &x_k_k, 0, 0, &NewENewX);
+
+			CN_CREATE_STACK_MAT(FOldXOldE, F.rows, OldXOlde.cols);
+			cnGEMM(&F, &OldXOlde, 1, 0, 0, &FOldXOldE, 0);
+			cnGEMM(&NewENewX, &FOldXOldE, 1, 0, 0, &FHxsx, 0);
+			FP = &FHxsx;
+		}
+
+		if(k->transition_jacobian_mode != cnkalman_jacobian_mode_user_fn) {
+			CN_CREATE_STACK_MAT(F_calc, F.rows, F.cols);
+
+			numeric_jacobian_predict(k, k->transition_jacobian_mode, dt, &x_k1_k1, &F_calc);
+
+			if(k->transition_jacobian_mode == cnkalman_jacobian_mode_debug) {
+				compare_jacobs("predict", &F, &F_calc, &x_k1_k1, &x_k_k);
+			}
+
+			cn_matrix_copy(&F, &F_calc);
+		}
+	} else {
+		cn_matrix_copy(&x_k_k, &k->state);
+	}
+
+	if(P) {
+		cnkalman_predict_covariance(dt, FP, &x_k_k, k, P);
+	}
+
+	if(x_k_k.data != x1->data) {
+		memcpy(cn_as_vector(x1), cn_as_const_vector(&x_k_k), sizeof(FLT) * x1->rows);
+	}
+
+	CN_FREE_STACK_MAT(F);
+}
+
+void cnkalman_predict_state(FLT t, cnkalman_state_t *k) {
+	cnkalman_extrapolate_state(t, k, &k->state, &k->P);
+    k->t = t;
+}
+
+// https://arxiv.org/pdf/1702.00884.pdf
+static void
+calculate_adaptive_covariance(cnkalman_meas_model_t *mk, void *user, const struct CnMat *Z, CnMat *R,
+                              CnMat *Pm, const struct CnMat *H) {
+	assert(mk->ks_cnt == 1);
+    const cnkalman_state_t *k = mk->ks[0];
+    const CnMat *x_k_k = &k->state;
+    int state_cnt = k->state_cnt;
+
+    CN_CREATE_STACK_MAT(y, Z->rows, Z->cols);
+    CN_CREATE_STACK_MAT(scaled_eTeHPkHt, Z->rows, Z->rows);
+    CN_CREATE_STACK_MAT(yyt, Z->rows, Z->rows);
+
+    cnkalman_find_residual(mk, user, Z, x_k_k, &y, 0);
+    cnMulTransposed(&y, &yyt, false, 0, 1);
+
+    CN_CREATE_STACK_MAT(Pk_k1Ht, state_cnt, H->rows);
+
+    FLT a = .3;
+    FLT b = 1 - a;
+    cnGEMM(Pm, H, 1, 0, 0, &Pk_k1Ht, CN_GEMM_FLAG_B_T);
+    cnGEMM(H, &Pk_k1Ht, b, &yyt, b, &scaled_eTeHPkHt, 0);
+
+    kalman_print_mat_v(k, 200, "PkHt", &Pk_k1Ht, true);
+    kalman_print_mat_v(k, 200, "yyt", &yyt, true);
+
+    cnAddScaled(R, R, a, &scaled_eTeHPkHt, 1);
+
+    kalman_print_mat_v(k, 200, "Adaptive R", R, true);
+
+    CN_FREE_STACK_MAT(Pk_k1Ht);CN_FREE_STACK_MAT(yyt);CN_FREE_STACK_MAT(scaled_eTeHPkHt);
+}
+
+
+static FLT cnkalman_predict_update_state_extended_adaptive_internal(
+	FLT t, void *user, const struct CnMat *Z, CnMat *R, cnkalman_meas_model_t *mk,
+	struct cnkalman_update_extended_stats_t *stats) {
+
+	assert(R->rows == Z->rows && (R->cols == 1 || R->cols == R->rows));
+	assert(Z->cols == 1);
+
+	if (R->cols == R->rows) {
+		assert(sane_covariance(R));
+	}
+	kalman_measurement_model_fn_t Hfn = mk->Hfn;
+	bool adaptive = mk->adaptive;
+
+    int state_cnt = cnkalman_model_state_count(mk);
+	int filter_cnt = cnkalman_model_filter_count(mk);
+
+    FLT result = 0;
+
+    // Setup the R matrix.
+	if (adaptive && R->rows != R->cols) {
+		assert(false);
+		adaptive = false;
+	}
+
+    CnMat x_k_k = { 0 };
+    CN_CREATE_STACK_VEC(x_k_k1, state_cnt);
+    if(mk->ks_cnt == 1) {
+        x_k_k = mk->ks[0]->state;
+    } else {
+        x_k_k = cnMat(state_cnt, 1, alloca(state_cnt * sizeof(FLT)));
+    }
+
+    // Run prediction steps -- gets new state, and covariance matrix based on time delta
+    int state_idx = 0;
+	for(int ki = 0;ki < mk->ks_cnt;ki++) {
+        cnkalman_state_t *k = mk->ks[ki];
+        cnkalman_predict_state(t, k);
+        memcpy(cn_as_vector(&x_k_k1) + state_idx, cn_as_vector(&k->state), sizeof(FLT) * k->state_cnt);
+        state_idx += k->state_cnt;
+	}
+
+    // Adaptive update happens on the covariance matrix prior; so save it.
+    CN_CREATE_STACK_MAT(Pm, state_cnt, state_cnt);
+    if (adaptive) {
+        assert(mk->ks_cnt == 1);
+        cn_matrix_copy(&Pm, &mk->ks[0]->P);
+    }
+
+	if (mk->ks[0]->log_level > KALMAN_LOG_LEVEL) {
+		fprintf(stdout, "INFO kalman_predict_update_state_extended t=%f", t);
+		kalman_print_mat(mk->ks[0], "Z", Z, false);
+		fprintf(stdout, "\n");
+	}
+
+	CN_CREATE_STACK_MAT(K, filter_cnt, Z->rows);
+	CN_CREATE_STACK_MAT(HStorage, Z->rows, filter_cnt);
+	struct CnMat *H = &HStorage;
+
+	if (mk->term_criteria.max_iterations > 0) {
+		result = cnkalman_run_iterations(mk, Z, R, user, &x_k_k1, &K, H, &x_k_k, stats);
+		if (result < 0)
+			return result;
+	} else {
+        CN_CREATE_STACK_MAT(y, Z->rows, Z->cols);
+		H = cnkalman_find_residual(mk, user, Z, &x_k_k1, &y, H);
+
+		if (H == 0) {
+			return -1;
+		}
+
+        cnkalman_find_k(mk, &K, H, R);
+
+		// Calculate the next state
+		CN_CREATE_STACK_VEC(Ky, filter_cnt);
+		cnGEMM(&K, &y, 1, 0, 0, &Ky, 0);
+
+		int filter_idx = 0;
+        for(int ki = 0, state_idx = 0;ki < mk->ks_cnt;ki++) {
+            CnMat x_k_k1_view = cnMatView(mk->ks[ki]->state_cnt, 1, &x_k_k1, state_idx, 0);
+            CnMat x_k_k_view = cnMatView(mk->ks[ki]->state_cnt, 1, &x_k_k, state_idx, 0);
+            CnMat ky_view = cnMatView(mk->ks[ki]->error_state_size, 1, &Ky, filter_idx, 0);
+            cnkalman_update_state(user, mk->ks[ki], &x_k_k1_view, 1, &ky_view, &x_k_k_view);
+            result += cnNorm2(&y);
+            state_idx += mk->ks[ki]->state_cnt;
+            filter_idx += mk->ks[ki]->error_state_size;
+        }
+
+		if(stats) {
+            CN_CREATE_STACK_MAT(yp, Z->rows, Z->cols);
+            cnkalman_find_residual(mk, user, Z, &x_k_k, &yp, 0);
+		    stats->origerror = sqrt(result);
+            stats->besterror = cnNorm(&yp);
+			if(stats->total_stats) {
+				stats->total_stats->orignorm_acc += stats->origerror;
+				stats->total_stats->bestnorm_acc += stats->besterror;
+
+				stats->total_stats->total_runs++;
+				stats->total_stats->total_hevals++;
+				stats->total_stats->total_fevals++;
+			}
+		}
+	}
+
+	assert(cn_is_finite(H));
+	assert(cn_is_finite(&K));
+
+	if (mk->ks[0]->log_level > KALMAN_LOG_LEVEL) {
+		fprintf(stdout, "INFO kalman_update to    ");
+		kalman_print_mat(mk->ks[0], "x1", &x_k_k, false);
+	}
+
+	if(mk->ks_cnt > 1 || x_k_k.data != mk->ks[0]->state.data) {
+        for(int ki = 0, state_idx = 0;ki < mk->ks_cnt;ki++) {
+            CnMat x_k_k_view = cnMatView(mk->ks[ki]->state_cnt, 1, &x_k_k, state_idx, 0);
+            cnCopy(&x_k_k_view, &mk->ks[ki]->state, 0);
+            state_idx += mk->ks[ki]->state_cnt;
+        }
+	}
+
+    for(int ki = 0, state_idx = 0;ki < mk->ks_cnt;ki++) {
+        CnMat kv = cnMatView(mk->ks[ki]->error_state_size, Z->rows, &K, state_idx, 0);
+        CnMat hv = cnMatView(Z->rows, mk->ks[ki]->error_state_size, H, 0, state_idx);
+        cnkalman_update_covariance(mk->ks[ki], &kv, &hv, R);
+        state_idx += mk->ks[ki]->error_state_size;
+
+        mk->ks[ki]->t = t;
+        assert(sane_covariance(&mk->ks[ki]->P));
+    }
+
+	if (adaptive) {
+        calculate_adaptive_covariance(mk, user, Z, R, &Pm, H);
+    }
+
+	CN_FREE_STACK_MAT(K);
+	CN_FREE_STACK_MAT(HStorage);
+	CN_FREE_STACK_MAT(x_k_k1);
+	CN_FREE_STACK_MAT(Pm);
+
+	return result;
+}
+
+FLT cnkalman_meas_model_predict_update_stats(FLT t, struct cnkalman_meas_model *mk, void *user,
+												   const struct CnMat *Z, CnMat *R,
+												   struct cnkalman_update_extended_stats_t *stats) {
+	return cnkalman_predict_update_state_extended_adaptive_internal(t, user, Z, R, mk, stats);
+}
+
+FLT cnkalman_meas_model_predict_update(FLT t, struct cnkalman_meas_model *mk, void *user,
+											 const struct CnMat *Z, CnMat *R) {
+	struct cnkalman_update_extended_stats_t stats = {.total_stats = &mk->stats};
+	return cnkalman_predict_update_state_extended_adaptive_internal(t, user, Z, R, mk, &stats);
+}
+
+FLT cnkalman_predict_update_state(FLT t, cnkalman_state_t *k, const struct CnMat *Z, const struct CnMat *H,
+										CnMat *R, bool adaptive) {
+	cnkalman_meas_model_t mk = {.adaptive = adaptive, .ks = {k}, .ks_cnt = 1, .term_criteria = { .max_iterations = 10}};
+	return cnkalman_meas_model_predict_update(t, &mk, (void *)H, Z, R);
+}
+
+void cnkalman_set_P(cnkalman_state_t *k, const FLT *p) { cn_set_diag(&k->P, p); }
diff --git a/libs/cnkalman/src/model.cc b/libs/cnkalman/src/model.cc
new file mode 100644
index 00000000..8ee1dcbd
--- /dev/null
+++ b/libs/cnkalman/src/model.cc
@@ -0,0 +1,226 @@
+#include <iostream>
+#include "cnkalman/model.h"
+namespace cnkalman {
+
+    bool KalmanMeasurementModel::residual(const CnMat& Z, const CnMat& x_t, CnMat* y, CnMat* H_k) {
+        CN_CREATE_STACK_VEC(pz, Z.rows);
+        auto rtn = predict_measurement(x_t, &pz, H_k);
+        //cn_elementwise_subtract(y, &pz, &Z);
+        cn_elementwise_subtract(y, &Z, &pz);
+        return rtn;
+    }
+
+    static bool kalman_measurement(void *user, const struct CnMat *Z, const struct CnMat *x_t, struct CnMat *y, struct CnMat *H_k) {
+        auto *self = static_cast<cnkalman::KalmanMeasurementModel *>(user);
+
+        return self->residual(*Z, *x_t, y, H_k);
+    }
+
+    KalmanMeasurementModel::KalmanMeasurementModel(KalmanModel* kalmanModel, const std::string &name, size_t meas_cnt) : meas_cnt(meas_cnt) {
+        cnkalman_meas_model_init(&kalmanModel->kalman_state, "meas", &meas_mdl, kalman_measurement);
+    }
+
+    cnkalman_update_extended_stats_t KalmanMeasurementModel::update(FLT t, const CnMat &Z, CnMat &R) {
+        struct cnkalman_update_extended_stats_t stats = {};
+        cnkalman_meas_model_predict_update_stats(t, &meas_mdl, this, &Z, &R, &stats);
+        return stats;
+    }
+
+    void KalmanMeasurementModel::sample_measurement(const CnMat &x, CnMat &Z, const CnMat &R) {
+        predict_measurement(x, &Z, 0);
+
+        CN_CREATE_STACK_MAT(RL, meas_cnt, meas_cnt);
+        cnSqRootSymmetric(&R, &RL);
+        CN_CREATE_STACK_MAT(X, meas_cnt, 1);
+        CN_CREATE_STACK_MAT(Xs, meas_cnt, 1);
+        cnRand(&X, 0, 1);
+        cnGEMM(&RL, &X, 1, 0, 0, &Xs, (cnGEMMFlags)0);
+        cn_elementwise_add(&Z, &Z, &Xs);
+    }
+
+    std::ostream &KalmanMeasurementModel::write(std::ostream & os) const {
+        os << "{" << std::endl;
+        os << "\t" << R"("name": ")" << this->meas_mdl.name << "\"," << std::endl;
+        os << "\t" << R"("meas_cnt": )" << meas_cnt << std::endl;
+        os << "}";
+        return os;
+    }
+
+    static void transition_bounce(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0, struct CnMat *x1, CnMat* F) {
+        auto *self = static_cast<cnkalman::KalmanModel *>(k->user);
+        self->predict(dt, *x0, x1, F);
+    }
+
+    static void kalman_process_noise_fn(void *user, FLT dt, const struct CnMat *x, struct CnMat *Q_out) {
+        auto *self = static_cast<cnkalman::KalmanModel *>(user);
+        self->process_noise(dt, *x, *Q_out);
+    }
+
+    KalmanModel::KalmanModel(const std::string& name, size_t state_cnt) : name(name), state_cnt(state_cnt) {
+        cnkalman_state_init(&kalman_state, state_cnt, transition_bounce, kalman_process_noise_fn, this, 0);
+        state = kalman_state.state.data;
+        stateM = &kalman_state.state;
+    }
+
+    void KalmanModel::sample_state(FLT dt, const CnMat &x0, CnMat &x1, const struct CnMat* iQ) {
+        CN_CREATE_STACK_MAT(X, state_cnt, 1);
+        CN_CREATE_STACK_MAT(Xs, state_cnt, 1);
+        CN_CREATE_STACK_MAT(Q, state_cnt, state_cnt);
+
+        CN_CREATE_STACK_MAT(QL, state_cnt, state_cnt);
+
+        predict(dt, x0, &x1);
+
+        if(!iQ) {
+            process_noise(dt, x1, Q);
+        }
+
+        cnSqRootSymmetric(iQ ? iQ : &Q, &QL);
+        cnRand(&X, 0, 1);
+        cnGEMM(&QL, &X, 1, 0, 0, &Xs, (cnGEMMFlags)0);
+        cn_elementwise_add(&x1, &x1, &Xs);
+    }
+
+    void KalmanModel::update(FLT t) {
+        cnkalman_predict_state(t, &kalman_state);
+    }
+
+    bool KalmanLinearMeasurementModel::predict_measurement(const CnMat &x, CnMat *z, CnMat *h) {
+        if(z) cnGEMM(&H, &x, 1, 0, 0, z, cnGEMMFlags(0));
+        if(h) cnCopy(&H, h, 0);
+        return true;
+    }
+
+    KalmanLinearMeasurementModel::KalmanLinearMeasurementModel(KalmanModel* kalmanModel, const std::string& name,
+                                                               const CnMat& H) : KalmanMeasurementModel(kalmanModel, name, H.rows) {
+        this->H = cnMatCalloc(H.rows, H.cols);
+        cnCopy(&H, &this->H, 0);
+    }
+
+    KalmanLinearMeasurementModel::~KalmanLinearMeasurementModel() {
+        free(this->H.data);
+    }
+
+    std::ostream& KalmanModel::write(std::ostream& os) const {
+        os << "\"model\": {" << std::endl;
+        os << "\t" << R"("name": ")" << name << "\"," << std::endl;
+        os << "\t" << R"("state_cnt": )" << state_cnt << "," << std::endl;
+        os << "\t" << R"("measurement_models": [)" << std::endl;
+        bool needsComma = false;
+        for(auto& meas : measurementModels) {
+            if(needsComma) os << ", ";
+            needsComma = true;
+            meas->write(os);
+        }
+        os << "]" << std::endl;
+        os << "}" << std::endl;
+        return os;
+    }
+
+    KalmanModel::~KalmanModel() {
+        cnkalman_state_free(&this->kalman_state);
+    }
+
+    /*
+    void KalmanModel::predict(double dt, const CnMat &x0, CnMat *x1, CnMat* F) {
+        int state_cnt = this->state_cnt;
+        CN_CREATE_STACK_MAT(F, state_cnt, state_cnt);
+        CN_CREATE_STACK_VEC(x1_, state_cnt);
+
+        predict(dt, F, x0);
+
+        // X_k|k-1 = F * X_k-1|k-1
+        cnGEMM(&F, &x0, 1, 0, 0, &x1_, (cnGEMMFlags)0);
+        cnCopy(&x1_, &x1, 0);
+        CN_FREE_STACK_MAT(x1_);
+        CN_FREE_STACK_MAT(F);
+    }
+*/
+
+    static inline bool bulk_update_fn(void *user, const struct CnMat *Zs, const struct CnMat *x_t, struct CnMat *ys, struct CnMat *H_ks) {
+        auto *self = static_cast<cnkalman::KalmanModel *>(user);
+        size_t meas_cnt = 0, max_cnt = 0;
+        for(auto& meas : self->measurementModels) {
+            meas_cnt += meas->meas_cnt;
+            if(max_cnt < meas->meas_cnt) max_cnt= meas->meas_cnt;
+        }
+
+        FLT* HStorage = (FLT*)alloca(max_cnt * self->state_cnt * sizeof(FLT));
+        int meas_idx = 0;
+
+        if(H_ks) {
+            cnSetZero(H_ks);
+        }
+
+        for(auto & meas : self->measurementModels) {
+            auto Z = cnVec(meas->meas_cnt, Zs->data + meas_idx);
+            auto y = cnVec(meas->meas_cnt, ys ? ys->data + meas_idx : 0);
+            auto H = cnMat(meas->meas_cnt, self->state_cnt, HStorage);
+            cnSetZero(&H);
+
+            bool okay = meas->residual(Z, *x_t, ys ? &y : 0, H_ks ? &H : 0);
+            if(!okay)
+                return false;
+
+            if(H_ks) {
+                cnCopyROI(&H, H_ks, meas_idx, 0);
+            }
+            meas_idx += (int)meas->meas_cnt;
+        }
+
+        return true;
+    }
+    void KalmanModel::bulk_update(FLT t, const std::vector<CnMat> &Zs, const std::vector<cnmatrix::Matrix> &Rs) {
+        assert(Zs.size() == Rs.size());
+        assert(Zs.size() == measurementModels.size());
+        size_t meas_cnt = 0;
+        int min_iterations = 0;
+        bool debugJac = false;
+        for(auto& meas : measurementModels) {
+            meas_cnt += meas->meas_cnt;
+            if(meas->meas_mdl.term_criteria.max_iterations > min_iterations)
+                min_iterations = meas->meas_mdl.term_criteria.max_iterations;
+            if(meas->meas_mdl.meas_jacobian_mode == cnkalman_jacobian_mode_debug)
+                debugJac = true;
+        }
+
+        CN_CREATE_STACK_VEC(Z, meas_cnt);
+        CN_CREATE_STACK_MAT(R, meas_cnt, meas_cnt);
+        int meas_idx = 0;
+        for(int i = 0;i < (int)Zs.size();i++) {
+            cnCopyROI(&Zs[i], &Z, meas_idx, 0);
+            cnCopyROI(Rs[i], &R, meas_idx, meas_idx);
+            meas_idx += (int)measurementModels[i]->meas_cnt;
+        }
+
+        cnkalman_meas_model bulk = { };
+        cnkalman_meas_model_init(&kalman_state, "bulk", &bulk, bulk_update_fn);
+        bulk.term_criteria.max_iterations = min_iterations;
+        if(debugJac)
+            bulk.meas_jacobian_mode = cnkalman_jacobian_mode_debug;
+        cnkalman_meas_model_predict_update(t, &bulk, this, &Z, &R);
+    }
+
+    void KalmanModel::reset() {
+        cnkalman_state_reset(&kalman_state);
+    }
+
+    KalmanLinearPredictionModel::KalmanLinearPredictionModel(const std::string &name, size_t stateCnt)
+            : KalmanModel(name, stateCnt) {
+    }
+
+
+    void KalmanLinearPredictionModel::predict(FLT dt, const CnMat &x0, CnMat *x1, CnMat *cF) {
+        if(cF) {
+            cnCopy(&this->F(), cF, 0);
+        }
+        if(x1) {
+            CN_CREATE_STACK_VEC(x1_, state_cnt);
+
+            // X_k|k-1 = F * X_k-1|k-1
+            cnGEMM(&F(), &x0, 1, 0, 0, &x1_, (cnGEMMFlags)0);
+            cnCopy(&x1_, x1, 0);
+            CN_FREE_STACK_MAT(x1_);
+        }
+    }
+}
\ No newline at end of file
diff --git a/libs/cnkalman/src/numerical_diff.c b/libs/cnkalman/src/numerical_diff.c
new file mode 100644
index 00000000..8e5b5936
--- /dev/null
+++ b/libs/cnkalman/src/numerical_diff.c
@@ -0,0 +1,45 @@
+#include <cnkalman/numerical_diff.h>
+
+bool cnkalman_numerical_differentiate(void * user, enum cnkalman_numerical_differentiate_mode mode, cnkalman_eval_fn_t fn, const CnMat* cx, CnMat* H) {
+	return cnkalman_numerical_differentiate_step_size(user, mode, -1, fn, cx, H);
+}
+
+bool cnkalman_numerical_differentiate_step_size(void * user, enum cnkalman_numerical_differentiate_mode mode, FLT step_size, cnkalman_eval_fn_t fn, const CnMat* cx, CnMat* H) {
+	CN_CREATE_STACK_VEC(y1, H->rows);
+	CN_CREATE_STACK_VEC(y2, H->rows);
+
+	CN_CREATE_STACK_VEC(x, cx->rows);
+	cn_matrix_copy(&x, cx);
+
+	if(mode != cnkalman_numerical_differentiate_mode_two_sided) {
+		if(fn(user, &x, &y2) == false) return false;
+	}
+
+	FLT* xa = cn_as_vector(&x);
+	for(int i = 0;i < cx->rows;i++) {
+		FLT s = step_size;
+		if(step_size <= 0) {
+			s = xa[i] == 0 ? 1e-7 : fmax(1e-9, fabs(xa[i] * 1e-8));
+		}
+
+		if(mode == cnkalman_numerical_differentiate_mode_two_sided) {
+			xa[i] += s;
+			if (fn(user, &x, &y1) == false) return false;
+			xa[i] -= 2 * s;
+			if (fn(user, &x, &y2) == false) return false;
+			xa[i] += s;
+			s = 2 * s;
+		} else {
+			s *= mode == cnkalman_numerical_differentiate_mode_one_sided_minus ? -1 : 1;
+			xa[i] += s;
+			if (fn(user, &x, &y1) == false) return false;
+			xa[i] -= s;
+		}
+
+		for(int j = 0;j < H->rows;j++) {
+			cnMatrixSet(H, j, i, (_y1[j] - _y2[j])/s);
+		}
+	}
+
+	return true;
+}
\ No newline at end of file
diff --git a/libs/cnkalman/tests/CMakeLists.txt b/libs/cnkalman/tests/CMakeLists.txt
new file mode 100644
index 00000000..a14597d2
--- /dev/null
+++ b/libs/cnkalman/tests/CMakeLists.txt
@@ -0,0 +1,37 @@
+include_directories(../include)
+
+include_directories(.)
+
+cnkalman_generate_code(./models/BikeLandmarks.py)
+cnkalman_generate_code(./models/BearingsOnlyTracking.py)
+cnkalman_generate_code(./models/EggLandscape.py)
+cnkalman_generate_code(./models/BearingAccel.py)
+add_executable(TestKalmanModels ModelRunner.cc TestKalmanModels.cc models/LinearPoseVel.h ./models/BikeLandmarks.gen.h
+        ./models/EggLandscape.gen.h ./models/BearingsOnlyTracking.gen.h ./models/BearingAccel.gen.h)
+target_link_libraries(TestKalmanModels cnkalman)
+
+add_executable(kalman_example kalman_example.c)
+target_link_libraries(kalman_example cnkalman)
+
+add_test(NAME runbenchmarks
+        COMMAND ${CMAKE_COMMAND}
+        -Dcnkalman_root_source_dir=${cnkalman_root_source_dir}
+        -DTEST_KALMAN_MODELS=$<TARGET_FILE:TestKalmanModels>
+        -DBENCHMARK_SCRIPT=${CMAKE_CURRENT_SOURCE_DIR}/benchmark.py
+        -P ${CMAKE_CURRENT_SOURCE_DIR}/runbenchmarks.cmake)
+
+add_test(NAME TestKalmanModels COMMAND TestKalmanModels)
+
+
+find_package(GTest)
+if(GTest_FOUND)
+    include(GoogleTest)
+
+    add_executable(kalman_toy kalman_toy.cc)
+    target_link_libraries(kalman_toy cnkalman GTest::gtest GTest::gtest_main)
+    gtest_discover_tests(kalman_toy)
+
+    add_executable(cnkalman-tests multi_model_test.cc feature_tests.cc)
+    target_link_libraries(cnkalman-tests cnkalman GTest::gtest GTest::gtest_main)
+    gtest_discover_tests(cnkalman-tests)
+endif()
diff --git a/libs/cnkalman/tests/ModelRunner.cc b/libs/cnkalman/tests/ModelRunner.cc
new file mode 100644
index 00000000..dc13967a
--- /dev/null
+++ b/libs/cnkalman/tests/ModelRunner.cc
@@ -0,0 +1,378 @@
+#include <vector>
+#include <cmath>
+
+#include "cnkalman/kalman.h"
+#include "cnkalman/model.h"
+#include "ModelRunner.h"
+
+
+#include <iostream>
+#include <cnmatrix/cn_matrix.h>
+#include <fstream>
+
+#if HAS_SCIPLOT
+#include <sciplot/sciplot.hpp>
+
+namespace sciplot::internal {
+    template<>
+    auto escapeIfNeeded<std::tuple<double, double>>(const std::tuple<double, double> &val) {
+        return internal::str(std::get<0>(val)) + " " + internal::str(std::get<1>(val));
+    }
+}
+
+#endif
+
+static inline FLT linmath_enforce_range(FLT v, FLT mn, FLT mx) {
+    if (v < mn)
+        return mn;
+    if (v > mx)
+        return mx;
+    return v;
+}
+
+static inline uint32_t create_rgb(const FLT* rgb) {
+    FLT _r = rgb[0], _g = rgb[1], _b = rgb[2];
+    uint8_t r = linmath_enforce_range(_r * 127 + 127, 0, 255);
+    uint8_t g = linmath_enforce_range(_g * 127 + 127, 0, 255);
+    uint8_t b = linmath_enforce_range(_b * 127 + 127, 0, 255);
+    return 0xff << 24 | b << 16 | g << 8 | r;
+}
+
+static inline std::ostream& write_matrix(std::ostream& os, const char* name, const std::vector<std::tuple<FLT, FLT>>& m) {
+    os << "\"" << name << "\": [" << std::endl;
+    bool needsCommaA = false;
+    for(auto& row : m) {
+        if(needsCommaA) os << ",";
+        needsCommaA = true;
+
+        os << "\t[" << std::get<0>(row) << ", " << std::get<1>(row) << "]" << std::endl;
+    }
+    os << "]" << std::endl;
+    return os;
+}
+static inline std::ostream& write_matrix(std::ostream& os, const std::vector<std::vector<FLT>>& m) {
+    os << "[" << std::endl;
+    bool needsCommaA = false;
+    for(auto& row : m) {
+        if(needsCommaA) os << ",";
+        needsCommaA = true;
+
+        bool needsCommaB = false;
+        os << "\t[ ";
+        for(auto& col : row) {
+            if(needsCommaB) os << ",";
+            needsCommaB = true;
+
+            os << col;
+        }
+        os << "]" << std::endl;
+    }
+    os << "]" << std::endl;
+    return os;
+}
+static inline std::ostream& write_matrix(std::ostream& os, const char* name, const std::vector<std::vector<FLT>>& m) {
+    os << "\"" << name << "\":";
+    return write_matrix(os, m);
+}
+static inline std::ostream& write_matrix(std::ostream& os, const char* name, const std::vector<std::vector<std::vector<FLT>>>& m) {
+    os << "\"" << name << "\": [" << std::endl;
+    bool needsCommaA = false;
+    for(auto& matrix : m) {
+        if(needsCommaA) os << ",";
+        needsCommaA = true;
+        write_matrix(os, matrix);
+    }
+    os << "]" << std::endl;
+    return os;
+}
+static inline std::ostream& write_matrix(std::ostream& os, const CnMat& m) {
+    os << "[" << std::endl;
+    for(int i = 0;i < m.rows;i++) {
+        if(i != 0) os << ", ";
+        os << "\t[";
+        for(int j = 0;j < m.cols;j++) {
+            if(j != 0) os << ", ";
+            os << m(i, j) << " ";
+        }
+        os << "]" << std::endl;
+    }
+    os << "]";
+    return os;
+}
+template <typename M>
+static inline std::ostream& write_matrix(std::ostream& os, const char* name, const std::vector<M>& m) {
+    os << "\"" << name << "\": [" << std::endl;
+    bool needsCommaA = false;
+    for(auto& matrix : m) {
+        if(needsCommaA) os << ",";
+        needsCommaA = true;
+
+        write_matrix(os, matrix);
+    }
+    os << "]" << std::endl;
+    return os;
+}
+static inline std::ostream& write_matrix(std::ostream & os, const char* name, const CnMat* m) {
+    os << "\"" << name << "\":" << std::endl;
+    write_matrix(os, *m);
+    return os;
+}
+static inline std::ostream& write_matrix(std::ostream & os, const char* name, const cnmatrix::Matrix& m) {
+    return write_matrix(os, name, (const CnMat*)m);
+}
+
+void ModelRunner::Run(cnkalman::KalmanModel &model, bool show, const CnMat* iX, const CnMat *Q, std::vector<cnmatrix::Matrix> mRs) {
+    cnkalman::ModelPlot plotter(model.name);
+    plotter.show = show;
+    draw_gt = true;
+    Run(plotter, model, iX, Q, mRs);
+}
+void ModelRunner::Run(cnkalman::ModelPlot& plotter, cnkalman::KalmanModel &model, const CnMat* iX, const CnMat *Q, std::vector<cnmatrix::Matrix> mRs, bool drawMap) {
+    model.reset();
+
+    std::vector<std::vector<std::vector<FLT>>> observations;
+    std::vector<cnmatrix::Matrix> Xs;
+    std::vector<std::vector<FLT>> Xfs;
+    std::vector<std::tuple<double, double>> pts_GT;
+    std::vector<std::tuple<double, double>> pts_Filtered;
+
+    std::vector<double> origerrors, besterrors, error;
+
+    FLT deviations = 2;
+
+    std::vector<cnmatrix::Matrix> Rs;
+    for (auto &measModel : model.measurementModels) {
+        auto &R = Rs.emplace_back(measModel->default_R());
+        measModel->meas_mdl.term_criteria.max_iterations = max_iterations;
+        //measModel->meas_mdl.meas_jacobian_mode = cnkalman_jacobian_mode_debug;
+    }
+
+    //model.kalman_state.transition_jacobian_mode = cnkalman_jacobian_mode_debug;
+
+    std::vector<std::vector<std::vector<FLT>>> covs;
+    std::vector<double> meas_error;
+
+    CN_CREATE_STACK_VEC(X, model.state_cnt);
+    cnCopy(model.stateM, &X, 0);
+
+    if(iX) {
+        cnCopy(iX, model.stateM, 0);
+    }
+
+    FLT t = 1 - dt;
+    plotter.include_point_in_range(model.state);
+    plotter.include_point_in_range(_X);
+
+    covs.emplace_back(cnMatToVectorVector(model.kalman_state.P));
+    pts_Filtered.emplace_back(model.state[0], model.state[1]);
+    pts_GT.emplace_back(X.data[0], X.data[1]);
+    Xfs.push_back(cnMatToVector(*model.stateM));
+
+    srand(42);
+    for (int i = 0; i <= iterations; i++) {
+        t += dt;
+        model.sample_state(dt, X, X, Q);
+        Xs.push_back(X);
+    }
+
+    t = 1 - dt;
+    for (int i = 0; i <= iterations; i++) {
+        t += dt;
+
+        X = Xs[i];
+
+        plotter.include_point_in_range(X.data);
+
+        pts_GT.emplace_back(X.data[0], X.data[1]);
+
+        if (i % run_every != 0)
+            continue;
+
+        model.update(t);
+
+        if (i % ellipse_step == 0) {
+            //plotter.plot_cov(model, deviations, "blue");
+        }
+
+        FLT error = 0, oerror = 0;
+
+        std::vector<std::vector<FLT>> obs;
+        std::vector<CnMat> Zs;
+        for (int j = 0; j < (int) model.measurementModels.size(); j++) {
+            auto &measModel = model.measurementModels[j];
+            auto Z = cnMatCalloc(measModel->meas_cnt, 1);
+            measModel->sample_measurement(X, Z, mRs.size() ? mRs[j] : Rs[j]);
+            Zs.push_back(Z);
+            obs.push_back(cnMatToVector(Z));
+        }
+
+        if (bulk_update) {
+            model.bulk_update(t, Zs, Rs);
+        } else {
+            for (int j = 0; j < (int) model.measurementModels.size(); j++) {
+                auto &measModel = model.measurementModels[j];
+                auto Z = Zs[j];
+
+                if(run_every_per_meas.empty() || iterations % run_every_per_meas[j] == 0) {
+                    auto stats = measModel->update(t, Z, Rs[j]);
+                    error += stats.besterror;
+                    oerror += stats.origerror;
+                }
+            }
+        }
+
+        for (int j = 0; j < (int) model.measurementModels.size(); j++) {
+            auto &Z = Zs[j];
+            free(Z.data);
+        }
+
+        observations.push_back(obs);
+        plotter.include_point_in_range(model.state);
+
+        Xfs.push_back(cnMatToVector(*model.stateM));
+        pts_Filtered.emplace_back(model.state[0], model.state[1]);
+
+        besterrors.push_back(error);
+        origerrors.push_back(oerror);
+
+        meas_error.emplace_back(cnDistance(&X, model.stateM));
+
+        if (i % ellipse_step == 0) {
+            plotter.plot_cov(model, deviations);
+        }
+        covs.emplace_back(cnMatToVectorVector(model.kalman_state.P));
+    }
+
+    std::string suffix = "";
+    if(!settingsName.empty()) suffix += "." + settingsName;
+    std::ofstream f(model.name + suffix + ".kf");
+    f << "{" << std::endl;
+    model.write(f);
+    f << ", " << std::endl;
+    write_matrix(f, "Z", observations) << ", ";
+    write_matrix(f, "X", Xs) << ", ";
+
+    if(Q) {
+        write_matrix(f, "Q", Q) << ", ";
+    }
+
+    write_matrix(f, "Xf", Xfs) << ", ";
+    write_matrix(f, "Ps", covs) << ", ";
+    write_matrix(f, "Rs", Rs) << ", ";
+    f << "\"run_every\":" << run_every << ", " << std::endl;
+    f << "\"dt\":" << dt << ", " << std::endl;
+    f << "\"ellipse_step\":" << ellipse_step << "" << std::endl;
+    f << "}";
+
+    model.draw(plotter);
+#ifdef HAS_SCIPLOT
+
+    //plotter.plot.drawWithVecs("lines", origerrors).label(settingsName + " Orig");
+    plotter.cnt++;
+    plotter.plot.drawWithVecs("lines", besterrors).label(settingsName + " Best").dashType(plotter.cnt % 12).lineStyle(plotter.cnt % 16);
+    plotter.plot.drawWithVecs("lines", meas_error).label(settingsName + " GT Error").dashType(plotter.cnt % 12).lineStyle(plotter.cnt % 16);
+
+    if (!model.measurementModels.empty() && drawMap ) {
+        cnmatrix::Matrix map(250, 250 * 3);
+        cnmatrix::Matrix map2(250, 250);
+
+        FLT x,y,w,h;
+        plotter.get_view(x, y, w, h);
+        CN_CREATE_STACK_VEC(S, model.state_cnt);
+        int color_offset = 0;
+        FLT min_z[4] = {INFINITY,INFINITY,INFINITY, INFINITY}, max_z[4] = {-INFINITY, -INFINITY, -INFINITY, -INFINITY};
+
+        for (int j = 0; j < 250; j++) {
+            for (int i = 0; i < 250; i++) {
+                FLT sx = i / 250. * w + x - w / 2;
+                FLT sy = j / 250. * w + y - w / 2;
+                S.data[0] = sx;
+                S.data[1] = sy;
+                color_offset = 0;
+                for(size_t z = 0;z < model.measurementModels.size();z++) {
+                    auto &meas = model.measurementModels[z];
+                    auto Z = cnmatrix::Matrix(meas->meas_cnt);
+                    meas->predict_measurement(S, Z, 0);
+                    auto iR = cnmatrix::Matrix::Like(Rs[z]);
+                    cnInvert(Rs[z], iR, CN_INVERT_METHOD_SVD);
+
+                    auto rZ = cnmatrix::Matrix::Like(Z);
+                    cnGEMM(iR, Z, 1, 0, 0, rZ, (enum cnGEMMFlags)0);
+
+                    for(size_t zc = 0;zc < meas->meas_cnt;zc++) {
+                        map(j, i * 3 + (color_offset++) % 3) += fabs(rZ(zc));
+                    }
+                }
+            }
+        }
+        for (int j = 0; j < 250; j++) {
+            for (int i = 0; i < 250; i++) {
+                for(int k = 0;k < 3;k++) {
+                    auto v = map(j, i * 3 + k);
+                    min_z[k] = fmin(min_z[k], v);
+                    max_z[k] = fmax(max_z[k], v);
+                    map2(j, i) += v;
+                }
+                min_z[3] = fmin(min_z[3], map2(j,i));
+                max_z[3] = fmax(max_z[3], map2(j,i));
+            }
+        }
+
+        FILE *f = fopen("map.rgb", "w");
+        FILE *f2 = fopen("map2.rgb", "w");
+        for (int j = 0; j < 250; j++) {
+            for (int i = 0; i < 250; i++) {
+                for(int k = 0;k < 3;k++) {
+                    auto v = (map(j, i * 3 + k) - min_z[k]) / (max_z[k] - min_z[k] + 1e-10);
+                    uint8_t b = fmax(0, fmin(255, v * 255));
+                    fwrite(&b, 1, 1, f);
+                }
+                uint8_t b = 0xff;
+
+                auto v = (map2(j, i) - min_z[3]) / (max_z[3] - min_z[3] + 1e-10);
+                b = fmax(0, fmin(255, v * 255));
+                uint8_t alpha = 0;
+
+                FLT div = .1;
+                FLT fv = fmod(v, div);
+                if(fv > div/2) fv = div - fv;
+                FLT pa = .01, pb = .005;
+                auto fvm = fmax(0, fmin(1, (fv - pb) / (pa - pb)));
+                alpha = 80 * fvm + (1-fvm) * 0;
+
+                for(int zz = 0;zz < 3;zz++) fwrite(&b, 1, 1, f2);
+                fwrite(&alpha, 1, 1, f);
+                fwrite(&alpha, 1, 1, f2);
+            }
+        }
+        fclose(f2);
+        fclose(f);
+
+        {
+            std::stringstream ss;
+            ss << "'map.rgb' binary array=(250,250) center=(" << x << ", " << y << ") dx=" << (w / 250.)
+               << " format='%uchar' ";
+            plotter.map.draw(ss.str(), "", "rgbalpha").labelNone();
+        }
+        if(false)
+        {
+            std::stringstream ss;
+            ss << "'map2.rgb' binary array=(250,250) center=(" << x << ", " << y << ") dx=" << (w / 250.)
+               << " format='%uchar' ";
+            plotter.map.draw(ss.str(), "", "rgbalpha").labelNone();
+        }
+    }
+
+    if(draw_gt)
+        plotter.map.drawWithVecs("lines", pts_GT).label(settingsName + " GT").lineWidth(3);
+    draw_gt = false;
+    plotter.cnt++;
+    plotter.map.drawWithVecs("lines", pts_Filtered).label(settingsName + " Filter").dashType(plotter.cnt % 12).lineStyle(plotter.cnt % 16);
+
+#endif
+
+}
+
+ModelRunner::ModelRunner(const std::string &settingsName, double dt, int iterations, int max_iterations, int runEvery, int ellipseStep,
+                         bool bulkUpdate) : settingsName(settingsName), dt(dt), iterations(iterations), max_iterations(max_iterations),
+                                            run_every(runEvery), ellipse_step(ellipseStep), bulk_update(bulkUpdate) {}
diff --git a/libs/cnkalman/tests/ModelRunner.h b/libs/cnkalman/tests/ModelRunner.h
new file mode 100644
index 00000000..23d000cb
--- /dev/null
+++ b/libs/cnkalman/tests/ModelRunner.h
@@ -0,0 +1,20 @@
+#include <cnkalman/model.h>
+#include "cnkalman/ModelPlot.h"
+
+struct ModelRunner {
+    std::string settingsName = "";
+    FLT dt = 0.1;
+    int iterations = 200;
+    int max_iterations = 0;
+    int run_every = 1;
+    int ellipse_step = 20;
+    bool bulk_update = false;
+    bool draw_gt = true;
+    std::vector<int> run_every_per_meas;
+
+    ModelRunner(const std::string &settingsName = "", double dt = .1, int iterations = 200, int max_iterations = 0, int runEvery = 1, int ellipseStep = 20,
+                bool bulkUpdate = false);
+
+    void Run(cnkalman::KalmanModel& model, bool show = false, const CnMat* X = 0, const CnMat* Q = 0, std::vector<cnmatrix::Matrix> mRs = {});
+    void Run(cnkalman::ModelPlot& plotter, cnkalman::KalmanModel& model, const CnMat* X = 0, const CnMat* Q = 0, std::vector<cnmatrix::Matrix> mRs = {}, bool drawMap = true);
+};
diff --git a/libs/cnkalman/tests/TestKalmanModels.cc b/libs/cnkalman/tests/TestKalmanModels.cc
new file mode 100644
index 00000000..9251388b
--- /dev/null
+++ b/libs/cnkalman/tests/TestKalmanModels.cc
@@ -0,0 +1,62 @@
+#include <models/BearingsOnlyTracking.h>
+#include "models/LinearPoseVel.h"
+#include "models/EggLandscape.h"
+#include "models/BikeLandmarks.h"
+#include "models/BearingAccel.h"
+
+#include "ModelRunner.h"
+
+int main(int argc, char** argv) {
+    bool show = argc > 1 && strcmp(argv[1], "--show") == 0;
+
+    ModelRunner defaultRunner;
+    ModelRunner defaultRunnerEvery10;
+    defaultRunnerEvery10.run_every = 10;
+
+    BikeLandmarks bikeModel;
+    EggLandscapeProblem eggModel;
+    LinearPoseVel linearModel;
+	BearingAccel bearingAccel;
+    BearingAccel bearingAccelNoIMU(false);
+    BearingAccel bearingAccelNoLandmark(true, false);
+
+	BearingsOnlyTracking bearingsOnlyTracking;
+
+    defaultRunnerEvery10.Run(bikeModel, show);
+    defaultRunner.Run(eggModel, show);
+    defaultRunner.Run(linearModel, show);
+    defaultRunner.Run(bearingsOnlyTracking, show);
+    {
+        cnkalman::ModelPlot iekfPlot("TDOA", show);
+
+        ModelRunner iekfRunner1000("iekf", .01, 5000, 10, 1, 500, false);
+        iekfRunner1000.run_every_per_meas = { 10, 10, 10, 10, 10, 10, 1};
+        ModelRunner runner1000("ekf", .01, 5000, 0, 1, 500, false);
+        runner1000.run_every_per_meas = { 10, 10, 10, 10, 10, 10, 1};
+
+        iekfRunner1000.Run(iekfPlot, bearingAccel, 0, 0, {}, true);
+        iekfRunner1000.settingsName = "iekf(No IMU)";
+        iekfRunner1000.Run(iekfPlot, bearingAccelNoIMU, 0, 0, {}, false);
+        runner1000.draw_gt = false;
+        runner1000.Run(iekfPlot, bearingAccel, 0, 0, {}, false);
+    }
+
+    CN_CREATE_STACK_MAT(Q, 2, 2);
+    std::vector<cnmatrix::Matrix> Rs;
+    Rs.emplace_back(1, 1);
+    Rs.emplace_back(1, 1);
+    Rs.emplace_back(1, 1);
+
+    CN_CREATE_STACK_VEC(initial_x, 2);
+    initial_x.data[0] = .5; initial_x.data[1] = .1;
+
+    cnkalman::ModelPlot iekfPlot("IEKF vs not");
+    iekfPlot.show = show;
+    ModelRunner testRunner("small-iteration", .1, 5, 0, 1, 1, false);
+    testRunner.Run(iekfPlot, bearingsOnlyTracking, &initial_x, &Q, Rs);
+
+    ModelRunner testRunnerIEKF("small-iteration-iekf", .1, 5, 5, 1, 1, false);
+    testRunnerIEKF.Run(iekfPlot, bearingsOnlyTracking, &initial_x, &Q, Rs);
+
+    return 0;
+}
diff --git a/libs/cnkalman/tests/benchmark.py b/libs/cnkalman/tests/benchmark.py
new file mode 100644
index 00000000..45293599
--- /dev/null
+++ b/libs/cnkalman/tests/benchmark.py
@@ -0,0 +1,132 @@
+import math
+import sys
+
+import numpy as np
+from filterpy.kalman import KalmanFilter
+from filterpy.stats import plot_covariance
+
+import json
+
+from matplotlib import pyplot as plt
+
+from models.EggLandscape import EggLandscape
+from models.RobotEKF import RobotEKF
+from models.BearingsOnlyTracking import BearingsOnlyTracking
+
+kp = json.load(open(sys.argv[1]))
+model = kp['model']
+
+def create_filter(model):
+    if model['name'] == "BikeLandmarks":
+        return RobotEKF(1, .5, [1.1, .01], .1, .015)
+    elif model['name'] == "EggLandscape":
+        return EggLandscape()
+    elif model['name'] == "BearingsOnlyTracking":
+        return BearingsOnlyTracking()
+    elif model['name'] == "LinearToy":
+        f = KalmanFilter(model['state_cnt'], model['measurement_models'][0]['meas_cnt'])
+        f.Q = np.array([
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, 0.01, 0],
+            [0, 0, 0, 0.01],
+        ])
+        f.H = np.array([
+            [-1, 0, 0, 0],
+            [0, 1, 0, 0],
+            [-.5, .5, 0, 0]
+        ])
+        f.F = np.array([
+            [1, 0, 1, 0],
+            [0, 1, 0, 1],
+            [0, 0, 1, 0],
+            [0, 0, 0, 1],
+        ])
+        return f
+
+
+matrices = {k: np.array(kp[k]) for k in kp.keys() if k != 'model'}
+Xf = matrices['Xf']
+GT = matrices['X']
+Ps = matrices['Ps']
+
+def run_model(kp, f):
+    ellipse_step = kp['ellipse_step']
+    run_every = kp['run_every']
+
+    Xf = matrices['Xf']
+    GT = matrices['X']
+    Ps = matrices['Ps']
+
+    Zs = matrices['Z']
+
+    f.x = np.array([Xf[0]]).transpose()
+    f.P = Ps[0]
+
+    Rs = matrices['Rs']
+
+    Xs = []
+
+    ax = plt.gca()  # you first need to get the axis handle
+    ax.set_aspect(1)  # sets the height to width ratio to 1.5.
+
+    dZs = []
+    z_idx = 0
+    Xs.append(f.x)
+    pError = 0
+
+    for i in range(0, GT.shape[0]):
+        if i % run_every != 0:
+            continue
+
+        f.predict()
+
+        if i % ellipse_step == 0 and False:
+            plot_covariance(
+                (f.x[0], f.x[1]), cov=f.P[0:2, 0:2],
+                std=1, facecolor='k', alpha=0.3)
+
+        z = Zs[z_idx]
+        z_idx += 1
+
+        if len(model['measurement_models']) == 1:
+            R = Rs[0]
+            f.update(z[0].reshape(-1, 1), Rs[0])
+        else:
+            for meas_idx in range(len(model['measurement_models'])):
+                R = Rs[meas_idx]
+                if hasattr(f, 'predict_meas'):
+                    ZGt = f.predict_meas(GT[i].reshape(-1, 1), meas_idx)
+                    dZ = z[meas_idx].reshape(-1) - ZGt.reshape(-1)
+                    dZs.append(dZ)
+                f.update(z[meas_idx].reshape(-1, 1), Rs[meas_idx], meas_idx)
+
+        dP = f.P - Ps[z_idx]
+        pError = np.linalg.norm(dP) ** 2
+
+        Xs.append(f.x)
+
+        if i % ellipse_step == 0:
+            plot_covariance(
+                (f.x[0], f.x[1]), cov=f.P[0:2, 0:2],
+                std=1, facecolor='g', alpha=0.8)
+
+    Xs = np.array(Xs)
+    err = np.linalg.norm(Xf - Xs.reshape(Xf.shape)) / Xf.shape[0]
+    print(err, math.sqrt(pError) / Xf.shape[0])
+
+    has_error = math.sqrt(pError) / Xf.shape[0] > 1e-3 or err > 1e-3
+
+
+    return Xs
+
+f = create_filter(model)
+Xs = run_model(kp, f)
+
+plt.plot(Xs[:, 0], Xs[:, 1], label="pyfilter")
+plt.plot(Xf[:, 0], Xf[:, 1], '--', label="cnkalman")
+plt.plot(GT[:, 0], GT[:, 1], '-.', label="GT")
+plt.legend()
+
+if '--show' in sys.argv:
+    plt.show()
diff --git a/libs/cnkalman/tests/feature_tests.cc b/libs/cnkalman/tests/feature_tests.cc
new file mode 100644
index 00000000..d143e4e5
--- /dev/null
+++ b/libs/cnkalman/tests/feature_tests.cc
@@ -0,0 +1,41 @@
+#include <cnkalman/kalman.h>
+#include <cnmatrix/cn_matrix.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <gtest/gtest.h>
+#include "test_utils.h"
+
+TEST(Kalman, Predict) {
+    FLT _F[] = {
+            0, 0, 0, 0, 1,
+            0, 0, 0, 1, 0,
+            0, 0, 1, 0, 0,
+            0, 1, 0, 0, 0,
+            1, 0, 0, 0, 0,
+    };
+    CnMat F = cnMat(5, 5, _F);
+    FLT X[5] = { 1, 2, 3, 4, 5};
+
+    cnkalman_state_t m = { 0 };
+    cnkalman_state_init(&m, 5, cnkalman_linear_transition_fn, 0, &F, X);
+
+    FLT X0[5] = { 1, 2, 3, 4, 5};
+    ASSERT_DOUBLE_ARRAY_EQ(5, X, X0);
+    cnkalman_predict_state(1, &m);
+    cnkalman_state_free(&m);
+}
+
+void error_fn(void *user, const struct CnMat *x0, const struct CnMat *x1, struct CnMat *error_state, struct CnMat *E_jac_x1) {
+    CN_CREATE_STACK_MAT(xd, x0->rows, 1);
+    if(error_state) {
+        cn_elementwise_subtract(&xd, x1, x0);
+        for (int i = 0; i < x0->rows; i++) {
+            cn_as_vector(error_state)[i] = 10. * cn_as_vector(&xd)[i];
+        }
+    }
+    if(E_jac_x1) {
+        cn_set_diag_val(E_jac_x1, 10);
+    }
+}
diff --git a/libs/cnkalman/tests/kalman_example.c b/libs/cnkalman/tests/kalman_example.c
new file mode 100644
index 00000000..732b84c8
--- /dev/null
+++ b/libs/cnkalman/tests/kalman_example.c
@@ -0,0 +1,37 @@
+#include <cnkalman/kalman.h>
+#include <stdio.h>
+
+static inline void kalman_transition_model_fn(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0,
+                                             struct CnMat *x1, struct CnMat *F) {
+    // Logic to fill in the next state x1 and the associated transition matrix F
+}
+
+static inline void kalman_process_noise_fn(void *user, FLT dt, const struct CnMat *x, struct CnMat *Q) {
+    // Logic to fill in the process covariance Q
+}
+
+static inline bool kalman_measurement_model_fn(void *user, const struct CnMat *Z, const struct CnMat *x_t,
+                                              struct CnMat *y, struct CnMat *H_k) {
+    // Logic to fill in the residuals `y`, and the jacobian of the predicted measurement function `h`
+    return false; // This should return true if the jacobian and evaluation were valid.
+}
+
+int main() {
+    int state_cnt = 1;
+    cnkalman_state_t kalman_state = { 0 };
+    cnkalman_state_init(&kalman_state, state_cnt, kalman_transition_model_fn, kalman_process_noise_fn, 0, 0);
+    // Uncomment the next line if you want to use numerical jacobians for the transition matrix
+    //kalman_state.transition_jacobian_mode = cnkalman_jacobian_mode_two_sided;
+
+    cnkalman_meas_model_t kalman_meas_model = { 0 };
+    cnkalman_meas_model_init(&kalman_state, "Example Measurement", &kalman_meas_model, kalman_measurement_model_fn);
+    // Uncomment the next line if you want to use numerical jacobians for this measurement
+    // kalman_meas_model.meas_jacobian_mode = cnkalman_jacobian_mode_two_sided;
+
+    CnMat Z, R;
+    // Logic to fill in measurement matrix Z and measurement covariance matrix R
+    cnkalman_meas_model_predict_update(1, &kalman_meas_model, 0, &Z, &R);
+
+    printf("Output:%f\n", cn_as_vector(&kalman_state.state)[0]);
+    return 0;
+}
\ No newline at end of file
diff --git a/libs/cnkalman/tests/kalman_toy.cc b/libs/cnkalman/tests/kalman_toy.cc
new file mode 100644
index 00000000..0ee35c03
--- /dev/null
+++ b/libs/cnkalman/tests/kalman_toy.cc
@@ -0,0 +1,232 @@
+#include <cnkalman/kalman.h>
+#include <cnmatrix/cn_matrix.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <gtest/gtest.h>
+#include "test_utils.h"
+
+// Generates the transition matrix F
+void transition(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0,
+											 struct CnMat *x1, struct CnMat *f_out) {
+	if(f_out) {
+		cn_set_zero(f_out);
+		cn_set_diag_val(f_out, 1);
+	}
+	if(x1) {
+		cnCopy(x0, x1, 0);
+	}
+}
+
+typedef FLT LinmathPoint2d[2];
+
+static void process_noise(void *user, FLT dt, const struct CnMat *x, struct CnMat *Q_out) {
+	cn_set_zero(Q_out);
+	cn_set_diag_val(Q_out, .1);
+}
+
+static FLT measurement_model(const LinmathPoint2d sensor_pos, const LinmathPoint2d state) {
+	return atan2(state[1] - sensor_pos[1], state[0] - sensor_pos[0]);
+	// return atan2(state[0] - sensor_pos[0], state[1] - sensor_pos[1]);
+}
+
+static bool toy_measurement_model(void *user, const struct CnMat *Z, const struct CnMat *x_t, struct CnMat *y,
+								  struct CnMat *H_k) {
+	auto sensors = (const LinmathPoint2d *)user;
+
+	for (int i = 0; i < Z->rows; i++) {
+		if (y) {
+			y->data[i] = Z->data[i] - measurement_model(sensors[i], x_t->data);
+		}
+		if (H_k) {
+			FLT dx = x_t->data[0] - sensors[i][0];
+			FLT dy = x_t->data[1] - sensors[i][1];
+			FLT scale = 1. / ((dx * dx) + (dy * dy));
+			cnMatrixSet(H_k, i, 0, scale * -dy);
+			cnMatrixSet(H_k, i, 1, scale * dx);
+		}
+	}
+
+	return true;
+}
+
+void run_standard_experiment(LinmathPoint2d X_out, FLT *P, const term_criteria_t *termCriteria, int time_steps) {
+	LinmathPoint2d true_state = {1.5, 1.5};
+	FLT X[2] = {0.5, 0.1};
+	LinmathPoint2d sensors[2] = {{0, 0}, {1.5, 0}};
+	cnkalman_state_t state = {};
+	cnkalman_state_init(&state, 2, transition, process_noise, 0, X);
+	//    state.debug_jacobian = true;
+	//state.log_level = 101;
+	cn_set_diag_val(&state.P, .1);
+
+	CN_CREATE_STACK_MAT(Z, 2, 1);
+	FLT Rv = M_PI * M_PI * 1e-5;
+	FLT _R[] = {Rv, Rv};
+
+	for (int i = 0; i < 2; i++) {
+		Z.data[i] = measurement_model(sensors[i], true_state);
+	}
+
+	struct cnkalman_meas_model meas_model = {
+		.ks = {&state},
+		.ks_cnt = 1,
+		.Hfn = toy_measurement_model,
+		.term_criteria = *termCriteria,
+	};
+
+	CnMat R = cnVec(2, _R);
+
+	for (int i = 0; i < time_steps; i++) {
+		cnkalman_meas_model_predict_update(1, &meas_model, sensors, &Z, &R);
+		printf("%3d: %7.6f %7.6f\n", i, X[0], X[1]);
+	}
+
+	memcpy(P, state.P.data, sizeof(FLT) * 4);
+	memcpy(X_out, X, sizeof(FLT) * 2);
+	cnkalman_state_free(&state);
+}
+
+// https://www.diva-portal.org/smash/get/diva2:844060/FULLTEXT01.pdf
+
+TEST(Kalman, EKFTest) {
+	/**
+	 * These values are not the ones shown in the chart in the paper, but it's unclear if that chart is with noise or
+	 * without noise in the measurement. Results reproduced in octave:
+
+	  x0 = [.5, .1]
+	  PQv = .1 * .1
+	  P0 = [PQv 0; 0 PQv]
+	  Q = [ 0 0; 0 0]
+	  rv = pi*pi * 10^-5
+	  R = [rv 0; 0 rv]
+
+	  P0_1 = P0 + Q
+	  x_t = [1.5 1.5]
+	  S0 = [ 0, 0]
+	  S1 = [ 1.5, 0]
+	  y0 = atan2(x_t(2) - S0(2), x_t(1) - S0(1))
+	  y1 = atan2(x_t(2) - S1(2), x_t(1) - S1(1))
+	  y = [y0; y1]
+	  h0 = atan2(x0(2) - S0(2), x0(1) - S0(1))
+	  h1 = atan2(x0(2) - S1(2), x0(1) - S1(1))
+	  hx = [h0;h1]
+	  H0 = [-(x0(2) - S0(2)) (x0(1) - S0(1))] / ( (x0(2) - S0(2))^2 + (x0(1) - S0(1))^2)
+	  H1 = [-(x0(2) - S1(2)) (x0(1) - S1(1))] / ( (x0(2) - S1(2))^2 + (x0(1) - S1(1))^2)
+	  H = [H0; H1]
+
+	  K0 = P0_1 * H' * (H * P0_1 * H' + R)^-1
+
+	  x1 = x0' + K0 * (y - hx)
+	  P1 = (eye(2) - K0 * H) * P0_1
+
+	 */
+	FLT expected_X[2] = {4.4049048331227372, 1.1884307714294169};
+	FLT expected_P[4] = {0.0014050624776344668, 0.00019267084936229752, 0.0001926708493623336, 4.751355804986091e-05};
+
+	FLT X[2];
+	FLT P[4];
+	term_criteria_t termCriteria = {};
+
+	run_standard_experiment(X, P, &termCriteria, 1);
+
+	LinmathPoint2d true_state = {1.5, 1.5};
+	CnMat Xm = cnVec(2, X);
+	CnMat Tm = cnVec(2, true_state);
+	FLT error = cnDistance(&Xm, &Tm);
+
+	EXPECT_ARRAY_NEAR(2, X, expected_X, 1e-5);
+	EXPECT_ARRAY_NEAR(4, P, expected_P, 1e-5);
+}
+
+TEST(Kalman, IEKFTest) {
+	FLT X[2];
+	FLT P[4];
+	term_criteria_t termCriteria = {.max_iterations = 10};
+	run_standard_experiment(X, P, &termCriteria, 1);
+
+	LinmathPoint2d true_state = {1.5, 1.5};
+	CnMat Xm = cnVec(2, X);
+	CnMat Tm = cnVec(2, true_state);
+
+	FLT error = cnDistance(&Xm, &Tm);
+	ASSERT_GT(.17, error);
+}
+
+static void process_noise2(void *user, FLT dt, const struct CnMat *x, struct CnMat *Q_out) { cn_set_zero(Q_out); }
+
+static bool toy_measurement_model2(void *user, const struct CnMat *Z, const struct CnMat *x_t, struct CnMat *y,
+								   struct CnMat *H_k) {
+	if (y) {
+		y->data[0] = Z->data[0] - -(x_t->data[0] * x_t->data[0]);
+	}
+	if (H_k) {
+		cnMatrixSet(H_k, 0, 0, -2 * x_t->data[0]);
+	}
+
+	return true;
+}
+
+void run_standard_experiment2(LinmathPoint2d X_out, FLT *P, const term_criteria_t *termCriteria, int time_steps,
+							  struct cnkalman_update_extended_stats_t *stats) {
+	FLT true_state[] = {1};
+	FLT X[2] = {0.1};
+
+	cnkalman_state_t state = {};
+	cnkalman_state_init(&state, 1, transition, process_noise2, 0, X);
+	//state.log_level = 1001;
+	cn_set_diag_val(&state.P, 1);
+
+	CN_CREATE_STACK_MAT(Z, 1, 1);
+	FLT Rv = .1;
+	Z.data[0] = -true_state[0] * true_state[0];
+
+	cnkalman_meas_model_t measModel = {
+		.ks = {&state},
+		.ks_cnt = 1,
+		.Hfn = toy_measurement_model2,
+		.term_criteria = *termCriteria,
+	};
+
+	CN_CREATE_STACK_MAT(Rm, 1, 1);
+	Rm.data[0] = Rv;
+
+	for (int i = 0; i < time_steps; i++) {
+		FLT error = cnkalman_meas_model_predict_update_stats(1, &measModel, 0, &Z, &Rm, stats);
+		printf("%3d: %7.6f %7.6f\n", i, X[0], error);
+	}
+
+	memcpy(P, state.P.data, sizeof(FLT) * 1);
+	memcpy(X_out, X, sizeof(FLT) * 1);
+	cnkalman_state_free(&state);
+}
+
+TEST(Kalman, EKFTest2) {
+	FLT expected_X = 1.5142857142857145;
+	FLT expected_P = 0.71428571428571419;
+
+	FLT X;
+	FLT P;
+	term_criteria_t termCriteria = {};
+
+	run_standard_experiment2(&X, &P, &termCriteria, 1, 0);
+
+	EXPECT_NEAR(X, expected_X, 1e-5);
+	EXPECT_NEAR(P, expected_P, 1e-5);
+}
+
+TEST(Kalman, IEKFTest2) {
+	FLT expected_X = 0.977300604309293;
+	FLT expected_P = 0.025506798237172168;
+
+	FLT X;
+	FLT P;
+	term_criteria_t termCriteria = {.max_iterations = 5};
+	struct cnkalman_update_extended_stats_t stats = {};
+	run_standard_experiment2(&X, &P, &termCriteria, 1, &stats);
+
+	EXPECT_NEAR(X, expected_X, 1e-5);
+	EXPECT_NEAR(P, expected_P, 1e-5);
+	EXPECT_NEAR(stats.orignorm, 4.9005, 1e-5);
+}
diff --git a/libs/cnkalman/tests/kalman_toy.py b/libs/cnkalman/tests/kalman_toy.py
new file mode 100644
index 00000000..b2abf0de
--- /dev/null
+++ b/libs/cnkalman/tests/kalman_toy.py
@@ -0,0 +1,94 @@
+import math
+import unittest
+
+import numpy as np
+
+from cnkalman import filter
+
+class StandardModel(filter.Model):
+    def __init__(self):
+        super().__init__("standard_model", 2)
+
+    def predict(self, dt, x0):
+        return x0
+
+    def predict_jac(self, dt, x0):
+        return np.eye(2)
+
+    def process_noise(self, dt, x):
+        return np.eye(2) * .1
+
+def measurement_model(sensor_pos, state):
+    return math.atan2(state[1] - sensor_pos[1], state[0] - sensor_pos[0])
+
+class StandardMeasModel(filter.MeasurementModel):
+    def __init__(self, mdl, sensors):
+        self.sensors = sensors
+        super().__init__(mdl, 1)
+        self.meas_mdl.meas_jacobian_mode = filter.jacobian_mode.debug
+
+    def predict_measurement(self, x):
+        return np.array([measurement_model(sensor, x) for sensor in self.sensors])
+
+    def predict_measurement_jac(self, x):
+        H = np.zeros((len(self.sensors), 2))
+        for idx, sensor in enumerate(self.sensors):
+            dx = x[0] - sensor[0]
+            dy = x[1] - sensor[1]
+            scale = 1. / ((dx*dx) + (dy*dy)) if dx != 0 or dy != 0 else 0
+            H[idx, 0] = scale * -dy
+            H[idx, 1] = scale * dx
+        return H
+#  2 x  1: +5.0000000e-01, +1.0000000e-01,
+#
+# -3.8461538e-01, +1.9230769e+00,
+# -9.9009901e-02, -9.9009901e-01,
+def run_standard_experiment(termCriteria, time_steps):
+    true_state = [1.5, 1.5]
+    X = [.5, .1]
+    sensors = np.array([
+        [0, 0],
+        [1.5, 0]
+    ])
+
+    mdl = StandardModel()
+    mdl.kalman_state.X = np.array(X)
+    mdl.kalman_state.P = np.eye(2) * .1
+
+    Rv = math.pi * math.pi * 1e-5
+    R = np.array([Rv, Rv])
+
+    Z = np.array([measurement_model(sensor, true_state) for sensor in sensors])
+
+    meas_model = StandardMeasModel(mdl, sensors)
+    meas_model.meas_mdl.term_criteria = termCriteria
+
+    for i in range(time_steps):
+        meas_model.update(1., Z, R)
+        print(f"{np.array(mdl.kalman_state.X)}")
+
+    assert meas_model.jacobian_debug_misses() < .1
+    return mdl.kalman_state.P, mdl.kalman_state.X
+
+class KalmanTestCase(unittest.TestCase):
+    def test_EKF(self):
+        expected_X = [4.4049048331227372, 1.1884307714294169]
+        expected_P = np.array([0.0014050624776344668, 0.00019267084936229752, 0.0001926708493623336, 4.751355804986091e-05]).reshape(2,2)
+        termCriteria = filter.term_criteria_t()
+        P, X = run_standard_experiment(termCriteria, 1)
+        np.testing.assert_allclose(X, expected_X, rtol=1e-5, atol=1e-5)
+        np.testing.assert_allclose(P, expected_P, rtol=1e-5, atol=1e-5)
+
+    def test_IEKF(self):
+        termCriteria = filter.term_criteria_t(max_iterations=10)
+        P, X = run_standard_experiment(termCriteria, 1)
+
+        error = np.linalg.norm(X - np.array([1.5,1.5]))
+        assert .01 > error
+
+def main():
+    termCriteria = filter.term_criteria_t()
+    print(run_standard_experiment(termCriteria, 1))
+
+if __name__ == '__main__':
+    main()
diff --git a/libs/cnkalman/tests/models/BearingAccel.gen.h b/libs/cnkalman/tests/models/BearingAccel.gen.h
new file mode 100644
index 00000000..48bdfd88
--- /dev/null
+++ b/libs/cnkalman/tests/models/BearingAccel.gen.h
@@ -0,0 +1,141 @@
+/// NOTE: This is a generated file; do not edit.
+#pragma once
+#include <cnkalman/generated_header.h>
+
+// clang-format off
+static inline void BearingAccelModel_predict(BearingAccelModel* out, const FLT t, const BearingAccelModel* model) {
+	const FLT x0 = 1.0/2.0 * (t * t);
+	out->Position.Pos[0]=(t * (*model).Velocity.Pos[0]) + (x0 * (*model).Accel[0]) + (*model).Position.Pos[0];
+	out->Position.Pos[1]=(t * (*model).Velocity.Pos[1]) + (x0 * (*model).Accel[1]) + (*model).Position.Pos[1];
+	out->Position.Theta=(*model).Position.Theta + (t * (*model).Velocity.Theta);
+	out->Velocity.Pos[0]=(*model).Velocity.Pos[0] + (t * (*model).Accel[0]);
+	out->Velocity.Pos[1]=(*model).Velocity.Pos[1] + (t * (*model).Accel[1]);
+	out->Velocity.Theta=(*model).Velocity.Theta;
+	out->Accel[0]=(*model).Accel[0];
+	out->Accel[1]=(*model).Accel[1];
+}
+
+static inline void BearingAccelModel_predict_jac_t(CnMat* Hx, const FLT t, const BearingAccelModel* model) {
+	cnSetZero(Hx);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[0])/sizeof(FLT), 0, (t * (*model).Accel[0]) + (*model).Velocity.Pos[0]);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[1])/sizeof(FLT), 0, (t * (*model).Accel[1]) + (*model).Velocity.Pos[1]);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), 0, (*model).Velocity.Theta);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Velocity.Pos[0])/sizeof(FLT), 0, (*model).Accel[0]);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Velocity.Pos[1])/sizeof(FLT), 0, (*model).Accel[1]);
+}
+
+static inline void BearingAccelModel_predict_jac_model(CnMat* Hx, const FLT t, const BearingAccelModel* model) {
+	const FLT x0 = 1.0/2.0 * (t * t);
+	cnSetZero(Hx);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[0])/sizeof(FLT), offsetof(BearingAccelModel, Accel[0])/sizeof(FLT), x0);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[0])/sizeof(FLT), offsetof(BearingAccelModel, Position.Pos[0])/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[0])/sizeof(FLT), offsetof(BearingAccelModel, Velocity.Pos[0])/sizeof(FLT), t);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[1])/sizeof(FLT), offsetof(BearingAccelModel, Accel[1])/sizeof(FLT), x0);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[1])/sizeof(FLT), offsetof(BearingAccelModel, Position.Pos[1])/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Pos[1])/sizeof(FLT), offsetof(BearingAccelModel, Velocity.Pos[1])/sizeof(FLT), t);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), offsetof(BearingAccelModel, Velocity.Theta)/sizeof(FLT), t);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Velocity.Pos[0])/sizeof(FLT), offsetof(BearingAccelModel, Accel[0])/sizeof(FLT), t);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Velocity.Pos[0])/sizeof(FLT), offsetof(BearingAccelModel, Velocity.Pos[0])/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Velocity.Pos[1])/sizeof(FLT), offsetof(BearingAccelModel, Accel[1])/sizeof(FLT), t);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Velocity.Pos[1])/sizeof(FLT), offsetof(BearingAccelModel, Velocity.Pos[1])/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Velocity.Theta)/sizeof(FLT), offsetof(BearingAccelModel, Velocity.Theta)/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Accel[0])/sizeof(FLT), offsetof(BearingAccelModel, Accel[0])/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, offsetof(BearingAccelModel, Accel[1])/sizeof(FLT), offsetof(BearingAccelModel, Accel[1])/sizeof(FLT), 1);
+}
+
+static inline void BearingAccelModel_imu_predict(CnMat* out, const BearingAccelModel* model) {
+	const FLT x0 = cos((*model).Position.Theta);
+	const FLT x1 = sin((*model).Position.Theta);
+	cnMatrixOptionalSet(out, 0, 0, (*model).Velocity.Theta);
+	cnMatrixOptionalSet(out, 1, 0, x0 * (*model).Accel[0]);
+	cnMatrixOptionalSet(out, 2, 0, x1 * (*model).Accel[1]);
+	cnMatrixOptionalSet(out, 3, 0, x0);
+	cnMatrixOptionalSet(out, 4, 0, x1);
+}
+
+static inline void BearingAccelModel_imu_predict_jac_model(CnMat* Hx, const BearingAccelModel* model) {
+	const FLT x0 = cos((*model).Position.Theta);
+	const FLT x1 = sin((*model).Position.Theta);
+	cnSetZero(Hx);
+	cnMatrixOptionalSet(Hx, 0, offsetof(BearingAccelModel, Velocity.Theta)/sizeof(FLT), 1);
+	cnMatrixOptionalSet(Hx, 1, offsetof(BearingAccelModel, Accel[0])/sizeof(FLT), x0);
+	cnMatrixOptionalSet(Hx, 1, offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), -1 * x1 * (*model).Accel[0]);
+	cnMatrixOptionalSet(Hx, 2, offsetof(BearingAccelModel, Accel[1])/sizeof(FLT), x1);
+	cnMatrixOptionalSet(Hx, 2, offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), x0 * (*model).Accel[1]);
+	cnMatrixOptionalSet(Hx, 3, offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), -1 * x1);
+	cnMatrixOptionalSet(Hx, 4, offsetof(BearingAccelModel, Position.Theta)/sizeof(FLT), x0);
+}
+
+
+    static inline void BearingAccelModel_imu_predict_jac_model_with_hx(CnMat* Hx, CnMat* hx, const BearingAccelModel* model) {
+        if(hx != 0) { 
+            BearingAccelModel_imu_predict(hx, model);
+        }
+        if(Hx != 0) { 
+            BearingAccelModel_imu_predict_jac_model(Hx, model);
+        }
+    }
+static inline FLT BearingAccelModel_tdoa_predict(const BearingAccelLandmark* A, const BearingAccelLandmark* B, const BearingAccelModel* model) {
+	const FLT x0 = -1 * (*model).Position.Pos[1];
+	const FLT x1 = -1 * (*model).Position.Pos[0];
+	return (3.33564095198152 * sqrt(1e-05 + (((*A).Position[0] + x1) * ((*A).Position[0] + x1)) + (((*A).Position[1] + x0) * ((*A).Position[1] + x0)))) + (-3.33564095198152 * sqrt(1e-05 + (((*B).Position[0] + x1) * ((*B).Position[0] + x1)) + (((*B).Position[1] + x0) * ((*B).Position[1] + x0))));
+}
+
+static inline void BearingAccelModel_tdoa_predict_jac_A(CnMat* Hx, const BearingAccelLandmark* A, const BearingAccelLandmark* B, const BearingAccelModel* model) {
+	const FLT x0 = (*A).Position[0] + (-1 * (*model).Position.Pos[0]);
+	const FLT x1 = (*A).Position[1] + (-1 * (*model).Position.Pos[1]);
+	const FLT x2 = 3.33564095198152 * (1. / sqrt(1e-05 + (x0 * x0) + (x1 * x1)));
+	cnMatrixOptionalSet(Hx, 0, offsetof(BearingAccelLandmark, Position[0])/sizeof(FLT), x0 * x2);
+	cnMatrixOptionalSet(Hx, 0, offsetof(BearingAccelLandmark, Position[1])/sizeof(FLT), x2 * x1);
+}
+
+
+    static inline void BearingAccelModel_tdoa_predict_jac_A_with_hx(CnMat* Hx, CnMat* hx, const BearingAccelLandmark* A, const BearingAccelLandmark* B, const BearingAccelModel* model) {
+        if(hx != 0) { 
+            hx->data[0] = BearingAccelModel_tdoa_predict(A, B, model);
+        }
+        if(Hx != 0) { 
+            BearingAccelModel_tdoa_predict_jac_A(Hx, A, B, model);
+        }
+    }
+static inline void BearingAccelModel_tdoa_predict_jac_B(CnMat* Hx, const BearingAccelLandmark* A, const BearingAccelLandmark* B, const BearingAccelModel* model) {
+	const FLT x0 = (*B).Position[0] + (-1 * (*model).Position.Pos[0]);
+	const FLT x1 = (*B).Position[1] + (-1 * (*model).Position.Pos[1]);
+	const FLT x2 = 3.33564095198152 * (1. / sqrt(1e-05 + (x0 * x0) + (x1 * x1)));
+	cnMatrixOptionalSet(Hx, 0, offsetof(BearingAccelLandmark, Position[0])/sizeof(FLT), -1 * x0 * x2);
+	cnMatrixOptionalSet(Hx, 0, offsetof(BearingAccelLandmark, Position[1])/sizeof(FLT), -1 * x2 * x1);
+}
+
+
+    static inline void BearingAccelModel_tdoa_predict_jac_B_with_hx(CnMat* Hx, CnMat* hx, const BearingAccelLandmark* A, const BearingAccelLandmark* B, const BearingAccelModel* model) {
+        if(hx != 0) { 
+            hx->data[0] = BearingAccelModel_tdoa_predict(A, B, model);
+        }
+        if(Hx != 0) { 
+            BearingAccelModel_tdoa_predict_jac_B(Hx, A, B, model);
+        }
+    }
+static inline void BearingAccelModel_tdoa_predict_jac_model(CnMat* Hx, const BearingAccelLandmark* A, const BearingAccelLandmark* B, const BearingAccelModel* model) {
+	const FLT x0 = -1 * (*model).Position.Pos[0];
+	const FLT x1 = (*A).Position[0] + x0;
+	const FLT x2 = -1 * (*model).Position.Pos[1];
+	const FLT x3 = (*A).Position[1] + x2;
+	const FLT x4 = 3.33564095198152 * (1. / sqrt(1e-05 + (x1 * x1) + (x3 * x3)));
+	const FLT x5 = (*B).Position[0] + x0;
+	const FLT x6 = (*B).Position[1] + x2;
+	const FLT x7 = 3.33564095198152 * (1. / sqrt(1e-05 + (x5 * x5) + (x6 * x6)));
+	cnSetZero(Hx);
+	cnMatrixOptionalSet(Hx, 0, offsetof(BearingAccelModel, Position.Pos[0])/sizeof(FLT), (x5 * x7) + (-1 * x1 * x4));
+	cnMatrixOptionalSet(Hx, 0, offsetof(BearingAccelModel, Position.Pos[1])/sizeof(FLT), (x6 * x7) + (-1 * x4 * x3));
+}
+
+
+    static inline void BearingAccelModel_tdoa_predict_jac_model_with_hx(CnMat* Hx, CnMat* hx, const BearingAccelLandmark* A, const BearingAccelLandmark* B, const BearingAccelModel* model) {
+        if(hx != 0) { 
+            hx->data[0] = BearingAccelModel_tdoa_predict(A, B, model);
+        }
+        if(Hx != 0) { 
+            BearingAccelModel_tdoa_predict_jac_model(Hx, A, B, model);
+        }
+    }
diff --git a/libs/cnkalman/tests/models/BearingAccel.h b/libs/cnkalman/tests/models/BearingAccel.h
new file mode 100644
index 00000000..895e3123
--- /dev/null
+++ b/libs/cnkalman/tests/models/BearingAccel.h
@@ -0,0 +1,157 @@
+#include <cnkalman/model.h>
+#include "cnkalman/ModelPlot.h"
+#include <sstream>
+
+struct BearingAccelLandmark {
+    FLT Position[2];
+};
+
+struct BearingAccelPose {
+    FLT Pos[2];
+    FLT Theta;
+};
+
+struct BearingAccelModel {
+    BearingAccelPose Position;
+    BearingAccelPose Velocity;
+    FLT Accel[2];
+};
+
+#include "BearingAccel.gen.h"
+#ifdef HAS_SCIPLOT
+#include <sciplot/sciplot.hpp>
+#endif
+
+struct BearingAccel : public cnkalman::KalmanModel {
+    struct IMUMeasurement : public cnkalman::KalmanMeasurementModel {
+        IMUMeasurement(KalmanModel *kalmanModel) : KalmanMeasurementModel(
+                kalmanModel, "IMU", 5) {}
+
+        bool predict_measurement(const CnMat &x, CnMat *z, CnMat *h) override {
+            auto* mdl = reinterpret_cast<BearingAccelModel *>(x.data);
+            BearingAccelModel_imu_predict_jac_model_with_hx(h, z, mdl);
+            return h == nullptr || cn_is_finite(h);
+        }
+
+
+        cnmatrix::Matrix default_R() override {
+            auto R = cnmatrix::Matrix(meas_cnt, meas_cnt);
+            const FLT r = 1e-5;
+            cn_set_diag_val(R, r);
+            return R;
+        }
+    };
+	struct LandmarkMeasurement : public cnkalman::KalmanMeasurementModel {
+        BearingAccelLandmark a, b;
+
+		cnmatrix::Matrix default_R() override {
+			auto R = cnmatrix::Matrix(meas_cnt, meas_cnt);
+			const FLT r = 1;
+			cn_set_diag_val(R, r);
+			return R;
+		}
+
+		LandmarkMeasurement(cnkalman::KalmanModel *kalmanModel, const BearingAccelLandmark& a, const BearingAccelLandmark& b)
+		: KalmanMeasurementModel(kalmanModel, "Landmark", 1), a(a), b(b) {
+            //this->meas_mdl.meas_jacobian_mode = cnkalman_jacobian_mode_debug;
+		}
+
+		bool predict_measurement(const CnMat &x_t, CnMat *pz, CnMat *h) override {
+            if(pz) {
+                pz->data[0] = BearingAccelModel_tdoa_predict(&a, &b, (BearingAccelModel *) x_t.data);
+            }
+            if(h) {
+                BearingAccelModel_tdoa_predict_jac_model(h, &a, &b, (BearingAccelModel *) x_t.data);
+            }
+			return h == nullptr || cn_is_finite(h);
+		}
+	};
+	const FLT spacing = 10;
+	std::vector<BearingAccelLandmark> landmarks = {
+            {spacing, -spacing},
+            {-spacing, spacing},
+	        {-spacing, -spacing},
+		{spacing,spacing},
+
+        //{-0,0}
+	};
+
+	static std::string get_name(bool useIMU = true, bool useLandmarks = true) {
+	    if(useIMU && useLandmarks) return "BearingAccel";
+        if(!useIMU) return "BearingAccelNoIMU";
+        return "IMU only";
+	}
+
+	BearingAccel(bool useIMU = true, bool useLandmarks = true) : cnkalman::KalmanModel(get_name(useIMU, useLandmarks), sizeof(BearingAccelModel)/sizeof(FLT)) {
+	    if(useLandmarks) {
+            for (size_t i = 0; i < landmarks.size(); i++) {
+                for (size_t j = i + 1; j < landmarks.size(); j++) {
+                    this->measurementModels.emplace_back(
+                            std::make_unique<LandmarkMeasurement>(this, landmarks[i], landmarks[j]));
+                }
+            }
+        }
+	    if(useIMU) {
+            this->measurementModels.emplace_back(std::make_unique<IMUMeasurement>(this));
+        }
+		reset();
+	}
+	void reset() override {
+		KalmanModel::reset();
+		cn_set_diag_val(&kalman_state.P, .1);
+        cn_set_zero(&kalman_state.state);
+	}
+
+	void process_noise(FLT dt, const struct CnMat &x, struct CnMat &Q_out) override {
+	    BearingAccelModel v = { };
+        v.Velocity.Pos[0] = v.Velocity.Pos[1] = 1e-2;
+        v.Velocity.Theta = 1e-1;
+        v.Accel[0] = v.Accel[1] = .01;
+
+        cn_set_diag(&Q_out, reinterpret_cast<const FLT *>(&v));
+	}
+
+	void predict(FLT dt, const struct CnMat &x0, struct CnMat *x1, CnMat* F) override {
+	    if(F) {
+            BearingAccelModel_predict_jac_model(F, dt, reinterpret_cast<const BearingAccelModel *>(x0.data));
+	    }
+        if(x1) {
+            BearingAccelModel_predict(reinterpret_cast<BearingAccelModel *>(x1->data), dt,
+                                          reinterpret_cast<const BearingAccelModel *>(x0.data));
+	    }
+	}
+
+    void sample_state(FLT dt, const CnMat &x0, CnMat &x1, const struct CnMat *Q) override {
+        BearingAccelModel mdl = *reinterpret_cast<BearingAccelModel *>(x1.data);
+        FLT G = 8. / landmarks.size();
+        mdl.Accel[0] = mdl.Accel[1] = 0;
+        for(auto& l : landmarks) {
+            FLT dx = l.Position[0] - mdl.Position.Pos[0];
+            FLT dy = l.Position[1] - mdl.Position.Pos[1];
+            FLT dist = sqrt(dx*dx + dy*dy);
+            if(dist < 1) dist = 1;
+            mdl.Accel[0] += dx / dist / dist * G;
+            mdl.Accel[1] += dy / dist / dist * G;
+        }
+
+        mdl.Velocity.Theta *= .5;
+
+        auto x1_5 = cnMat(x1.rows, x1.cols, (FLT*)&mdl);
+        KalmanModel::sample_state(dt, x1_5, x1, Q);
+    }
+
+    void draw(cnkalman::ModelPlot &p) override {
+        KalmanModel::draw(p);
+        for(auto& l : landmarks) {
+            std::stringstream ss;
+            static int idx = 1000;
+            p.include_point_in_range(l.Position);
+            ss << "set obj " << idx++ << " ellipse fc rgb \"green\" fs transparent solid 1 center "
+               << l.Position[0] << "," << l.Position[1] << " size " << .1 << "," << .1 << " angle " << 0 << " front\n";
+#ifdef HAS_SCIPLOT
+            p.map.gnuplot(ss.str());
+#endif
+        }
+    }
+
+};
diff --git a/libs/cnkalman/tests/models/BearingAccel.py b/libs/cnkalman/tests/models/BearingAccel.py
new file mode 100644
index 00000000..82540b6c
--- /dev/null
+++ b/libs/cnkalman/tests/models/BearingAccel.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+
+from symengine import atan2, asin, cos, sin, tan, sqrt
+import cnkalman.codegen as cg
+from filterpy.kalman import ExtendedKalmanFilter as EKF
+import numpy as np
+
+@dataclass
+class BearingAccelPose:
+    Pos: np.array = (0., 0.)
+    Theta: float = 0
+@dataclass
+class BearingAccelModel:
+    Position: BearingAccelPose
+    Velocity: BearingAccelPose
+    Accel: np.array = (0., 0.)
+
+@dataclass
+class BearingAccelLandmark:
+    Position: np.array = (0., 0.)
+
+@cg.generate_code()
+def BearingAccelModel_predict(t: float, model: BearingAccelModel):
+    pos = model.Position.Pos
+    vpos = model.Velocity.Pos
+    acc = model.Accel
+    new_pos = (
+        pos[0] + vpos[0] * t + acc[0] * t * t / 2,
+        pos[1] + vpos[1] * t + acc[1] * t * t / 2,
+    )
+    new_theta = model.Position.Theta + model.Velocity.Theta * t
+
+    new_vpos = (
+        vpos[0] + acc[0] * t,
+        vpos[1] + acc[1] * t,
+    )
+    return BearingAccelModel(
+        Position=BearingAccelPose(new_pos, new_theta),
+        Velocity=BearingAccelPose(new_vpos, model.Velocity.Theta),
+        Accel=model.Accel
+    )
+
+
+@cg.generate_code()
+def BearingAccelModel_imu_predict(model: BearingAccelModel):
+    return [
+        model.Velocity.Theta,
+        model.Accel[0] * cos(model.Position.Theta),
+        model.Accel[1] * sin(model.Position.Theta),
+        cos(model.Position.Theta),
+        sin(model.Position.Theta),
+    ]
+
+def BearingAccelModel_toa_predict(A : BearingAccelLandmark, model: BearingAccelModel):
+    dx = A.Position[0] - model.Position.Pos[0]
+    dy = A.Position[1] - model.Position.Pos[1]
+    speed_of_light = 299792458
+    return sqrt(dx*dx + dy*dy + 1e-5) / speed_of_light * 1e9
+
+@cg.generate_code()
+def BearingAccelModel_tdoa_predict(A : BearingAccelLandmark, B : BearingAccelLandmark, model: BearingAccelModel):
+    return BearingAccelModel_toa_predict(A, model) - BearingAccelModel_toa_predict(B, model)
diff --git a/libs/cnkalman/tests/models/BearingsOnlyTracking.gen.h b/libs/cnkalman/tests/models/BearingsOnlyTracking.gen.h
new file mode 100644
index 00000000..14d6cf77
--- /dev/null
+++ b/libs/cnkalman/tests/models/BearingsOnlyTracking.gen.h
@@ -0,0 +1,55 @@
+/// NOTE: This is a generated file; do not edit.
+#pragma once
+#include <cnkalman/generated_header.h>
+
+// clang-format off
+static inline FLT bearings_meas_function(const FLT* state, const FLT* landmark) {
+	const FLT state0 = state[0];
+	const FLT state1 = state[1];
+	const FLT landmark0 = landmark[0];
+	const FLT landmark1 = landmark[1];
+	return atan2(state1 + (-1 * landmark1), state0 + (-1 * landmark0));
+}
+
+static inline void bearings_meas_function_jac_state(CnMat* Hx, const FLT* state, const FLT* landmark) {
+	const FLT state0 = state[0];
+	const FLT state1 = state[1];
+	const FLT landmark0 = landmark[0];
+	const FLT landmark1 = landmark[1];
+	const FLT x0 = state1 + (-1 * landmark1);
+	const FLT x1 = state0 + (-1 * landmark0);
+	const FLT x2 = 1. / ((x1 * x1) + (x0 * x0));
+	cnMatrixOptionalSet(Hx, 0, 0, -1 * x0 * x2);
+	cnMatrixOptionalSet(Hx, 0, 1, x2 * x1);
+}
+
+
+    static inline void bearings_meas_function_jac_state_with_hx(CnMat* Hx, CnMat* hx, const FLT* state, const FLT* landmark) {
+        if(hx != 0) { 
+            hx->data[0] = bearings_meas_function(state, landmark);
+        }
+        if(Hx != 0) { 
+            bearings_meas_function_jac_state(Hx, state, landmark);
+        }
+    }
+static inline void bearings_meas_function_jac_landmark(CnMat* Hx, const FLT* state, const FLT* landmark) {
+	const FLT state0 = state[0];
+	const FLT state1 = state[1];
+	const FLT landmark0 = landmark[0];
+	const FLT landmark1 = landmark[1];
+	const FLT x0 = state1 + (-1 * landmark1);
+	const FLT x1 = state0 + (-1 * landmark0);
+	const FLT x2 = 1. / ((x1 * x1) + (x0 * x0));
+	cnMatrixOptionalSet(Hx, 0, 0, x0 * x2);
+	cnMatrixOptionalSet(Hx, 0, 1, -1 * x2 * x1);
+}
+
+
+    static inline void bearings_meas_function_jac_landmark_with_hx(CnMat* Hx, CnMat* hx, const FLT* state, const FLT* landmark) {
+        if(hx != 0) { 
+            hx->data[0] = bearings_meas_function(state, landmark);
+        }
+        if(Hx != 0) { 
+            bearings_meas_function_jac_landmark(Hx, state, landmark);
+        }
+    }
diff --git a/libs/cnkalman/tests/models/BearingsOnlyTracking.h b/libs/cnkalman/tests/models/BearingsOnlyTracking.h
new file mode 100644
index 00000000..5a0e84e6
--- /dev/null
+++ b/libs/cnkalman/tests/models/BearingsOnlyTracking.h
@@ -0,0 +1,77 @@
+#include <cnkalman/model.h>
+#include "BearingsOnlyTracking.gen.h"
+#include <cnkalman/ModelPlot.h>
+#ifdef HAS_SCIPLOT
+#include <sciplot/sciplot.hpp>
+#endif
+
+struct BearingsOnlyTracking : public cnkalman::KalmanModel {
+    struct LandmarkMeasurement : public cnkalman::KalmanMeasurementModel {
+        FLT px, py;
+
+#ifdef HAS_SCIPLOT
+        void draw(cnkalman::ModelPlot& p) override {
+            std::stringstream ss;
+            static int idx = 1000;
+            p.include_point_in_range(px, py);
+            ss << "set obj " << idx++ << " ellipse fc rgb \"green\" fs transparent solid 1 center "
+               << px << "," << py << " size " << .05 << "," << .05 << " angle " << 0 << " front\n";
+            p.map.gnuplot(ss.str());
+        }
+#endif
+        virtual cnmatrix::Matrix default_R() {
+            auto R = cnmatrix::Matrix(meas_cnt, meas_cnt);
+            const FLT r = M_PI * M_PI * 1e-5;
+            cn_set_diag_val(R, r);
+            return R;
+        }
+
+        LandmarkMeasurement(cnkalman::KalmanModel *kalmanModel, double x, double y)
+                : KalmanMeasurementModel(kalmanModel, "Landmark", 1), px(x), py(y) {
+
+        }
+
+        static inline FLT wrapAngle( FLT angle )
+        {
+            while (angle > M_PI)
+                angle -= 2.0 * M_PI;
+            while (angle < -M_PI)
+                angle += 2.0 * M_PI;
+            return angle;
+        }
+
+        bool residual(const CnMat &Z, const CnMat &x, CnMat *y, CnMat *h) override {
+            auto rtn = KalmanMeasurementModel::residual(Z, x, y, h);
+            y->data[0] = wrapAngle(y->data[0]);
+            return rtn;
+        }
+
+        bool predict_measurement(const CnMat &x_t, CnMat *pz, CnMat *h) override {
+            FLT landmark[] = { px, py };
+            bearings_meas_function_jac_state_with_hx(h, pz, x_t.data, landmark);
+
+            return h == nullptr || cn_is_finite(h);
+        }
+    };
+
+    BearingsOnlyTracking() : cnkalman::KalmanModel("BearingsOnlyTracking", 2) {
+        this->measurementModels.emplace_back(std::make_unique<LandmarkMeasurement>(this, 0, 0));
+        this->measurementModels.emplace_back(std::make_unique<LandmarkMeasurement>(this, 1.5, 0));
+        reset();
+    }
+    void reset() override {
+        KalmanModel::reset();
+        state[0] = 1.5; state[1] = 1.5;
+        cn_set_diag_val(&kalman_state.P, .1);
+    }
+
+    void process_noise(FLT dt, const struct CnMat &x, struct CnMat &Q_out) override {
+        cn_set_diag_val(&Q_out, 0.1);
+    }
+
+    void predict(FLT dt, const struct CnMat &x0, struct CnMat *x1, CnMat* F) override {
+        if(F) cn_set_diag_val(F, 1);
+        if(x1) cnCopy(&x0, x1, 0);
+    }
+
+};
diff --git a/libs/cnkalman/tests/models/BearingsOnlyTracking.py b/libs/cnkalman/tests/models/BearingsOnlyTracking.py
new file mode 100644
index 00000000..ddd0cf01
--- /dev/null
+++ b/libs/cnkalman/tests/models/BearingsOnlyTracking.py
@@ -0,0 +1,31 @@
+from symengine import atan2, asin, cos, sin, tan, sqrt
+import cnkalman.codegen as cg
+from filterpy.kalman import ExtendedKalmanFilter as EKF
+import numpy as np
+
+@cg.generate_code(state = 2, landmark = 2)
+def bearings_meas_function(state, landmark):
+    x, y = state
+    px, py = landmark
+    return atan2(y - py, x - px)
+
+class BearingsOnlyTracking(EKF):
+    def __init__(self):
+        EKF.__init__(self, 2, 1)
+        self.Q = np.eye(2) * .1
+        self.F = np.eye(2)
+        self.landmarks = [[0, 0], [1.5, 0]]
+
+    def update(self, z, R, idx):
+        def residual(a, b):
+            """ compute residual (a-b) between measurements containing
+            [range, bearing]. Bearing is normalized to [-pi, pi)"""
+            y = a - b
+            y[0] = y[0] % (2 * np.pi)    # force in range [0, 2 pi)
+            if y[0] > np.pi:             # move to [-pi, pi)
+                y[0] -= 2 * np.pi
+            return y
+
+        EKF.update(self, z,
+                   HJacobian=lambda x: bearings_meas_function.jacobians.state(x.reshape(-1), self.landmarks[idx]), residual=residual,
+                   Hx=lambda x: np.array(bearings_meas_function(x.reshape(-1), self.landmarks[idx]), dtype=np.float64).reshape(-1, 1), R=R)
diff --git a/libs/cnkalman/tests/models/BikeLandmarks.gen.h b/libs/cnkalman/tests/models/BikeLandmarks.gen.h
new file mode 100644
index 00000000..c88378e7
--- /dev/null
+++ b/libs/cnkalman/tests/models/BikeLandmarks.gen.h
@@ -0,0 +1,186 @@
+/// NOTE: This is a generated file; do not edit.
+#pragma once
+#include <cnkalman/generated_header.h>
+
+// clang-format off
+static inline void predict_function(CnMat* out, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+	const FLT state0 = state[0];
+	const FLT state1 = state[1];
+	const FLT state2 = state[2];
+	const FLT u0 = u[0];
+	const FLT u1 = u[1];
+	const FLT x0 = tan(u1);
+	const FLT x1 = state2 + (u0 * x0 * dt * (1. / wheelbase));
+	const FLT x2 = (1. / x0) * wheelbase;
+	cnMatrixOptionalSet(out, 0, 0, (-1 * x2 * sin(state2)) + state0 + (x2 * sin(x1)));
+	cnMatrixOptionalSet(out, 1, 0, (x2 * cos(state2)) + state1 + (-1 * x2 * cos(x1)));
+	cnMatrixOptionalSet(out, 2, 0, x1);
+}
+
+static inline void predict_function_jac_dt(CnMat* Hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+	const FLT state2 = state[2];
+	const FLT u0 = u[0];
+	const FLT u1 = u[1];
+	const FLT x0 = u0 * tan(u1) * (1. / wheelbase);
+	const FLT x1 = state2 + (x0 * dt);
+	cnMatrixOptionalSet(Hx, 0, 0, u0 * cos(x1));
+	cnMatrixOptionalSet(Hx, 1, 0, u0 * sin(x1));
+	cnMatrixOptionalSet(Hx, 2, 0, x0);
+}
+
+
+    static inline void predict_function_jac_dt_with_hx(CnMat* Hx, CnMat* hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+        if(hx != 0) { 
+            predict_function(hx, dt, wheelbase, state, u);
+        }
+        if(Hx != 0) { 
+            predict_function_jac_dt(Hx, dt, wheelbase, state, u);
+        }
+    }
+static inline void predict_function_jac_wheelbase(CnMat* Hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+	const FLT state2 = state[2];
+	const FLT u0 = u[0];
+	const FLT u1 = u[1];
+	const FLT x0 = tan(u1);
+	const FLT x1 = 1. / x0;
+	const FLT x2 = u0 * dt;
+	const FLT x3 = x2 * (1. / wheelbase);
+	const FLT x4 = state2 + (x0 * x3);
+	const FLT x5 = cos(x4);
+	const FLT x6 = sin(x4);
+	cnMatrixOptionalSet(Hx, 0, 0, (x1 * x6) + (-1 * x1 * sin(state2)) + (-1 * x3 * x5));
+	cnMatrixOptionalSet(Hx, 1, 0, (-1 * x1 * x5) + (x1 * cos(state2)) + (-1 * x3 * x6));
+	cnMatrixOptionalSet(Hx, 2, 0, -1 * x0 * x2 * (1. / (wheelbase * wheelbase)));
+}
+
+
+    static inline void predict_function_jac_wheelbase_with_hx(CnMat* Hx, CnMat* hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+        if(hx != 0) { 
+            predict_function(hx, dt, wheelbase, state, u);
+        }
+        if(Hx != 0) { 
+            predict_function_jac_wheelbase(Hx, dt, wheelbase, state, u);
+        }
+    }
+static inline void predict_function_jac_state(CnMat* Hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+	const FLT state2 = state[2];
+	const FLT u0 = u[0];
+	const FLT u1 = u[1];
+	const FLT x0 = tan(u1);
+	const FLT x1 = (1. / x0) * wheelbase;
+	const FLT x2 = state2 + (u0 * x0 * dt * (1. / wheelbase));
+	cnSetZero(Hx);
+	cnMatrixOptionalSet(Hx, 0, 0, 1);
+	cnMatrixOptionalSet(Hx, 0, 2, (x1 * cos(x2)) + (-1 * x1 * cos(state2)));
+	cnMatrixOptionalSet(Hx, 1, 1, 1);
+	cnMatrixOptionalSet(Hx, 1, 2, (x1 * sin(x2)) + (-1 * x1 * sin(state2)));
+	cnMatrixOptionalSet(Hx, 2, 2, 1);
+}
+
+
+    static inline void predict_function_jac_state_with_hx(CnMat* Hx, CnMat* hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+        if(hx != 0) { 
+            predict_function(hx, dt, wheelbase, state, u);
+        }
+        if(Hx != 0) { 
+            predict_function_jac_state(Hx, dt, wheelbase, state, u);
+        }
+    }
+static inline void predict_function_jac_u(CnMat* Hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+	const FLT state2 = state[2];
+	const FLT u0 = u[0];
+	const FLT u1 = u[1];
+	const FLT x0 = tan(u1);
+	const FLT x1 = dt * (1. / wheelbase);
+	const FLT x2 = x0 * x1;
+	const FLT x3 = state2 + (u0 * x2);
+	const FLT x4 = cos(x3);
+	const FLT x5 = x4 * dt;
+	const FLT x6 = x0 * x0;
+	const FLT x7 = 1 + x6;
+	const FLT x8 = (1. / x6) * x7 * wheelbase;
+	const FLT x9 = u0 * x7;
+	const FLT x10 = (1. / x0) * x9;
+	const FLT x11 = sin(x3);
+	const FLT x12 = dt * x11;
+	cnMatrixOptionalSet(Hx, 0, 0, x5);
+	cnMatrixOptionalSet(Hx, 0, 1, (-1 * x8 * x11) + (x8 * sin(state2)) + (x5 * x10));
+	cnMatrixOptionalSet(Hx, 1, 0, x12);
+	cnMatrixOptionalSet(Hx, 1, 1, (x4 * x8) + (-1 * x8 * cos(state2)) + (x12 * x10));
+	cnMatrixOptionalSet(Hx, 2, 0, x2);
+	cnMatrixOptionalSet(Hx, 2, 1, x1 * x9);
+}
+
+
+    static inline void predict_function_jac_u_with_hx(CnMat* Hx, CnMat* hx, const FLT dt, const FLT wheelbase, const FLT* state, const FLT* u) {
+        if(hx != 0) { 
+            predict_function(hx, dt, wheelbase, state, u);
+        }
+        if(Hx != 0) { 
+            predict_function_jac_u(Hx, dt, wheelbase, state, u);
+        }
+    }
+static inline void meas_function(CnMat* out, const FLT* state, const FLT* landmark) {
+	const FLT state0 = state[0];
+	const FLT state1 = state[1];
+	const FLT state2 = state[2];
+	const FLT landmark0 = landmark[0];
+	const FLT landmark1 = landmark[1];
+	const FLT x0 = landmark1 + (-1 * state1);
+	const FLT x1 = landmark0 + (-1 * state0);
+	cnMatrixOptionalSet(out, 0, 0, sqrt((x1 * x1) + (x0 * x0)));
+	cnMatrixOptionalSet(out, 1, 0, atan2(x0, x1) + (-1 * state2));
+}
+
+static inline void meas_function_jac_state(CnMat* Hx, const FLT* state, const FLT* landmark) {
+	const FLT state0 = state[0];
+	const FLT state1 = state[1];
+	const FLT landmark0 = landmark[0];
+	const FLT landmark1 = landmark[1];
+	const FLT x0 = landmark1 + (-1 * state1);
+	const FLT x1 = landmark0 + (-1 * state0);
+	const FLT x2 = (x1 * x1) + (x0 * x0);
+	const FLT x3 = 1. / sqrt(x2);
+	const FLT x4 = 1. / x2;
+	cnMatrixOptionalSet(Hx, 0, 0, -1 * x1 * x3);
+	cnMatrixOptionalSet(Hx, 0, 1, -1 * x0 * x3);
+	cnMatrixOptionalSet(Hx, 0, 2, 0);
+	cnMatrixOptionalSet(Hx, 1, 0, x0 * x4);
+	cnMatrixOptionalSet(Hx, 1, 1, -1 * x1 * x4);
+	cnMatrixOptionalSet(Hx, 1, 2, -1);
+}
+
+
+    static inline void meas_function_jac_state_with_hx(CnMat* Hx, CnMat* hx, const FLT* state, const FLT* landmark) {
+        if(hx != 0) { 
+            meas_function(hx, state, landmark);
+        }
+        if(Hx != 0) { 
+            meas_function_jac_state(Hx, state, landmark);
+        }
+    }
+static inline void meas_function_jac_landmark(CnMat* Hx, const FLT* state, const FLT* landmark) {
+	const FLT state0 = state[0];
+	const FLT state1 = state[1];
+	const FLT landmark0 = landmark[0];
+	const FLT landmark1 = landmark[1];
+	const FLT x0 = landmark1 + (-1 * state1);
+	const FLT x1 = landmark0 + (-1 * state0);
+	const FLT x2 = (x1 * x1) + (x0 * x0);
+	const FLT x3 = 1. / sqrt(x2);
+	const FLT x4 = 1. / x2;
+	cnMatrixOptionalSet(Hx, 0, 0, x1 * x3);
+	cnMatrixOptionalSet(Hx, 0, 1, x0 * x3);
+	cnMatrixOptionalSet(Hx, 1, 0, -1 * x0 * x4);
+	cnMatrixOptionalSet(Hx, 1, 1, x1 * x4);
+}
+
+
+    static inline void meas_function_jac_landmark_with_hx(CnMat* Hx, CnMat* hx, const FLT* state, const FLT* landmark) {
+        if(hx != 0) { 
+            meas_function(hx, state, landmark);
+        }
+        if(Hx != 0) { 
+            meas_function_jac_landmark(Hx, state, landmark);
+        }
+    }
diff --git a/libs/cnkalman/tests/models/BikeLandmarks.h b/libs/cnkalman/tests/models/BikeLandmarks.h
new file mode 100644
index 00000000..7c1e7870
--- /dev/null
+++ b/libs/cnkalman/tests/models/BikeLandmarks.h
@@ -0,0 +1,80 @@
+#include <cnkalman/model.h>
+#include "BikeLandmarks.gen.h"
+
+struct LandmarkMeasurement : public cnkalman::KalmanMeasurementModel {
+    FLT px, py;
+
+    virtual cnmatrix::Matrix default_R() {
+        auto R = cnmatrix::Matrix(2, 2);
+        cnMatrixSet(R, 0, 0, .3*.3);
+        cnMatrixSet(R, 1, 1, .1*.1);
+        return R;
+    }
+
+    LandmarkMeasurement(cnkalman::KalmanModel *kalmanModel, double x, double y)
+            : KalmanMeasurementModel(kalmanModel, "Landmark", 2), px(x), py(y) {
+
+    }
+
+    static inline FLT wrapAngle( FLT angle )
+    {
+        while (angle > M_PI)
+            angle -= 2.0 * M_PI;
+        while (angle < -M_PI)
+            angle += 2.0 * M_PI;
+        return angle;
+    }
+
+    bool residual(const CnMat &Z, const CnMat &x, CnMat *y, CnMat *h) override {
+        auto rtn = KalmanMeasurementModel::residual(Z, x, y, h);
+        y->data[1] = wrapAngle(y->data[1]);
+        return rtn;
+    }
+
+    bool predict_measurement(const CnMat &x_t, CnMat *pz, CnMat *h) override {
+        FLT landmark[] = { px, py };
+        meas_function_jac_state_with_hx(h, pz, x_t.data, landmark);
+
+        return h == nullptr || cn_is_finite(h);
+    }
+};
+
+struct BikeLandmarks : public cnkalman::KalmanModel {
+    FLT wheelbase = 0.5;
+    FLT v = 1.1, alpha = .01;
+    FLT v_std = .1, alpha_std = .015;
+    BikeLandmarks() : cnkalman::KalmanModel("BikeLandmarks", 3) {
+        this->measurementModels.emplace_back(std::make_unique<LandmarkMeasurement>(this, 5, 10));
+        this->measurementModels.emplace_back(std::make_unique<LandmarkMeasurement>(this, 10, 5));
+        this->measurementModels.emplace_back(std::make_unique<LandmarkMeasurement>(this, 15, 15));
+
+        state[0] = 2; state[1] = 6; state[2] = .3;
+        cn_set_diag_val(&kalman_state.P, .1);
+    }
+
+    void process_noise(FLT dt, const struct CnMat &x, struct CnMat &Q_out) override {
+        FLT U[2] = { v, alpha};
+        if(fabs(U[1]) < 1e-7) {
+            U[1] = 1e-7;
+        }
+        CN_CREATE_STACK_MAT(V, 3, 2);
+        predict_function_jac_u(&V, dt, wheelbase, state, U);
+
+        FLT _M[] = {
+                v_std*v_std, 0,
+                0, alpha_std*alpha_std
+        };
+        auto M = cnMat(2, 2, _M);
+        cn_ABAt_add(&Q_out, &V, &M, 0);
+    }
+
+    void predict(FLT dt, const struct CnMat &x0, struct CnMat *x1, CnMat* F) override {
+        FLT U[2] = { v, alpha};
+        if(fabs(U[1]) < 1e-7) {
+            U[1] = 1e-7;
+        }
+
+        predict_function_jac_state_with_hx(F, x1, dt, wheelbase, x0.data, U);
+    }
+
+};
diff --git a/libs/cnkalman/tests/models/BikeLandmarks.py b/libs/cnkalman/tests/models/BikeLandmarks.py
new file mode 100644
index 00000000..d93e8fd9
--- /dev/null
+++ b/libs/cnkalman/tests/models/BikeLandmarks.py
@@ -0,0 +1,24 @@
+from symengine import atan2, asin, cos, sin, tan, sqrt
+import cnkalman.codegen as cg
+
+@cg.generate_code(state = 3, u = 2)
+def predict_function(dt, wheelbase, state, u):
+    x, y, theta = state
+    v, alpha = u
+    d = v * dt
+    R = wheelbase/tan(alpha)
+    beta = d / wheelbase * tan(alpha)
+
+    return [x + -R * sin(theta) + R * sin(theta + beta),
+            y + R * cos(theta) - R * cos(theta + beta),
+            theta + beta]
+
+@cg.generate_code(state = 3, landmark = 2)
+def meas_function(state, landmark):
+    x, y, theta = state
+    px, py = landmark
+
+    hyp = (px-x)**2 + (py-y)**2
+    dist = sqrt(hyp)
+
+    return [dist, atan2(py - y, px - x) - theta]
diff --git a/libs/cnkalman/tests/models/EggLandscape.gen.h b/libs/cnkalman/tests/models/EggLandscape.gen.h
new file mode 100644
index 00000000..9dc5b827
--- /dev/null
+++ b/libs/cnkalman/tests/models/EggLandscape.gen.h
@@ -0,0 +1,39 @@
+/// NOTE: This is a generated file; do not edit.
+#pragma once
+#include <cnkalman/generated_header.h>
+
+// clang-format off
+static inline void predict_meas(CnMat* out, const FLT* x) {
+	const FLT x0 = x[0];
+	const FLT x1 = x[1];
+	cnMatrixOptionalSet(out, 0, 0, sin((5 * (x1 * x1)) + (5 * (x0 * x0))));
+	cnMatrixOptionalSet(out, 1, 0, sin(101 * x0) * sin(101 * x1));
+	cnMatrixOptionalSet(out, 2, 0, cos(11 * x1) * sin(11 * x0));
+}
+
+static inline void predict_meas_jac_x(CnMat* Hx, const FLT* x) {
+	const FLT x0 = x[0];
+	const FLT x1 = x[1];
+	const FLT x2 = 10 * cos((5 * (x1 * x1)) + (5 * (x0 * x0)));
+	const FLT x3 = 101 * x1;
+	const FLT x4 = 101 * x0;
+	const FLT x5 = 11 * x1;
+	const FLT x6 = 11 * x0;
+	cnSetZero(Hx);
+	cnMatrixOptionalSet(Hx, 0, 0, x0 * x2);
+	cnMatrixOptionalSet(Hx, 0, 1, x2 * x1);
+	cnMatrixOptionalSet(Hx, 1, 0, 101 * sin(x3) * cos(x4));
+	cnMatrixOptionalSet(Hx, 1, 1, 101 * sin(x4) * cos(x3));
+	cnMatrixOptionalSet(Hx, 2, 0, 11 * cos(x6) * cos(x5));
+	cnMatrixOptionalSet(Hx, 2, 1, -11 * sin(x6) * sin(x5));
+}
+
+
+    static inline void predict_meas_jac_x_with_hx(CnMat* Hx, CnMat* hx, const FLT* x) {
+        if(hx != 0) { 
+            predict_meas(hx, x);
+        }
+        if(Hx != 0) { 
+            predict_meas_jac_x(Hx, x);
+        }
+    }
diff --git a/libs/cnkalman/tests/models/EggLandscape.h b/libs/cnkalman/tests/models/EggLandscape.h
new file mode 100644
index 00000000..306d5189
--- /dev/null
+++ b/libs/cnkalman/tests/models/EggLandscape.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <cnkalman/model.h>
+
+struct EggLandscapeMeas : public cnkalman::KalmanMeasurementModel {
+    EggLandscapeMeas(cnkalman::KalmanModel *kalmanModel, const std::string &name)
+            : KalmanMeasurementModel(kalmanModel, name, 3) {
+
+    }
+
+    cnmatrix::Matrix default_R() override{
+        auto rtn = cnmatrix::Matrix(meas_cnt, meas_cnt);
+        cn_set_diag_val(rtn, 5e-7);
+        return rtn;
+    }
+
+    FLT coeff_dist = 5;
+    FLT coeff_a = 101, coeff_b = 11;
+    bool predict_measurement(const CnMat &x_t, CnMat *pz, CnMat *h) override {
+        FLT x = x_t.data[0]; FLT y = x_t.data[1];
+        if(pz) {
+            pz->data[0] = sin((x * x + y * y) * coeff_dist);
+            //pz->data[0] = (x * x + y * y) * coeff_dist - 1;
+            pz->data[1] = sin(x * coeff_a) * sin(y * coeff_a);
+            pz->data[2] = sin(x * coeff_b) * cos(y * coeff_b);
+        }
+        if(h) {
+            FLT c = cos(x * x + y * y);
+            FLT jac[] = {
+                    2 * x * coeff_dist * c, 2 * y * coeff_dist * c, 0, 0,
+                    //2 * x * coeff_dist, 2 * y * coeff_dist, 0, 0,
+                    coeff_a * cos(coeff_a * x) * sin(y * coeff_a), coeff_a * cos(coeff_a * y) * sin(x * coeff_a), 0, 0,
+                    coeff_b * cos(coeff_b * x) * cos(y * coeff_b), -coeff_b * sin(coeff_b * y) * sin(x * coeff_b), 0, 0,
+            };
+            cn_copy_data_in(h, 1, jac);
+        }
+
+        return true;
+    }
+};
+struct EggLandscapeProblem : public cnkalman::KalmanLinearPredictionModel {
+    FLT _F[16] = {
+            1, 0, 1, 0,
+            0, 1, 0, 1,
+            0, 0, 1, 0,
+            0, 0, 0, 1,
+    };
+    CnMat Fm = cnMat(4, 4, _F);
+    virtual const CnMat& F() const { return Fm; }
+
+    EggLandscapeProblem() : cnkalman::KalmanLinearPredictionModel("EggLandscape", 4) {
+        state_cnt = 4;
+        measurementModels.emplace_back(std::make_unique<EggLandscapeMeas>(this, "meas"));
+        cnkalman_state_reset(&kalman_state);
+
+        state[0] = .1; state[1] = .1;
+    }
+
+    void process_noise(FLT dt, const CnMat &x, CnMat &Q_out) override {
+        cn_set_zero(&Q_out);
+        cnMatrixSet(&Q_out, 2, 2, .0001);
+        cnMatrixSet(&Q_out, 3, 3, .0001);
+    }
+
+    void sample_state(FLT dt, const CnMat &x0, CnMat &x1, const struct CnMat* iQ) override {
+        KalmanModel::sample_state(dt, x0, x1, iQ);
+
+        FLT wall = .4;
+        if(x1.data[0] > +wall && x1.data[2] > 0) x1.data[2] *= .9;
+        if(x1.data[0] < -wall && x1.data[2] < 0) x1.data[2] *= .9;
+        if(x1.data[1] > +wall && x1.data[3] > 0) x1.data[3] *= .9;
+        if(x1.data[1] < -wall && x1.data[3] < 0) x1.data[3] *= .9;
+    }
+};
\ No newline at end of file
diff --git a/libs/cnkalman/tests/models/EggLandscape.py b/libs/cnkalman/tests/models/EggLandscape.py
new file mode 100644
index 00000000..68c7a536
--- /dev/null
+++ b/libs/cnkalman/tests/models/EggLandscape.py
@@ -0,0 +1,41 @@
+from sympy import cos, sin, tan, atan2
+
+import numpy as np
+import sympy
+from filterpy.kalman import ExtendedKalmanFilter as EKF
+from numpy import array, sqrt
+from sympy import symbols, Matrix
+import cnkalman.codegen as cg
+
+@cg.generate_code(x=4)
+def predict_meas(x):
+    var_a = 101
+    var_b = 11
+    coeff_dist = 5
+
+    return [
+            sin((x[0] * x[0] + x[1] * x[1]) * coeff_dist),
+            sin(x[0] * var_a) * sin(x[1] * var_a),
+            sin(x[0] * var_b) * cos(x[1] * var_b)
+        ]
+
+class EggLandscape(EKF):
+    def __init__(self):
+        EKF.__init__(self, 4, 3)
+        self.Q = np.array([
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, .0001, 0],
+            [0, 0, 0, 0.0001],
+        ])
+        self.F = np.array([
+            [1, 0, 1, 0],
+            [0, 1, 0, 1],
+            [0, 0, 1, 0],
+            [0, 0, 0, 1],
+            ])
+
+    def update(self, z, R):
+        EKF.update(self, z,
+                   HJacobian=lambda x: predict_meas.jacobians.x(x.reshape(-1)),
+                   Hx=lambda x: np.array(predict_meas(x.reshape(-1)), dtype=np.float64).reshape(-1, 1), R=R)
\ No newline at end of file
diff --git a/libs/cnkalman/tests/models/LinearPoseVel.h b/libs/cnkalman/tests/models/LinearPoseVel.h
new file mode 100644
index 00000000..fe34bf84
--- /dev/null
+++ b/libs/cnkalman/tests/models/LinearPoseVel.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <cnkalman/model.h>
+
+struct LinearPoseVel : public cnkalman::KalmanLinearPredictionModel {
+    FLT _F[16] = {
+            1, 0, 1, 0,
+            0, 1, 0, 1,
+            0, 0, 1, 0,
+            0, 0, 0, 1,
+    };
+    CnMat Fm = cnMat(4, 4, _F);
+    virtual const CnMat& F() const { return Fm; }
+
+    LinearPoseVel() : cnkalman::KalmanLinearPredictionModel("LinearToy", 4){
+        FLT _H[12] = {
+                -1, 0, 0, 0,
+                0, 1, 0, 0,
+                -.5, .5, 0, 0
+        };
+        CnMat H = cnMat(3, 4, _H);
+
+        measurementModels.emplace_back(std::make_unique<cnkalman::KalmanLinearMeasurementModel>(this, "meas", H));
+        cnkalman_state_reset(&kalman_state);
+    }
+
+    void process_noise(FLT dt, const CnMat &x, CnMat &Q_out) override {
+        cn_set_zero(&Q_out);
+        cnMatrixSet(&Q_out, 2, 2, .01);
+        cnMatrixSet(&Q_out, 3, 3, .01);
+    }
+
+    void sample_state(FLT dt, const CnMat &x0, CnMat &x1, const struct CnMat* iQ) override {
+        KalmanModel::sample_state(dt, x0, x1, iQ);
+        FLT wall = 10;
+        if(x1.data[0] > +wall && x1.data[2] > 0) x1.data[2] *= .9;
+        if(x1.data[0] < -wall && x1.data[2] < 0) x1.data[2] *= .9;
+        if(x1.data[1] > +wall && x1.data[3] > 0) x1.data[3] *= .9;
+        if(x1.data[1] < -wall && x1.data[3] < 0) x1.data[3] *= .9;
+    }
+};
\ No newline at end of file
diff --git a/libs/cnkalman/tests/models/RobotEKF.py b/libs/cnkalman/tests/models/RobotEKF.py
new file mode 100644
index 00000000..a7698dd4
--- /dev/null
+++ b/libs/cnkalman/tests/models/RobotEKF.py
@@ -0,0 +1,100 @@
+from math import cos, sin, tan, atan2
+
+import numpy as np
+import sympy
+from filterpy.kalman import ExtendedKalmanFilter as EKF
+from numpy import array, sqrt
+from sympy import symbols, Matrix
+from . import BikeLandmarks
+
+class RobotEKF(EKF):
+    def __init__(self, dt, wheelbase, u, std_vel, std_steer):
+        EKF.__init__(self, 3, 2, 2)
+        self.u = u
+        self.dt = dt
+        self.wheelbase = wheelbase
+        self.std_vel = std_vel
+        self.std_steer = std_steer
+        self.landmarks = array([[5, 10], [10, 5], [15, 15]])
+
+        a, x, y, v, w, theta, time = symbols(
+            'a, x, y, v, w, theta, t')
+        d = v*time
+        beta = (d/w)*sympy.tan(a)
+        r = w/sympy.tan(a)
+
+        self.fxu = Matrix(
+            [[x-r*sympy.sin(theta)+r*sympy.sin(theta+beta)],
+             [y+r*sympy.cos(theta)-r*sympy.cos(theta+beta)],
+             [theta+beta]])
+
+        self.F_j = self.fxu.jacobian(Matrix([x, y, theta]))
+        self.V_j = self.fxu.jacobian(Matrix([v, a]))
+
+        # save dictionary and it's variables for later use
+        self.subs = {x: 0, y: 0, v:0, a:0,
+                     time:dt, w:wheelbase, theta:0}
+        self.x_x, self.x_y, = x, y
+        self.v, self.a, self.theta = v, a, theta
+
+    def predict(self, u=None):
+        if u is None:
+            u = self.u
+        self.x = self.move(self.x, u, self.dt)
+
+        self.subs[self.theta] = self.x[2, 0]
+        self.subs[self.v] = u[0]
+        self.subs[self.a] = u[1]
+
+        F = array(self.F_j.evalf(subs=self.subs)).astype(float)
+        V = array(self.V_j.evalf(subs=self.subs)).astype(float)
+
+        # covariance of motion noise in control space
+        M = array([[self.std_vel**2, 0],
+                   [0, self.std_steer**2]])
+
+        self.P = F @ self.P @ F.T + V @ M @ V.T
+
+    def move(self, x, u, dt):
+        hdg = x[2, 0]
+        vel = u[0]
+        steering_angle = u[1]
+        dist = vel * dt
+
+        if abs(steering_angle) > 0.001: # is robot turning?
+            beta = (dist / self.wheelbase) * tan(steering_angle)
+            r = self.wheelbase / tan(steering_angle) # radius
+
+            dx = np.array([[-r*sin(hdg) + r*sin(hdg + beta)],
+                           [r*cos(hdg) - r*cos(hdg + beta)],
+                           [beta]])
+        else: # moving in straight line
+            dx = np.array([[dist*cos(hdg)],
+                           [dist*sin(hdg)],
+                           [0]])
+        return x + dx
+
+    def predict_meas(self, x, idx):
+        landmark_pos = self.landmarks[idx]
+        return BikeLandmarks.meas_function(x.reshape(-1), landmark_pos)
+
+    def update(self, z, R, landmark_idx):
+        def residual(a, b):
+            """ compute residual (a-b) between measurements containing
+            [range, bearing]. Bearing is normalized to [-pi, pi)"""
+            y = a - b
+            y[1] = y[1] % (2 * np.pi)    # force in range [0, 2 pi)
+            if y[1] > np.pi:             # move to [-pi, pi)
+                y[1] -= 2 * np.pi
+            return y
+
+        def Hx(x, landmark_pos):
+            return BikeLandmarks.meas_function(x.reshape(-1), landmark_pos).reshape(-1,1)
+
+        def H_of(x, landmark_pos):
+            return BikeLandmarks.meas_function.jacobians.state(x.reshape(-1), landmark_pos)
+        landmark = self.landmarks[landmark_idx]
+
+        EKF.update(self, z, HJacobian=H_of, Hx=Hx, R=R,
+               residual=residual,
+               args=(landmark), hx_args=(landmark))
\ No newline at end of file
diff --git a/libs/cnkalman/tests/multi_model_test.cc b/libs/cnkalman/tests/multi_model_test.cc
new file mode 100644
index 00000000..a16dd4a0
--- /dev/null
+++ b/libs/cnkalman/tests/multi_model_test.cc
@@ -0,0 +1,135 @@
+#include <cnkalman/kalman.h>
+#include <cnmatrix/cn_matrix.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <gtest/gtest.h>
+#include "test_utils.h"
+
+void F(FLT dt, const struct cnkalman_state_s *k, const struct CnMat *x0, struct CnMat *x1, struct CnMat *f_out) {
+    if(x1) cn_matrix_copy(x1, x0);
+    if(f_out) cn_set_diag_val(f_out, 1);
+}
+
+void Hfn(void *user, const struct CnMat *x_t, struct CnMat *hx, struct CnMat *Hx) {
+    const CnMat* H = static_cast<const CnMat *>(user);
+
+    if(Hx) cnCopy(H, Hx, 0);
+    if(hx) cnGEMM(H, x_t, 1, 0, 0, hx, (enum cnGEMMFlags)0);
+}
+bool Hx(void *user, const struct CnMat *Z, const struct CnMat *x_t, struct CnMat *y, struct CnMat *H_k) {
+    CN_CREATE_STACK_VEC(Hx, Z->rows);
+    Hfn(user, x_t, &Hx, H_k);
+    cn_elementwise_subtract(y, Z, &Hx);
+    return true;
+}
+
+void random_cov(CnMat* dst, FLT sig) {
+    CN_CREATE_STACK_VEC(D, dst->rows);
+    cnRand(&D, 0, sig);
+
+    CN_CREATE_STACK_MAT(S, dst->rows, dst->rows);
+    cnRand(&S, 0, sig);
+    cnGEMM(&S, &S, 1, 0, 1, dst, CN_GEMM_FLAG_B_T);
+
+    for(int i = 0;i < dst->rows;i++) cnMatrixSet(dst, i, i, cnMatrixGet(dst, i, i) + fabs(cn_as_vector(&D)[i]));
+
+}
+
+cnkalman_state_t combine_states(cnkalman_state_t** ks, int ks_cnt) {
+    cnkalman_state_t rtn = { 0 };
+    int state_cnt = 0, error_state_cnt = 0;
+    for(int i = 0;i < ks_cnt;i++) {
+        cnkalman_state_t* k = ks[i];
+        state_cnt += k->state_cnt;
+        error_state_cnt += k->state_cnt;
+    }
+
+    cnkalman_state_init(&rtn, state_cnt, ks[0]->Transition_fn, 0, 0, 0);
+    for(int i = 0, state_idx = 0, error_state_idx = 0;i < ks_cnt;i++) {
+        cnkalman_state_t* k = ks[i];
+        CnMat xView = cnMatView(k->state_cnt, 1, &rtn.state, state_idx, 0);
+        CnMat pView = cnMatView(k->error_state_size, k->error_state_size, &rtn.P, state_idx, state_idx);
+        cnCopy(&ks[i]->P, &pView, 0);
+        cnCopy(&ks[i]->state, &xView, 0);
+        state_idx += k->state_cnt;
+        error_state_idx += k->state_cnt;
+    }
+
+    return rtn;
+}
+
+
+TEST(Kalman, MultiTest) {
+    CN_CREATE_STACK_MAT(H, 20, 10);
+    cnRand(&H, 1, .1);
+    //cn_set_diag_val(&H, 11);
+
+    cnkalman_state_t m2 = { 0 }, m3 = { 0 };
+    //cnkalman_state_init(&m1, 10, F, 0, 0, 0);
+    cnkalman_state_init(&m2, 5, F, 0, 0, 0);
+    cnkalman_state_init(&m3, 5, F, 0, 0, 0);
+
+    //random_cov(&m1.P, 100);
+    random_cov(&m2.P, 1);
+    random_cov(&m3.P, 1);
+
+    cnkalman_state_t *mdls[] = {&m2, &m3};
+    cnkalman_state_t m1 = combine_states(mdls, 2);
+
+    CN_CREATE_STACK_VEC(x0, 10);
+    cnRand(&x0, 0, .5);
+
+    CN_CREATE_STACK_VEC(Z, 20);
+    CN_CREATE_STACK_VEC(n, 20);
+    cnRand(&n, 0, 1e-4);
+    Hfn(&H, &x0, &Z, 0);
+    cn_elementwise_add(&Z, &Z, &n);
+
+    CN_CREATE_STACK_VEC(R, 20);
+    cn_set_constant(&R, 1e-3);
+
+    cn_print_mat(&m1.P);
+    cn_print_mat(&m2.P);
+    cn_print_mat(&m3.P);
+
+    cn_print_mat(&R);
+    {
+        cnkalman_meas_model meas = {0};
+        cnkalman_state_t *mdls[] = {&m1};
+        cnkalman_meas_model_multi_init(mdls, 1, "meas", &meas, Hx);
+        meas.term_criteria.max_iterations = 10;
+        cnkalman_meas_model_predict_update(1, &meas, &H, &Z, &R);
+    }
+    {
+        cnkalman_meas_model meas = {0};
+        cnkalman_state_t *mdls[] = {&m2, &m3};
+        cnkalman_meas_model_multi_init(mdls, 2, "meas", &meas, Hx);
+        meas.term_criteria.max_iterations = 10;
+        cnkalman_meas_model_predict_update(1, &meas, &H, &Z, &R);
+    }
+    cn_print_mat(&m1.P);
+    cn_print_mat(&x0);
+
+    CnMat m1_2 = cnMatView(5, 1, &m1.state, 0, 0);
+    CnMat p1_2 = cnMatView(5, 5, &m1.P, 0, 0);
+    CnMat m1_3 = cnMatView(5, 1, &m1.state, 5, 0);
+    CnMat p1_3 = cnMatView(5, 5, &m1.P, 5, 5);
+    cn_print_mat(&m1.state);
+    cn_print_mat(&m2.state);
+    cn_print_mat(&m3.state);
+
+    EXPECT_TRUE(is_mat_near(&m1_2, &m2.state, 1e-7));
+    EXPECT_TRUE(is_mat_near(&m1_3, &m3.state, 1e-7));
+    EXPECT_TRUE(is_mat_near(&p1_2, &m2.P, 1e-7));
+    EXPECT_TRUE(is_mat_near(&p1_3, &m3.P, 1e-7));
+
+    cn_print_mat(&m1.P);
+    cn_print_mat(&m2.P);
+    cn_print_mat(&m3.P);
+
+    cnkalman_state_free(&m1);
+    cnkalman_state_free(&m2);
+    cnkalman_state_free(&m3);
+}
\ No newline at end of file
diff --git a/libs/cnkalman/tests/runbenchmarks.cmake b/libs/cnkalman/tests/runbenchmarks.cmake
new file mode 100644
index 00000000..2ff247d6
--- /dev/null
+++ b/libs/cnkalman/tests/runbenchmarks.cmake
@@ -0,0 +1,14 @@
+macro(EXEC_CHECK CMD)
+    message(" === ${CMD} ${ARGN}")
+    execute_process(COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${cnkalman_root_source_dir} ${CMD} ${ARGN} RESULT_VARIABLE CMD_RESULT)
+    if(CMD_RESULT)
+        message(FATAL_ERROR "Error running ${CMD} ${ARGN}")
+    endif()
+endmacro()
+exec_check(${TEST_KALMAN_MODELS})
+message(${CMAKE_CURRENT_BINARY_DIR}/BikeLandmarks.kf)
+exec_check(python3 ${BENCHMARK_SCRIPT} ${CMAKE_CURRENT_BINARY_DIR}/BikeLandmarks.kf)
+exec_check(python3 ${BENCHMARK_SCRIPT} ${CMAKE_CURRENT_BINARY_DIR}/LinearToy.kf)
+exec_check(python3 ${BENCHMARK_SCRIPT} ${CMAKE_CURRENT_BINARY_DIR}/EggLandscape.kf)
+exec_check(python3 ${BENCHMARK_SCRIPT} ${CMAKE_CURRENT_BINARY_DIR}/BearingsOnlyTracking.kf)
+exec_check(python3 ${BENCHMARK_SCRIPT} ${CMAKE_CURRENT_BINARY_DIR}/BearingsOnlyTracking.small-iteration.kf)
\ No newline at end of file
diff --git a/libs/cnkalman/tests/test_utils.h b/libs/cnkalman/tests/test_utils.h
new file mode 100644
index 00000000..126deb68
--- /dev/null
+++ b/libs/cnkalman/tests/test_utils.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#define ASSERT_DOUBLE_ARRAY_EQ(n, arr1, arr2)                                                                          \
+	for (int i = 0; i < n; i++) {                                                                                      \
+		ASSERT_DOUBLE_EQ(arr1[i], arr2[i]);                                                                            \
+	}
+
+#define EXPECT_ARRAY_NEAR(n, arr1, arr2, abs_error)                                                                          \
+	for (int i = 0; i < n; i++) {                                                                                      \
+		EXPECT_NEAR(arr1[i], arr2[i], abs_error);                                                                            \
+	}
+
+
+static inline bool is_mat_near(const CnMat* x, const CnMat* y, float tol) {
+    if(x->cols != y->cols) return false;
+    if(x->rows != y->rows) return false;
+    for(int i = 0;i < x->rows;i++) {
+        for(int j = 0;j < x->cols;j++) {
+            EXPECT_NEAR(cnMatrixGet(x, i, j), cnMatrixGet(y, i, j), tol);
+            if(fabs(cnMatrixGet(x, i, j) - cnMatrixGet(y, i, j)) > tol) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+#define EXPECT_MAT_NEAR(m1, m2) { \
+                                  ASSERT_EQ(m1.rows == m2.rows);\
+}
diff --git a/libs/cnmatrix/.github/workflows/cmake.yml b/libs/cnmatrix/.github/workflows/cmake.yml
new file mode 100644
index 00000000..3bccd6da
--- /dev/null
+++ b/libs/cnmatrix/.github/workflows/cmake.yml
@@ -0,0 +1,125 @@
+name: Build and Test
+
+on:
+  pull_request:
+  push:
+  release:
+    types:
+      - created
+
+env:
+  BUILD_TYPE: Release
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        use_eigen: [ON]
+        build_type: [Release]
+        os: [ubuntu-18.04, windows-latest]
+        use_single_precision: [ON, OFF]
+        include:
+        - os: ubuntu-18.04
+          build_type: Debug
+          use_eigen: OFF
+          use_single_precision: ON
+        - os: ubuntu-18.04
+          build_type: Debug
+          use_eigen: ON
+          use_single_precision: ON
+        - os: ubuntu-18.04
+          build_type: Debug
+          use_eigen: ON
+          use_single_precision: OFF
+        - os: windows-latest
+          build_type: Debug
+          use_eigen: ON
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+    - if: contains(matrix.os, 'ubuntu')
+      name: Get Dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install p7zip-full build-essential zlib1g-dev liblapacke-dev libopenblas-dev libatlas-base-dev python3 python3-pip apt-transport-https python-virtualenv ninja-build libeigen3-dev ${{ matrix.deps }}
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Setup windows flags
+      if: contains(matrix.os, 'windows') && matrix.build_type == 'Release'
+      id: flags
+      run: echo "::set-output name=SETUP_PY_FLAGS::-G 'Visual Studio 16 2019'"
+    - name: Get mac dependencies
+      if: contains(matrix.os, 'macos')
+      run: |
+        brew install freeglut lapack openblas
+    - name: Create Build Environment
+      # Some projects don't allow in-source building, so create a separate build directory
+      # We'll use this as our working directory for all subsequent commands
+      run: cmake -E make_directory ${{runner.workspace}}/build
+
+    - name: Configure CMake
+      shell: bash
+      working-directory: ${{runner.workspace}}/build
+      run: cmake $GITHUB_WORKSPACE -DUSE_EIGEN=${{matrix.use_eigen}} -DDOWNLOAD_EIGEN=${{matrix.use_eigen}} -DUSE_SINGLE_PRECISION=${{matrix.use_single_precision}} -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DENABLE_TESTS=ON -DCMAKE_INSTALL_PREFIX=install_root ${{matrix.config_flags}}
+
+    - name: Build
+      working-directory: ${{runner.workspace}}/build
+      shell: bash
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build . --config $BUILD_TYPE -v
+    
+    - name: Install
+      working-directory: ${{runner.workspace}}/build
+      shell: bash
+      # Execute the build.  You can specify a specific target with "--target <NAME>"
+      run: cmake --build . --config $BUILD_TYPE --target install
+
+    - name: Set bundle name
+      id: bundle
+      run: echo "BUNDLE_FILE_NAME=cnmatrix-$(git describe --tags)-${{ matrix.os }}.zip" >> $GITHUB_OUTPUT
+
+    - name: Bundle
+      if: matrix.build_type == 'Release'
+      working-directory: ${{runner.workspace}}/cnmatrix
+      run: 7z a ${{runner.workspace}}/build/${{ steps.bundle.outputs.BUNDLE_FILE_NAME }} ${{runner.workspace}}/build/install_root/*
+
+    - uses: actions/upload-artifact@v4
+      name: Upload
+      if: matrix.build_type == 'Release'
+      with:
+        name: cnmatrix-${{ matrix.os }}-${{ matrix.build_type }}
+        path: | 
+          ${{runner.workspace}}/build/${{steps.bundle.outputs.BUNDLE_FILE_NAME}}
+        # TODO: sort out single precision on/off collision
+        overwrite: true
+
+    - name: Get release
+      id: get_release
+      if: github.event_name == 'release'
+      uses: bruceadams/get-release@v1.2.2
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Upload Release Asset
+      id: upload-release-asset
+      continue-on-error: true
+      if: github.event_name == 'release'
+      uses: actions/upload-release-asset@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      with:
+        upload_url: ${{ steps.get_release.outputs.upload_url }}
+        asset_path: ${{runner.workspace}}/build/${{steps.bundle.outputs.BUNDLE_FILE_NAME}}
+        asset_name: ${{steps.bundle.outputs.BUNDLE_FILE_NAME}}
+        asset_content_type: application/zip
+
+    - name: Test
+      working-directory: ${{runner.workspace}}/build
+      shell: bash
+      # Execute tests defined by the CMake configuration.
+      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
+      run: ctest -C $BUILD_TYPE --output-on-failure -j30
diff --git a/libs/cnmatrix/.gitignore b/libs/cnmatrix/.gitignore
new file mode 100644
index 00000000..5eef8403
--- /dev/null
+++ b/libs/cnmatrix/.gitignore
@@ -0,0 +1,41 @@
+# Linux
+.idea
+*.o
+*.so
+lib
+cmake-build-*
+build*
+
+# Temp files
+*~
+*/\#*\#
+\#*\#
+
+
+# Windows specific
+*.dll
+*.exe
+obj/
+bin/
+x64/
+packages
+.vs/
+Debug/
+Release/
+*.VC.db
+*.user
+*.ipch
+*.tlog
+*.d
+build
+
+.options
+.zip
+.avi
+
+*/__pycache__
+*.pyc
+
+venv
+.eggs/
+dist/
diff --git a/libs/cnmatrix/Android.mk b/libs/cnmatrix/Android.mk
new file mode 100644
index 00000000..ba2941b8
--- /dev/null
+++ b/libs/cnmatrix/Android.mk
@@ -0,0 +1,33 @@
+LOCAL_PATH := $(call my-dir)
+
+ifneq ($(TARGET_SURVIVE_MATH_BACKEND),)
+    SURVIVE_MATH_BACKEND := $(TARGET_SURVIVE_MATH_BACKEND)
+else
+    SURVIVE_MATH_BACKEND := eigen
+endif
+
+include $(CLEAR_VARS)
+LOCAL_MODULE                := libcnmatrix
+LOCAL_SRC_FILES             := src/cn_matrix.c
+LOCAL_MODULE_CLASS          := STATIC_LIBRARIES
+LOCAL_C_INCLUDES            := $(LOCAL_PATH)/include
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include
+LOCAL_PROPRIETARY_MODULE    := true
+
+LOCAL_CFLAGS := \
+    -Wno-error=unused-const-variable \
+    -Wno-error=unused-function \
+    -Wno-error=unused-parameter
+
+ifeq ($(SURVIVE_MATH_BACKEND),eigen)
+    LOCAL_SRC_FILES        += src/eigen/core.cpp src/eigen/gemm.cpp src/eigen/svd.cpp
+    LOCAL_CFLAGS           += -DUSE_EIGEN
+    LOCAL_HEADER_LIBRARIES += libeigen
+else ifeq ($(SURVIVE_MATH_BACKEND),blas)
+    # This needs a blas library to link, which is not easily available
+    LOCAL_SRC_FILES        += src/cn_matrix.blas.c
+else
+    $(error Unsupported cnmatrix backend: $(SURVIVE_MATH_BACKEND))
+endif
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/libs/cnmatrix/CMakeLists.txt b/libs/cnmatrix/CMakeLists.txt
new file mode 100644
index 00000000..add87717
--- /dev/null
+++ b/libs/cnmatrix/CMakeLists.txt
@@ -0,0 +1,91 @@
+cmake_minimum_required(VERSION 3.12)
+
+project(cnmatrix LANGUAGES C CXX)
+include(CTest)
+
+option(ENABLE_TESTS "Enable tests" OFF)
+if (${ENABLE_TESTS})
+    enable_testing()
+endif()
+  
+include(CheckIncludeFile)
+include (CheckSymbolExists)
+
+
+macro(CNMATRIX_OPTION ARG DESC)
+  option(${ARG} ${DESC} ${ARGN})
+  SET(${ARG} ${ARGN})
+  message("-- CNMatrix option: ${ARG}: ${ARGN}")
+endmacro()
+
+CNMATRIX_OPTION(DOWNLOAD_EIGEN "Download eigen if it isn't installed on the system" ON)
+CNMATRIX_OPTION(USE_EIGEN "Use eigen for math operations" ${DOWNLOAD_EIGEN})
+CNMATRIX_OPTION(USE_COLUMN_MAJOR_MATRICES "Use column major matrices for math operations" OFF)
+CNMATRIX_OPTION(USE_SINGLE_PRECISION "Use floats instead of doubles" OFF)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+find_package (Eigen3 3.3 QUIET)
+if(NOT EIGEN3_FOUND)
+  if(DOWNLOAD_EIGEN)
+    find_program(GIT git)
+    if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/libeigen-src/Eigen")
+      execute_process(COMMAND ${GIT} clone -b 3.4 https://gitlab.com/libeigen/eigen.git "${CMAKE_CURRENT_BINARY_DIR}/libeigen-src")
+    endif()
+    set(EIGEN3_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/libeigen-src/")
+    set(USE_EIGEN ON)
+  else()
+    message(WARNING "Could not find eigen; falling back to other backend")
+    set(USE_EIGEN OFF)
+  endif()
+else()
+  MESSAGE("Found system eigen")
+endif()
+
+IF(USE_EIGEN)
+    add_compile_definitions(USE_EIGEN)
+
+    if(USE_COLUMN_MAJOR_MATRICES)
+        add_compile_definitions(CN_MATRIX_IS_COL_MAJOR)
+    endif()
+ENDIF()
+
+find_library(CBLAS_LIB NAMES cblas)
+set(USE_OPENBLAS_DEFAULT OFF)
+if(NOT CBLAS_LIB)
+  if(NOT USE_EIGEN)
+    set(USE_OPENBLAS_DEFAULT ON)
+  endif()
+endif()
+
+CNMATRIX_OPTION(USE_OPENBLAS "Use OpenBLAS" ${USE_OPENBLAS_DEFAULT})
+
+install(DIRECTORY include/cnmatrix DESTINATION include)
+include(GNUInstallDirs)
+configure_file(cnmatrix.pc.in cnmatrix.pc @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cnmatrix.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+IF(UNIX)
+  set(SHARED_FLAGS "-fPIC -Wall -Wno-unused-variable -Wno-switch -Wno-parentheses -Wno-missing-braces -Werror=return-type -fvisibility=hidden -Werror=vla -fno-math-errno -Werror=missing-field-initializers -Werror=pointer-arith")
+
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SHARED_FLAGS} -std=gnu99 -Werror=incompatible-pointer-types -Werror=implicit-function-declaration ")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SHARED_FLAGS} -std=c++14")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic")
+
+  if(ENABLE_WARNINGS_AS_ERRORS)
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
+  endif()
+
+ELSEIF(WIN32)
+  add_compile_options(/wd4244 /wd4996 /wd4018 /wd4101 /wd4477 /wd4068 /wd4217)
+  set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+  set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+endif()
+
+
+add_subdirectory(src)
+
+if(ENABLE_TESTS)
+  add_subdirectory(tests)
+endif()
+
diff --git a/libs/cnmatrix/LICENSE b/libs/cnmatrix/LICENSE
new file mode 100644
index 00000000..2dd359be
--- /dev/null
+++ b/libs/cnmatrix/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 cntools
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/libs/cnmatrix/README.md b/libs/cnmatrix/README.md
new file mode 100644
index 00000000..60fc89c7
--- /dev/null
+++ b/libs/cnmatrix/README.md
@@ -0,0 +1,9 @@
+# CNMatrix [![.github/workflows/cmake.yml](https://github.com/cntools/cnmatrix/actions/workflows/cmake.yml/badge.svg)](https://github.com/cntools/cnmatrix/actions/workflows/cmake.yml)
+
+This library provides a consistent C interface to a few matrix backends.
+
+The interface itself is a little more sane than raw lapack / blas calls, and is meant to be reasonably performant for medium to large matrices. It should also be cross platform and work reasonably well on embedded low latency systems; as it consistently tries to avoid heap allocations.
+
+As a caveat though; this library makes sense for C code bases, for C++ codebases it likely makes more sense to just use eigen directly.
+
+
diff --git a/libs/cnmatrix/cmake/FindEigen3.cmake b/libs/cnmatrix/cmake/FindEigen3.cmake
new file mode 100644
index 00000000..ded9e35f
--- /dev/null
+++ b/libs/cnmatrix/cmake/FindEigen3.cmake
@@ -0,0 +1,108 @@
+# - Try to find Eigen3 lib
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(Eigen3 3.1.2)
+# to require version 3.1.2 or newer of Eigen3.
+#
+# Once done this will define
+#
+#  EIGEN3_FOUND - system has eigen lib with correct version
+#  EIGEN3_INCLUDE_DIR - the eigen include directory
+#  EIGEN3_VERSION - eigen version
+#
+# and the following imported target:
+#
+#  Eigen3::Eigen - The header-only Eigen library
+#
+# This module reads hints about search locations from 
+# the following environment variables:
+#
+# EIGEN3_ROOT
+# EIGEN3_ROOT_DIR
+
+# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
+# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
+# Copyright (c) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
+# SPDX-License-Identifier: BSD-2-Clause
+
+if(NOT Eigen3_FIND_VERSION)
+  if(NOT Eigen3_FIND_VERSION_MAJOR)
+    set(Eigen3_FIND_VERSION_MAJOR 2)
+  endif()
+  if(NOT Eigen3_FIND_VERSION_MINOR)
+    set(Eigen3_FIND_VERSION_MINOR 91)
+  endif()
+  if(NOT Eigen3_FIND_VERSION_PATCH)
+    set(Eigen3_FIND_VERSION_PATCH 0)
+  endif()
+
+  set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
+endif()
+
+macro(_eigen3_check_version)
+  file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
+
+  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
+  set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")
+
+  set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
+  if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK FALSE)
+  else()
+    set(EIGEN3_VERSION_OK TRUE)
+  endif()
+
+  if(NOT EIGEN3_VERSION_OK)
+
+    message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
+                   "but at least version ${Eigen3_FIND_VERSION} is required")
+  endif()
+endmacro()
+
+if (EIGEN3_INCLUDE_DIR)
+
+  # in cache already
+  _eigen3_check_version()
+  set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
+  set(Eigen3_FOUND ${EIGEN3_VERSION_OK})
+
+else ()
+  
+  # search first if an Eigen3Config.cmake is available in the system,
+  # if successful this would set EIGEN3_INCLUDE_DIR and the rest of
+  # the script will work as usual
+  find_package(Eigen3 ${Eigen3_FIND_VERSION} NO_MODULE )
+
+  if(NOT EIGEN3_INCLUDE_DIR)
+    find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
+        HINTS
+        ENV EIGEN3_ROOT 
+        ENV EIGEN3_ROOT_DIR
+        PATHS
+        ${CMAKE_INSTALL_PREFIX}/include
+        ${KDE4_INCLUDE_DIR}
+        PATH_SUFFIXES eigen3 eigen
+      )
+  endif()
+  
+  if(EIGEN3_INCLUDE_DIR)
+    _eigen3_check_version()
+  endif()
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
+
+  mark_as_advanced(EIGEN3_INCLUDE_DIR)
+
+endif()
+
+if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen)
+  add_library(Eigen3::Eigen INTERFACE IMPORTED)
+  set_target_properties(Eigen3::Eigen PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}")
+endif()
diff --git a/libs/cnmatrix/cnmatrix.pc.in b/libs/cnmatrix/cnmatrix.pc.in
new file mode 100644
index 00000000..7dcc0387
--- /dev/null
+++ b/libs/cnmatrix/cnmatrix.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/include/cnmatrix
+
+Name: @CMAKE_PROJECT_NAME@
+Description: Cnmatrix
+Version: 0
+Libs: -L${libdir} -lcnmatrix @EXTRA_LIBS@
+Cflags: -I${includedir} -I${includedir}/redist
diff --git a/libs/cnmatrix/include/cnmatrix/cn_flt.h b/libs/cnmatrix/include/cnmatrix/cn_flt.h
new file mode 100644
index 00000000..6586be9d
--- /dev/null
+++ b/libs/cnmatrix/include/cnmatrix/cn_flt.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#ifdef CN_USE_FLOAT
+
+#define FLT float
+#define FLT_SQRT sqrtf
+#define FLT_TAN tanf
+#define FLT_SIN sinf
+#define FLT_COS cosf
+#define FLT_ACOS acosf
+#define FLT_ASIN asinf
+#define FLT_ATAN2 atan2f
+#define FLT_FABS__ fabsf
+#define FLT_STRTO strtof
+
+#define CN_FLT CN_32F
+#define FLT_POW powf
+
+#else
+
+#define CN_USE_DOUBLE 1
+#define FLT double
+#define FLT_SQRT sqrt
+#define FLT_TAN tan
+#define FLT_SIN sin
+#define FLT_COS cos
+#define FLT_ACOS acos
+#define FLT_ASIN asin
+#define FLT_ATAN2 atan2
+#define FLT_FABS__ fabs
+#define FLT_STRTO strtod
+
+#endif
diff --git a/libs/cnmatrix/include/cnmatrix/cn_matrix.blas.h b/libs/cnmatrix/include/cnmatrix/cn_matrix.blas.h
new file mode 100644
index 00000000..3515a769
--- /dev/null
+++ b/libs/cnmatrix/include/cnmatrix/cn_matrix.blas.h
@@ -0,0 +1,64 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "assert.h"
+#include "stdint.h"
+#include "stdlib.h"
+
+#define CN_Error(code, msg) assert(0 && msg); // cv::error( code, msg, CN_Func, __FILE__, __LINE__ )
+
+#define CN_32FC1 CN_32F
+#define CN_64FC1 CN_64F
+
+#define CN_MAGIC_MASK 0xFFFF0000
+#define CN_MAT_MAGIC_VAL 0x42420000
+
+#define CN_CN_MAX 512
+#define CN_CN_SHIFT 3
+#define CN_DEPTH_MAX (1 << CN_CN_SHIFT)
+
+#define CN_MAT_DEPTH_MASK (CN_DEPTH_MAX - 1)
+#define CN_MAT_DEPTH(flags) ((flags)&CN_MAT_DEPTH_MASK)
+
+#define CN_MAKETYPE(depth, cn) (CN_MAT_DEPTH(depth) + (((cn)-1) << CN_CN_SHIFT))
+#define CN_MAKE_TYPE CN_MAKETYPE
+
+#define CN_MAT_CN_MASK ((CN_CN_MAX - 1) << CN_CN_SHIFT)
+#define CN_MAT_CN(flags) ((((flags)&CN_MAT_CN_MASK) >> CN_CN_SHIFT) + 1)
+#define CN_MAT_TYPE_MASK (CN_DEPTH_MAX * CN_CN_MAX - 1)
+#define CN_MAT_TYPE(flags) ((flags)&CN_MAT_TYPE_MASK)
+#define CN_MAT_CONT_FLAG_SHIFT 14
+#define CN_MAT_CONT_FLAG (1 << CN_MAT_CONT_FLAG_SHIFT)
+#define CN_IS_MAT_CONT(flags) ((flags)&CN_MAT_CONT_FLAG)
+#define CN_IS_CONT_MAT CN_IS_MAT_CONT
+#define CN_SUBMAT_FLAG_SHIFT 15
+#define CN_SUBMAT_FLAG (1 << CN_SUBMAT_FLAG_SHIFT)
+#define CN_IS_SUBMAT(flags) ((flags)&CN_MAT_SUBMAT_FLAG)
+
+#define CN_IS_MATND_HDR(mat) ((mat) != NULL && (((const CnMat *)(mat))->type & CN_MAGIC_MASK) == CN_MATND_MAGIC_VAL)
+
+#define CN_IS_MATND(mat) (CN_IS_MATND_HDR(mat) && ((const CnMat *)(mat))->data.ptr != NULL)
+#define CN_MATND_MAGIC_VAL 0x42430000
+
+/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
+#define CN_ELEM_SIZE(type)                                                                                             \
+	(CN_MAT_CN(type) << ((((sizeof(size_t) / 4 + 1) * 16384 | 0x3a50) >> CN_MAT_DEPTH(type) * 2) & 3))
+
+#ifndef MIN
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+#endif
+
+#ifndef MAX
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#endif
+
+/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
+#define CN_ELEM_SIZE(type)                                                                                             \
+	(CN_MAT_CN(type) << ((((sizeof(size_t) / 4 + 1) * 16384 | 0x3a50) >> CN_MAT_DEPTH(type) * 2) & 3))
+
+//#include "shim_types_c.h"
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/libs/cnmatrix/include/cnmatrix/cn_matrix.eigen.h b/libs/cnmatrix/include/cnmatrix/cn_matrix.eigen.h
new file mode 100644
index 00000000..286b6339
--- /dev/null
+++ b/libs/cnmatrix/include/cnmatrix/cn_matrix.eigen.h
@@ -0,0 +1,15 @@
+#include "cnmatrix/cn_matrix.h"
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define SV_MATRIX_IS_COL_MAJOR 1
+#define CN_HAS_SQROOT 1
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/libs/cnmatrix/include/cnmatrix/cn_matrix.h b/libs/cnmatrix/include/cnmatrix/cn_matrix.h
new file mode 100644
index 00000000..3e40b214
--- /dev/null
+++ b/libs/cnmatrix/include/cnmatrix/cn_matrix.h
@@ -0,0 +1,478 @@
+#pragma once
+
+#ifndef CN_EXPORT_FUNCTION
+#ifdef _WIN32
+#define CN_EXPORT_FUNCTION __declspec(dllexport)
+#define CN_EXPORT_CLASS CN_EXPORT_FUNCTION
+#define CN_IMPORT_FUNCTION __declspec(dllimport)
+#else
+#define CN_EXPORT_FUNCTION __attribute__((visibility("default")))
+#define CN_EXPORT_CLASS CN_EXPORT_FUNCTION
+#define CN_IMPORT_FUNCTION CN_EXPORT_FUNCTION
+#endif
+#endif
+
+/**
+ * Work around an issue in MSVC where NAN is not a constant expression
+ */
+#ifdef _MSC_VER
+#define _UCRT_NOISY_NAN
+#endif
+
+#include <stdbool.h>
+
+#include <malloc.h>
+#include "cnmatrix/cn_flt.h"
+#include "math.h"
+#include "string.h"
+#include <assert.h>
+
+#define CN_FLT_PTR(m) ((FLT *)((m)->data))
+#define CN_RAW_PTR(m) ((FLT *)((m)->data))
+
+#ifndef M_PI
+# define M_PI		3.14159265358979323846
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+static inline FLT cnMatrixGet(const struct CnMat *mat, int row, int col);
+static inline size_t cn_matrix_idx(const struct CnMat *mat, int row, int col);
+#define CN_MATRIX_DEFAULT(x) = (x)
+}
+#endif
+#ifndef CN_MATRIX_DEFAULT
+#define CN_MATRIX_DEFAULT(x)
+#endif
+
+typedef struct CnMat {
+	int step;
+
+	FLT *data;
+
+	int rows;
+	int cols;
+#ifdef __cplusplus
+    FLT operator()(int i, int j) const { return cnMatrixGet(this, i, j);}
+    FLT& operator()(int i, int j) { return data[cn_matrix_idx(this, i, j)]; }
+    FLT operator()(int i) const { assert(cols == 1); return cnMatrixGet(this, i, 0);}
+    FLT& operator()(int i) { assert(cols == 1); return data[i]; }
+#endif
+} CnMat;
+
+#ifdef USE_EIGEN
+#include "cn_matrix.eigen.h"
+#else
+#include "cn_matrix.blas.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+// Avoids missing-field-initializers errors
+#define ZERO_INITIALIZATION                                                                                            \
+	{}
+#else
+#define ZERO_INITIALIZATION                                                                                            \
+	{ 0 }
+#endif
+
+CnMat *cnInitMatHeader(CnMat *arr, int rows, int cols);
+CnMat *cnCreateMat(int height, int width);
+  
+enum cnInvertMethod {
+	CN_INVERT_METHOD_UNKNOWN = 0,
+	CN_INVERT_METHOD_SVD = 1,
+	CN_INVERT_METHOD_LU = 2,
+	CN_INVERT_METHOD_QR = 3,
+};
+
+double cnInvert(const CnMat *srcarr, CnMat *dstarr, enum cnInvertMethod method);
+void cnSqRootSymmetric(const CnMat *srcarr, CnMat *dstarr);
+void cnRand(CnMat *dstarr, FLT mu, FLT sigma);
+
+enum cnGEMMFlags {
+	CN_GEMM_FLAG_A_T = 1,
+	CN_GEMM_FLAG_B_T = 2,
+	CN_GEMM_FLAG_C_T = 4,
+};
+
+void cnGEMM(const CnMat *src1, const CnMat *src2, double alpha, const CnMat *src3, double beta, CnMat *dst,
+			enum cnGEMMFlags tABC);
+
+FLT cnNorm(const CnMat *s);
+FLT cnNorm2(const CnMat *s);
+FLT cnDistance(const CnMat *a, const CnMat *b);
+void cnSub(CnMat *dest, const CnMat *a, const CnMat *b);
+void cnAdd(CnMat *dest, const CnMat *a, const CnMat *b);
+void cnAddScaled(CnMat *dest, const CnMat *a, FLT as, const CnMat *b, FLT bs);
+void cnScale(CnMat *dest, const CnMat *a, FLT s);
+void cnElementwiseMultiply(CnMat *dest, const CnMat *a, const CnMat *b);
+  FLT cnDot(const CnMat* a, const CnMat* b);
+  
+/**
+ * xarr = argmin_x(Aarr * x - Barr)
+ */
+int cnSolve(const CnMat *Aarr, const CnMat *Barr, CnMat *xarr, enum cnInvertMethod method);
+
+void cnSetZero(CnMat *arr);
+
+void cnCopy(const CnMat *src, CnMat *dest, const CnMat *mask CN_MATRIX_DEFAULT(0));
+void cnCopyROI(const CnMat *src, CnMat *dest, int start_i, int start_j);
+
+CnMat *cnCloneMat(const CnMat *mat);
+
+void cnReleaseMat(CnMat **mat);
+
+enum cnSVDFlags { CN_SVD_MODIFY_A = 1, CN_SVD_U_T = 2, CN_SVD_V_T = 4 };
+
+void cnSVD(CnMat *aarr, CnMat *warr, CnMat *uarr, CnMat *varr, enum cnSVDFlags flags);
+
+void cnMulTransposed(const CnMat *src, CnMat *dst, int order, const CnMat *delta, double scale);
+
+void cnTranspose(const CnMat *M, CnMat *dst);
+
+double cnDet(const CnMat *M);
+
+#ifdef CN_MATRIX_USE_MALLOC
+#define CN_MATRIX_ALLOC(size) calloc(1, size)
+#define CN_MATRIX_FREE(ptr) free(ptr)
+#define CN_MATRIX_STACK_SCOPE_BEGIN {
+#define CN_MATRIX_STACK_SCOPE_END }
+#else
+#define CN_MATRIX_ALLOC(size) (memset(alloca(size), 0, size))
+#define CN_MATRIX_FREE(ptr)
+#define CN_MATRIX_STACK_SCOPE_BEGIN
+#define CN_MATRIX_STACK_SCOPE_END
+#endif
+
+#define CN_CREATE_STACK_VEC(name, rows)					\
+  CN_MATRIX_STACK_SCOPE_BEGIN						\
+  FLT *_##name = (FLT*)CN_MATRIX_ALLOC((rows) * sizeof(FLT));	\
+  CnMat name = cnVec(rows, _##name);
+
+#define CN_CREATE_STACK_MAT(name, rows, cols)                                                                          \
+	CN_MATRIX_STACK_SCOPE_BEGIN                                                                                        \
+	FLT *_##name = (FLT*)CN_MATRIX_ALLOC((rows) * (cols) * sizeof(FLT)); \
+	CnMat name = cnMat(rows, cols, _##name);
+
+#define CN_FREE_STACK_MAT(name)                                                                                        \
+	CN_MATRIX_FREE(_##name);                                                                                           \
+	CN_MATRIX_STACK_SCOPE_END
+
+#ifndef CN_MATRIX_IS_COL_MAJOR
+static inline int cn_stride(const struct CnMat *m) { return m->rows; }
+#else
+static inline int cn_stride(const struct CnMat *m) { return m->cols; }
+#endif
+
+static inline void cn_set_zero(struct CnMat *m) { memset(CN_FLT_PTR(m), 0, sizeof(FLT) * m->rows * m->cols); }
+static inline void cn_set_constant(struct CnMat *m, FLT v) {
+	for (int i = 0; i < m->rows * m->cols; i++)
+		CN_FLT_PTR(m)[i] = v;
+}
+
+static inline bool cn_is_finite(const struct CnMat *m) {
+	for (int i = 0; i < m->rows * m->cols; i++)
+		if (!isfinite(CN_FLT_PTR(m)[i]))
+			return false;
+	return true;
+}
+
+static inline void cn_matrix_copy(struct CnMat *dst, const struct CnMat *src) {
+	cnCopy(src, dst, 0);
+}
+
+static inline FLT *cn_as_vector(struct CnMat *m) {
+	assert(m->rows == 1 || m->cols == 1);
+	return CN_FLT_PTR(m);
+}
+
+static inline const FLT *cn_as_const_vector(const struct CnMat *m) {
+	assert(m->rows == 1 || m->cols == 1);
+	return CN_FLT_PTR(m);
+}
+
+/** Inline constructor. No data is allocated internally!!!
+ * (Use together with cnCreateData, or use cnCreateMat instead to
+ * get a matrix with allocated data):
+ */
+static inline CnMat cnMat(int rows, int cols, FLT *data) {
+	CnMat m = ZERO_INITIALIZATION;
+
+	m.cols = cols;
+	m.rows = rows;
+#ifndef CN_MATRIX_IS_COL_MAJOR
+	m.step = m.cols;
+#else
+	m.step = m.rows;
+#endif
+
+	if (!data) {
+		m.data = (FLT *)calloc(m.cols * m.rows, sizeof(FLT));
+	} else {
+		m.data = (FLT *)data;
+	}
+
+#if SURVIVE_ASAN_CHECKS
+	volatile double v = cvmGet(&m, rows - 1, cols - 1);
+	(void)v;
+#endif
+
+	return m;
+}
+
+  static inline CnMat cnMatCalloc(int height, int width) {
+    return cnMat(height, width, (FLT *)calloc(height * width, sizeof(FLT)));
+  }
+
+static inline CnMat cnVec(int rows, FLT *data) { return cnMat(rows, 1, data); }
+
+static inline size_t cn_matrix_idx(const CnMat *mat, int row, int col) {
+	assert((unsigned)row < (unsigned)mat->rows && (unsigned)col < (unsigned)mat->cols);
+#ifndef CN_MATRIX_IS_COL_MAJOR
+	return (size_t)mat->step * row + col;
+#else
+	return (size_t)mat->step * col + row;
+#endif
+}
+
+static inline CnMat cnMatView(int rows, int cols, CnMat* V, int r0, int c0) {
+    if(rows != 0 && cols != 0) {
+		cn_matrix_idx(V, r0, c0);
+		cn_matrix_idx(V, r0 + rows - 1, c0 + cols - 1);
+	}
+	CnMat rtn = cnMat(rows, cols, cn_matrix_idx(V, r0, c0) + V->data);
+	rtn.step = V->step;
+	return rtn;
+}
+
+static inline CnMat cnMatConstView(int rows, int cols, const CnMat* V, int r0, int c0) {
+    return cnMatView(rows, cols, (CnMat*)V, r0, c0);
+}
+
+/*
+The function is a fast replacement for cvGetReal2D in the case of single-channel floating-point
+matrices. It is faster because it is inline, it does fewer checks for array type and array element
+type, and it checks for the row and column ranges only in debug mode.
+@param mat Input matrix
+@param row The zero-based index of row
+@param col The zero-based index of column
+ */
+static inline FLT cnMatrixGet(const CnMat *mat, int row, int col) { return mat->data[cn_matrix_idx(mat, row, col)]; }
+
+static inline FLT *cnMatrixPtr(const CnMat *mat, int row, int col) { return &mat->data[cn_matrix_idx(mat, row, col)]; }
+/** @brief Sets a specific element of a single-channel floating-point matrix.
+
+The function is a fast replacement for cvSetReal2D in the case of single-channel floating-point
+matrices. It is faster because it is inline, it does fewer checks for array type and array element
+type, and it checks for the row and column ranges only in debug mode.
+@param mat The matrix
+@param row The zero-based index of row
+@param col The zero-based index of column
+@param value The new value of the matrix element
+ */
+static inline void cnMatrixSet(CnMat *mat, int row, int col, FLT value) {
+	mat->data[cn_matrix_idx(mat, row, col)] = value;
+}
+static inline void cnMatrixOptionalSet(CnMat *mat, int row, int col, FLT value) {
+	if(row >= mat->rows || col >= mat->cols) return;
+	cnMatrixSet(mat, row, col, value);
+}
+
+static inline void cn_get_diag(const struct CnMat *m, FLT *v, size_t cnt) {
+	for (size_t i = 0; i < cnt; i++) {
+		v[i] = cnMatrixGet(m, i, i);
+	}
+}
+static inline void cn_set_diag(struct CnMat *m, const FLT *v) {
+	for (int i = 0; i < m->rows; i++) {
+		for (int j = 0; j < m->cols; j++) {
+			cnMatrixSet(m, i, j, i == j ? (v ? v[i] : 1.) : 0.);
+		}
+	}
+}
+
+static inline bool cn_is_symmetrical(const struct CnMat *m) {
+	if(m->rows != m->cols) return false;
+
+	for (int i = 0; i < m->rows; i++) {
+		for (int j = 0; j < i; j++) {
+			if(cnMatrixGet(m, i, j) != cnMatrixGet(m, j, i))
+				return false;
+		}
+	}
+	return true;
+}
+
+static inline void cn_add_diag(struct CnMat *m, const CnMat* t, FLT scale) {
+	assert(m->rows == m->cols);
+	assert(m->rows == t->rows);
+	assert(t->cols == 1);
+	for (int i = 0; i < m->rows; i++) {
+		cnMatrixSet(m, i, i, cnMatrixGet(m, i, i) + cn_as_const_vector(t)[i] * scale);
+	}
+}
+
+CN_EXPORT_FUNCTION void cn_print_mat(const CnMat* M);
+static inline void cn_set_diag_val(struct CnMat *m, FLT v) {
+	for (int i = 0; i < m->rows; i++) {
+		for (int j = 0; j < m->cols; j++) {
+			cnMatrixSet(m, i, j, i == j ? v : 0.);
+		}
+	}
+}
+
+static inline void cn_eye(struct CnMat *m, const FLT *v) {
+	for (int i = 0; i < m->rows; i++) {
+		for (int j = 0; j < m->cols; j++) {
+			cnMatrixSet(m, i, j, i == j ? (v ? v[i] : 1.) : 0.);
+		}
+	}
+}
+
+static inline void cn_copy_in_row_major_roi(struct CnMat *dst, const FLT *src, size_t src_stride, int start_i,
+											int start_j, int end_i, int end_j) {
+	for (int i = start_i; i < end_i; i++) {
+		for (int j = start_j; j < end_j; j++) {
+			cnMatrixSet(dst, i, j, src[j + i * src_stride]);
+		}
+	}
+}
+
+static inline void cn_copy_in_row_major(struct CnMat *dst, const FLT *src, size_t src_stride) {
+	cn_copy_in_row_major_roi(dst, src, src_stride, 0, 0, dst->rows, dst->cols);
+}
+static inline FLT cn_sum(const struct CnMat *A) {
+	FLT rtn = 0;
+	for (int i = 0; i < A->rows; i++) {
+		for (int j = 0; j < A->cols; j++) {
+			rtn += cnMatrixGet(A, i, j);
+		}
+	}
+	return rtn;
+}
+static inline FLT cn_trace(const struct CnMat *A) {
+	FLT rtn = 0;
+	int min_dim = A->rows;
+	if (min_dim > A->cols)
+		min_dim = A->cols;
+	for (int i = 0; i < min_dim; i++) {
+		for (int j = 0; j < min_dim; j++) {
+			rtn += cnMatrixGet(A, i, j);
+		}
+	}
+
+	return rtn;
+}
+
+static inline void cn_copy_data_in(struct CnMat *A, bool isRowMajor, const FLT *d) {
+    assert(A && d);
+#ifdef CN_MATRIX_IS_COL_MAJOR
+	bool needsFlip = isRowMajor;
+#else
+	bool needsFlip = !isRowMajor;
+#endif
+	if (needsFlip) {
+		CnMat t = cnMat(A->cols, A->rows, (FLT *)d);
+		cnTranspose(&t, A);
+	} else {
+		memcpy(A->data, d, A->rows * A->cols * sizeof(FLT));
+	}
+}
+
+static inline void cn_row_major_to_internal(struct CnMat *A, const FLT *d) { cn_copy_data_in(A, true, d); }
+
+static inline void cn_col_major_to_internal(struct CnMat *A, const FLT *d) { cn_copy_data_in(A, false, d); }
+
+static inline CnMat cnMat_from_row_major(int rows, int cols, FLT *data) {
+	CnMat rtn = cnMat(rows, cols, data);
+	cn_row_major_to_internal(&rtn, data);
+	return rtn;
+}
+
+static inline CnMat cnMat_from_col_major(int rows, int cols, FLT *data) {
+	CnMat rtn = cnMat(rows, cols, data);
+	cn_col_major_to_internal(&rtn, data);
+	return rtn;
+}
+
+static inline void cn_elementwise_subtract(struct CnMat *dst, const struct CnMat *A, const struct CnMat *B) {
+    assert(dst->rows == A->rows && dst->cols == A->cols);
+    assert(dst->rows == B->rows && dst->cols == B->cols);
+	for (int i = 0; i < A->rows; i++) {
+		for (int j = 0; j < A->cols; j++) {
+			cnMatrixSet(dst, i, j, cnMatrixGet(A, i, j) - cnMatrixGet(B, i, j));
+		}
+	}
+}
+static inline void cn_elementwise_add(struct CnMat *dst, const struct CnMat *A, const struct CnMat *B) {
+    assert(dst->rows == A->rows && dst->cols == A->cols);
+    assert(dst->rows == B->rows && dst->cols == B->cols);
+    for (int i = 0; i < A->rows; i++) {
+        for (int j = 0; j < A->cols; j++) {
+            cnMatrixSet(dst, i, j, cnMatrixGet(A, i, j) + cnMatrixGet(B, i, j));
+        }
+    }
+}
+
+static inline void cn_multiply_scalar(struct CnMat *dst, const struct CnMat *src, FLT scale) {
+    assert(dst->rows == src->rows && dst->cols == src->cols);
+	for(int i = 0;i < dst->cols * dst->rows;i++) dst->data[i] = src->data[i] * scale;
+}
+
+CN_EXPORT_FUNCTION void cn_ABAt_add(struct CnMat *out, const struct CnMat *A, const struct CnMat *B, const struct CnMat *C);
+CN_EXPORT_FUNCTION const char* cnMatrixBackend();
+
+
+static inline FLT cn_norm2(const struct CnMat *A) {
+	FLT r = 0;
+	for(int i = 0;i < A->cols * A->rows;i++) r += A->data[i] * A->data[i];
+	return r;
+}
+#ifndef CN_MATRIX_IS_COL_MAJOR
+static inline CnMat cn_row(struct CnMat *M, int r) {
+	assert(r < M->rows);
+	return cnMat(1, M->cols, M->data + M->step * r);
+}
+#endif
+
+#define CNMATRIX_LOCAL_COPY(dst, src) \
+{                            \
+(dst) = *(src);                             \
+(dst).step = (dst).cols;         \
+(dst).data = (FLT*)alloca(sizeof(FLT) * (src)->rows * (src)->cols); \
+cnCopy(src, &(dst), 0);      \
+}\
+
+#define CNMATRIX_LOCAL_COPY_IF_ALIAS(dst, src) if((dst).data == (src)->data) {CNMATRIX_LOCAL_COPY(dst, src);}
+
+#ifdef __cplusplus
+}
+
+#include <vector>
+static inline std::vector<FLT> cnMatToVector(const CnMat& m) {
+    assert(m.cols == 1);
+    std::vector<FLT> v;
+    v.resize(m.rows);
+    for(int i = 0;i < m.rows;i++)
+        v[i] = m.data[i];
+    return v;
+}
+static inline std::vector<std::vector<FLT>> cnMatToVectorVector(const CnMat& m) {
+    std::vector<std::vector<FLT>> vv;
+    vv.resize(m.rows);
+    for(int i = 0;i < m.rows;i++) {
+        vv[i].resize(m.cols);
+        for (int j = 0; j < m.cols; j++) {
+            vv[i][j] = cnMatrixGet(&m, i, j);
+        }
+    }
+    return vv;
+}
+#define CNMATRIX_INCLUDED_FIRST
+#include <cnmatrix/cn_matrix.hpp>
+
+#endif
diff --git a/libs/cnmatrix/include/cnmatrix/cn_matrix.hpp b/libs/cnmatrix/include/cnmatrix/cn_matrix.hpp
new file mode 100644
index 00000000..c39b63be
--- /dev/null
+++ b/libs/cnmatrix/include/cnmatrix/cn_matrix.hpp
@@ -0,0 +1,37 @@
+#pragma once
+#ifndef CNMATRIX_INCLUDED_FIRST
+#error "Include <cnmatrix/cn_matrix.h>; not cn_matrix.hpp"
+#endif
+#include <cnmatrix/cn_matrix.h>
+#include <memory>
+
+namespace cnmatrix {
+    struct Matrix {
+        ::CnMat mat;
+        std::shared_ptr<FLT> storage;
+        Matrix(int rows, int cols = 1, FLT* data = 0) {
+            if(data == 0) {
+                storage = std::shared_ptr<FLT>((FLT *)calloc(rows * cols, sizeof(FLT)), free);
+                data = storage.get();
+            }
+            mat = cnMat(rows, cols, data);
+        }
+        static Matrix Like(const Matrix& a) {
+            return Matrix(a.mat.rows, a.mat.cols);
+        }
+        static Matrix Like(const CnMat & a) {
+            return Matrix(a.rows, a.cols);
+        }
+        operator const CnMat*() const { return &mat; }
+        operator CnMat*() { return &mat; }
+        operator const CnMat&() const { return mat; }
+        operator CnMat&() { return mat; }
+        Matrix(const CnMat& cpy) : Matrix(cpy.rows, cpy.cols){
+            cnCopy(&cpy, *this, 0);
+        }
+        FLT operator()(int i, int j) const { return mat(i, j);}
+        FLT& operator()(int i, int j) { return mat(i,j); }
+        FLT operator()(int i) const { return mat(i); }
+        FLT& operator()(int i) { return mat(i); }
+    };
+}
\ No newline at end of file
diff --git a/libs/cnmatrix/src/CMakeLists.txt b/libs/cnmatrix/src/CMakeLists.txt
new file mode 100644
index 00000000..c238febf
--- /dev/null
+++ b/libs/cnmatrix/src/CMakeLists.txt
@@ -0,0 +1,86 @@
+SET(CN_MATRIX_SRCS ../include/cnmatrix/cn_matrix.h cn_matrix.c)
+
+IF(USE_EIGEN)
+  SET(CN_MATRIX_SRCS ${CN_MATRIX_SRCS} eigen/core.cpp eigen/gemm.cpp eigen/svd.cpp eigen/internal.h)
+ELSE()
+  SET(CN_MATRIX_SRCS ${CN_MATRIX_SRCS} cn_matrix.blas.c)
+ENDIF()
+
+IF(WIN32)
+  set(packages_config "<packages><package id=\"lapacke\" version=\"0.1.0\" targetFramework=\"native\" /><package id=\"OpenBLAS\" version=\"0.2.14.1\" targetFramework=\"native\" /></packages>")
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/packages.config ${packages_config})
+  ADD_DEFINITIONS(-DHAVE_LAPACK_CONFIG_H -DLAPACK_COMPLEX_STRUCTURE)
+  set(CN_MATRIX_SRCS ${CN_MATRIX_SRCS} ${CMAKE_CURRENT_BINARY_DIR}/packages.config)
+endif()
+
+macro(find_library_path var pkg)
+  find_library(${var}_RAWFIND ${pkg} ${ARGN})
+  if(${var}_RAWFIND)
+    set(${var} ${${var}_RAWFIND})
+    message("Found ${pkg} at ${${var}}")
+  else()
+    message("Could not find ${pkg}")
+    set(${var} ${pkg})
+  endif()
+endmacro()
+find_library_path(CBLAS_LOCATION cblas HINTS /usr/local/opt/openblas/lib)
+find_library_path(OPENBLAS_LOCATION openblas HINTS /usr/local/opt/openblas/lib)
+find_library_path(LAPACKE_LOCATION lapacke HINTS /usr/local/opt/lapack/lib)
+
+if(USE_OPENBLAS OR WIN32)
+  set(BLAS_BACKEND "openblas")
+  include_directories(openblas)
+else()
+  set(BLAS_BACKEND "${CBLAS_LOCATION}")  
+  find_file(CBLAS_FULL_PATH cblas.h HINT "/usr/local/opt/openblas/include")
+  message("Searching for 'cblas.h'; found at ${CBLAS_FULL_PATH}")
+  IF(CBLAS_FULL_PATH)
+    get_filename_component(CBLAS_PATH ${CBLAS_FULL_PATH} DIRECTORY)
+    include_directories( ${CBLAS_PATH})
+  ENDIF()
+endif()
+
+
+check_include_file(lapacke/lapacke.h LAPACKE_FILE)
+if(LAPACKE_FILE)
+  add_definitions(-DLAPACKE_FOLDER)
+endif()
+
+if(USE_EIGEN)
+    message("Using eigen backend")
+else()
+    message("Using blas backend ${BLAS_BACKEND}")
+endif()
+
+add_library(cnmatrix STATIC ${CN_MATRIX_SRCS})
+target_include_directories(cnmatrix PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include/>
+  $<INSTALL_INTERFACE:include/>)
+
+if(USE_SINGLE_PRECISION)
+  target_compile_definitions(cnmatrix PUBLIC CN_USE_FLOAT)
+endif()
+
+set_target_properties(cnmatrix PROPERTIES FOLDER "libraries")
+
+IF(USE_EIGEN)
+  add_definitions     ( ${EIGEN3_DEFINITIONS} )
+  include_directories ( ${EIGEN3_INCLUDE_DIR} )
+
+  if(WIN32)
+    target_compile_options(cnmatrix PRIVATE -bigobj)
+  else()
+    target_compile_options(cnmatrix PRIVATE -fno-exceptions -fPIC)
+  endif()
+ELSE()
+  IF(UNIX)
+    target_link_libraries(cnmatrix ${BLAS_BACKEND} ${LAPACKE_LOCATION} m)
+  elseif(WIN32)
+    include_directories(${CMAKE_BINARY_DIR}/packages/OpenBLAS.0.2.14.1/lib/native/include/)
+    target_link_libraries(cnmatrix
+      ${CMAKE_BINARY_DIR}/packages/OpenBLAS.0.2.14.1/lib/native/lib/${WIN_PLATFORM}/libopenblas.dll.a
+      )
+  ENDIF()
+ENDIF()
+
+install(TARGETS cnmatrix DESTINATION lib)
diff --git a/libs/cnmatrix/src/cn_matrix.blas.c b/libs/cnmatrix/src/cn_matrix.blas.c
new file mode 100644
index 00000000..de349cb2
--- /dev/null
+++ b/libs/cnmatrix/src/cn_matrix.blas.c
@@ -0,0 +1,601 @@
+#include <cblas.h>
+#ifdef LAPACKE_FOLDER
+#include <lapacke/lapacke.h>
+#include <lapacke/lapacke_utils.h>
+#else
+#include <lapacke.h>
+#include <lapacke_utils.h>
+#endif
+
+#include "math.h"
+#include "stdbool.h"
+#include "stdio.h"
+#include "string.h"
+#include "cnmatrix/cn_matrix.h"
+
+#include <limits.h>
+#include <stdarg.h>
+
+#ifdef _WIN32
+#define SURVIVE_LOCAL_ONLY
+#include <malloc.h>
+#define alloca _alloca
+#else
+#define SURVIVE_LOCAL_ONLY __attribute__((visibility("hidden")))
+#endif
+
+#define CN_Error(code, msg) assert(0 && msg); // cv::error( code, msg, CN_Func, __FILE__, __LINE__ )
+
+const int DECOMP_SVD = 1;
+const int DECOMP_LU = 2;
+
+void print_mat(const CnMat *M);
+
+static size_t mat_size_bytes(const CnMat *mat) { return (size_t)sizeof(FLT) * mat->cols * mat->rows; }
+
+#ifdef CN_USE_FLOAT
+#define cblas_gemm cblas_sgemm
+#define cblas_symm cblas_ssymm
+#define LAPACKE_getrs LAPACKE_sgetrs
+#define LAPACKE_getrf LAPACKE_sgetrf
+#define LAPACKE_getri LAPACKE_sgetri
+#define LAPACKE_gelss LAPACKE_sgelss
+#define LAPACKE_gesvd LAPACKE_sgesvd
+#define LAPACKE_gesvd_work LAPACKE_sgesvd_work
+#define LAPACKE_getri_work LAPACKE_sgetri_work
+#define LAPACKE_ge_trans LAPACKE_sge_trans
+#define LAPACKE_potrf LAPACKE_spotrf
+#else
+#define cblas_gemm cblas_dgemm
+#define cblas_symm cblas_dsymm
+#define LAPACKE_getrs LAPACKE_dgetrs
+#define LAPACKE_getrf LAPACKE_dgetrf
+#define LAPACKE_getri LAPACKE_dgetri
+#define LAPACKE_gelss LAPACKE_dgelss
+#define LAPACKE_gesvd LAPACKE_dgesvd
+#define LAPACKE_gesvd_work LAPACKE_dgesvd_work
+#define LAPACKE_getri_work LAPACKE_dgetri_work
+#define LAPACKE_ge_trans LAPACKE_dge_trans
+    #define LAPACKE_potrf LAPACKE_dpotrf
+#endif
+
+// dst = alpha * src1 * src2 + beta * src3 or dst = alpha * src2 * src1 + beta * src3 where src1 is symm
+SURVIVE_LOCAL_ONLY void cnSYMM(const CnMat *src1, const CnMat *src2, double alpha, const CnMat *src3, double beta,
+							   CnMat *dst, bool src1First) {
+
+	int rows1 = src1->rows;
+	int cols1 = src1->cols;
+
+	int rows2 = src2->rows;
+	int cols2 = src2->cols;
+
+	if (src3) {
+		int rows3 = src3->rows;
+		int cols3 = src3->cols;
+		assert(rows3 == dst->rows);
+		assert(cols3 == dst->cols);
+	}
+
+	// assert(src3 == 0 || beta != 0);
+	assert(cols1 == rows2);
+	assert(rows1 == dst->rows);
+	assert(cols2 == dst->cols);
+
+	lapack_int lda = src1->cols;
+	lapack_int ldb = src2->cols;
+
+	if (src3)
+		cnCopy(src3, dst, 0);
+	else
+		beta = 0;
+
+	assert(CN_RAW_PTR(dst) != CN_RAW_PTR(src1));
+	assert(CN_RAW_PTR(dst) != CN_RAW_PTR(src2));
+	/*
+		void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order,
+						 OPENBLAS_CONST enum CBLAS_SIDE Side,
+						 OPENBLAS_CONST enum CBLAS_UPLO Uplo,
+						 OPENBLAS_CONST blasint M,
+						 OPENBLAS_CONST blasint N,
+						 OPENBLAS_CONST double alpha,
+						 OPENBLAS_CONST double *A,
+						 OPENBLAS_CONST blasint lda,
+						 OPENBLAS_CONST double *B,
+						 OPENBLAS_CONST blasint ldb,
+						 OPENBLAS_CONST double beta,
+						 double *C,
+						 OPENBLAS_CONST blasint ldc);
+	*/
+	cblas_symm(CblasRowMajor, src1First ? CblasLeft : CblasRight, CblasUpper, dst->rows, dst->cols, alpha,
+			   CN_RAW_PTR(src1), lda, CN_RAW_PTR(src2), ldb, beta, CN_RAW_PTR(dst), dst->cols);
+}
+
+// Special case dst = alpha * src2 * src1 * src2' + beta * src3
+void mulBABt(const CnMat *src1, const CnMat *src2, double alpha, const CnMat *src3, double beta, CnMat *dst) {
+	size_t dims = src2->rows;
+	assert(src2->cols == src2->rows);
+	CN_CREATE_STACK_MAT(tmp, dims, dims);
+
+	// This has been profiled; and weirdly enough the SYMM version is slower for a 19x19 matrix. Guessing access order
+	// or some other cache thing matters more than the additional 2x multiplications.
+//#define USE_SYM
+#ifdef USE_SYM
+	cnSYMM(src1, src2, 1, 0, 0, &tmp, false);
+	cnGEMM(&tmp, src2, alpha, src3, beta, dst, CN_GEMM_B_T);
+#else
+	cnGEMM(src1, src2, 1, 0, 0, &tmp, CN_GEMM_FLAG_B_T);
+	cnGEMM(src2, &tmp, alpha, src3, beta, dst, 0);
+#endif
+	CN_FREE_STACK_MAT(tmp);
+}
+
+// dst = alpha * src1 * src2 + beta * src3
+SURVIVE_LOCAL_ONLY void cnGEMM(const CnMat *src1, const CnMat *src2, double alpha, const CnMat *src3, double beta,
+							   CnMat *dst, enum cnGEMMFlags tABC) {
+
+	int rows1 = (tABC & CN_GEMM_FLAG_A_T) ? src1->cols : src1->rows;
+	int cols1 = (tABC & CN_GEMM_FLAG_A_T) ? src1->rows : src1->cols;
+
+	int rows2 = (tABC & CN_GEMM_FLAG_B_T) ? src2->cols : src2->rows;
+	int cols2 = (tABC & CN_GEMM_FLAG_B_T) ? src2->rows : src2->cols;
+
+	CnMat src1_local = *src1;
+	CnMat src2_local = *src2;
+
+	CNMATRIX_LOCAL_COPY_IF_ALIAS(src1_local, src1);
+	CNMATRIX_LOCAL_COPY_IF_ALIAS(src2_local, src2);
+
+	if (src3) {
+		int rows3 = (tABC & CN_GEMM_FLAG_C_T) ? src3->cols : src3->rows;
+		int cols3 = (tABC & CN_GEMM_FLAG_C_T) ? src3->rows : src3->cols;
+		assert(rows3 == dst->rows);
+		assert(cols3 == dst->cols);
+	}
+
+	// assert(src3 == 0 || beta != 0);
+	assert(cols1 == rows2);
+	assert(rows1 == dst->rows);
+	assert(cols2 == dst->cols);
+
+	if(dst->rows == 0 || dst->cols == 0)
+		return;
+
+	lapack_int lda = src1_local.step;
+	lapack_int ldb = src2_local.step;
+
+	if (src3)
+		cnCopy(src3, dst, 0);
+	else
+		beta = 0;
+
+	assert(dst->cols > 0);
+	cblas_gemm(CblasRowMajor, (tABC & CN_GEMM_FLAG_A_T) ? CblasTrans : CblasNoTrans,
+			   (tABC & CN_GEMM_FLAG_B_T) ? CblasTrans : CblasNoTrans, dst->rows, dst->cols, cols1, alpha,
+			   CN_RAW_PTR(&src1_local), lda, CN_RAW_PTR(&src2_local), ldb, beta, CN_RAW_PTR(dst), dst->step);
+}
+
+// dst = scale * src ^ t * src     iff order == 1
+// dst = scale *     src * src ^ t iff order == 0
+SURVIVE_LOCAL_ONLY void cnMulTransposed(const CnMat *src, CnMat *dst, int order, const CnMat *delta, double scale) {
+	lapack_int rows = src->rows;
+	lapack_int cols = src->cols;
+
+	lapack_int drows = order == 0 ? dst->rows : dst->cols;
+	assert(drows == dst->cols);
+	assert(order == 1 ? (dst->cols == src->cols) : (dst->cols == src->rows));
+	assert(delta == 0 && "This isn't implemented yet");
+	double beta = 0;
+
+	bool isAT = order == 1;
+	bool isBT = !isAT;
+
+	lapack_int dstCols = dst->cols;
+	assert(dstCols > 0);
+	cblas_gemm(CblasRowMajor, isAT ? CblasTrans : CblasNoTrans, isBT ? CblasTrans : CblasNoTrans, dst->rows, dst->cols,
+			   order == 1 ? src->rows : src->cols, scale, CN_RAW_PTR(src), src->cols, CN_RAW_PTR(src), src->cols, beta,
+			   CN_RAW_PTR(dst), dstCols);
+}
+
+/* IEEE754 constants and macros */
+#define CN_TOGGLE_FLT(x) ((x) ^ ((int)(x) < 0 ? 0x7fffffff : 0))
+#define CN_TOGGLE_DBL(x) ((x) ^ ((int64)(x) < 0 ? CN_BIG_INT(0x7fffffffffffffff) : 0))
+
+#define CN_DbgAssert assert
+
+#define CN_CREATE_MAT_HEADER_ALLOCA(stack_mat, rows, cols)                                                             \
+	CnMat *stack_mat = cnInitMatHeader(CN_MATRIX_ALLOC(sizeof(CnMat)), rows, cols);
+
+#define CN_CREATE_MAT_ALLOCA(stack_mat, height, width)                                                                 \
+	CN_CREATE_MAT_HEADER_ALLOCA(stack_mat, height, width);                                                             \
+	(stack_mat)->data = CN_MATRIX_ALLOC(mat_size_bytes(stack_mat));
+
+#define CN_MAT_ALLOCA_FREE(stack_mat)                                                                                  \
+	{                                                                                                                  \
+		CN_MATRIX_FREE(stack_mat->data);                                                                               \
+		CN_MATRIX_FREE(stack_mat);                                                                                     \
+	}
+
+#define CREATE_CN_STACK_MAT(name, rows, cols, type)                                                                    \
+	FLT *_##name = alloca(rows * cols * sizeof(FLT));                                                                  \
+	CnMat name = cnMat(rows, cols, _##name);
+
+static lapack_int LAPACKE_gesvd_static_alloc(int matrix_layout, char jobu, char jobvt, lapack_int m, lapack_int n,
+											 FLT *a, lapack_int lda, FLT *s, FLT *u, lapack_int ldu, FLT *vt,
+											 lapack_int ldvt, FLT *superb) {
+	lapack_int info = 0;
+	lapack_int lwork = -1;
+	FLT *work = NULL;
+	FLT work_query;
+	lapack_int i;
+	if (matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR) {
+		LAPACKE_xerbla("LAPACKE_dgesvd", -1);
+		return -1;
+	}
+
+	/* Query optimal working array(s) size */
+	info = LAPACKE_gesvd_work(matrix_layout, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, &work_query, lwork);
+	if (info != 0) {
+		goto exit_level_0;
+	}
+	lwork = (lapack_int)work_query;
+	/* Allocate memory for work arrays */
+	work = (FLT *)CN_MATRIX_ALLOC(sizeof(FLT) * lwork);
+	memset(work, 0, sizeof(FLT) * lwork);
+
+	if (work == NULL) {
+		info = LAPACK_WORK_MEMORY_ERROR;
+		goto exit_level_0;
+	}
+	/* Call middle-level interface */
+	info = LAPACKE_gesvd_work(matrix_layout, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork);
+	/* Backup significant data from working array(s) */
+	for (i = 0; i < MIN(m, n) - 1; i++) {
+		superb[i] = work[i + 1];
+	}
+	CN_MATRIX_FREE(work);
+
+exit_level_0:
+	if (info == LAPACK_WORK_MEMORY_ERROR) {
+		LAPACKE_xerbla("LAPACKE_dgesvd", info);
+	}
+	return info;
+}
+
+static inline lapack_int LAPACKE_getri_static_alloc(int matrix_layout, lapack_int n, FLT *a, lapack_int lda,
+													const lapack_int *ipiv) {
+	lapack_int info = 0;
+	lapack_int lwork = -1;
+	FLT *work = NULL;
+	FLT work_query;
+	if (matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR) {
+		LAPACKE_xerbla("LAPACKE_dgetri", -1);
+		return -1;
+	}
+	/* Query optimal working array(s) size */
+	info = LAPACKE_getri_work(matrix_layout, n, a, lda, ipiv, &work_query, lwork);
+	if (info != 0) {
+		goto exit_level_0;
+	}
+	lwork = (lapack_int)work_query;
+	/* Allocate memory for work arrays */
+	work = (FLT *)alloca(sizeof(FLT) * lwork);
+	memset(work, 0, sizeof(FLT) * lwork);
+	if (work == NULL) {
+		info = LAPACK_WORK_MEMORY_ERROR;
+		goto exit_level_0;
+	}
+	/* Call middle-level interface */
+	info = LAPACKE_getri_work(matrix_layout, n, a, lda, ipiv, work, lwork);
+	/* Release memory and exit */
+
+exit_level_0:
+	if (info == LAPACK_WORK_MEMORY_ERROR) {
+		LAPACKE_xerbla("LAPACKE_dgetri", info);
+	}
+	return info;
+}
+
+SURVIVE_LOCAL_ONLY double cnInvert(const CnMat *srcarr, CnMat *dstarr, enum cnInvertMethod method) {
+	lapack_int inf;
+	lapack_int rows = srcarr->rows;
+	lapack_int cols = srcarr->cols;
+	lapack_int lda = srcarr->step;
+
+	cnCopy(srcarr, dstarr, 0);
+	FLT *a = CN_RAW_PTR(dstarr);
+
+#ifdef DEBUG_PRINT
+	printf("a: \n");
+	print_mat(srcarr);
+#endif
+	if (method == CN_INVERT_METHOD_LU) {
+		lapack_int *ipiv = CN_MATRIX_ALLOC(sizeof(lapack_int) * MIN(srcarr->rows, srcarr->cols));
+
+		lapack_int lda_t = MAX(1, rows);
+
+		FLT *a_t = (FLT *)CN_MATRIX_ALLOC(sizeof(FLT) * lda_t * MAX(1, cols));
+		LAPACKE_ge_trans(LAPACK_ROW_MAJOR, rows, cols, a, lda, a_t, lda_t);
+
+		inf = LAPACKE_getrf(LAPACK_COL_MAJOR, rows, cols, a_t, lda, ipiv);
+		assert(inf == 0);
+
+		inf = LAPACKE_getri_static_alloc(LAPACK_COL_MAJOR, rows, a_t, lda, ipiv);
+		assert(inf >= 0);
+
+		LAPACKE_ge_trans(LAPACK_COL_MAJOR, rows, cols, a_t, lda, a, lda_t);
+
+		if (inf > 0) {
+			printf("Warning: Singular matrix: \n");
+			// print_mat(srcarr);
+		}
+
+		CN_MATRIX_FREE(a_t);
+		CN_MATRIX_FREE(ipiv);
+		// free(ipiv);
+
+	} else if (method == DECOMP_SVD) {
+		CN_CREATE_STACK_MAT(w, 1, MIN(dstarr->rows, dstarr->cols));
+		CN_CREATE_STACK_MAT(u, dstarr->cols, dstarr->cols);
+		CN_CREATE_STACK_MAT(v, dstarr->rows, dstarr->rows);
+		CN_CREATE_STACK_MAT(um, w.cols, w.cols);
+
+		cnSVD(dstarr, &w, &u, &v, 0);
+
+		cnSetZero(&um);
+		for (int i = 0; i < w.cols; i++) {
+			if (_w[i] != 0.0)
+				cnMatrixSet(&um, i, i, 1. / (_w)[i]);
+		}
+
+        CN_CREATE_STACK_MAT(tmp, dstarr->cols, dstarr->rows);
+
+		cnGEMM(&v, &um, 1, 0, 0, &tmp, 0);
+		cnGEMM(&tmp, &u, 1, 0, 0, dstarr, CN_GEMM_FLAG_B_T);
+
+		CN_FREE_STACK_MAT(um);
+		CN_FREE_STACK_MAT(v);
+		CN_FREE_STACK_MAT(u);
+		CN_FREE_STACK_MAT(w);
+	} else {
+		assert(0 && "Bad argument");
+		return -1;
+	}
+
+	return 0;
+}
+
+#define CN_CLONE_MAT_ALLOCA(stack_mat, mat)                                                                            \
+	CN_CREATE_MAT_ALLOCA(stack_mat, mat->rows, mat->cols)                                                              \
+	cnCopy(mat, stack_mat, 0);
+
+static int cnSolve_LU(const CnMat *Aarr, const CnMat *Barr, CnMat *xarr) {
+	lapack_int inf;
+	lapack_int arows = Aarr->rows;
+	lapack_int acols = Aarr->cols;
+	lapack_int xcols = Barr->cols;
+	lapack_int xrows = Barr->rows;
+	lapack_int lda = acols; // Aarr->step / sizeof(double);
+
+	assert(Aarr->cols == xarr->rows);
+	assert(Barr->rows == Aarr->rows);
+	assert(xarr->cols == Barr->cols);
+
+	cnCopy(Barr, xarr, 0);
+	FLT *a_ws = CN_MATRIX_ALLOC(mat_size_bytes(Aarr));
+	memcpy(a_ws, CN_RAW_PTR(Aarr), mat_size_bytes(Aarr));
+
+	lapack_int brows = xarr->rows;
+	lapack_int bcols = xarr->cols;
+	lapack_int ldb = bcols; // Barr->step / sizeof(double);
+
+	lapack_int *ipiv = CN_MATRIX_ALLOC(sizeof(lapack_int) * MIN(Aarr->rows, Aarr->cols));
+
+	inf = LAPACKE_getrf(LAPACK_ROW_MAJOR, arows, acols, (a_ws), lda, ipiv);
+	assert(inf >= 0);
+	if (inf > 0) {
+		printf("Warning: Singular matrix: \n");
+		// print_mat(a_ws);
+	}
+
+#ifdef DEBUG_PRINT
+	printf("Solve A * x = B:\n");
+	// print_mat(a_ws);
+	print_mat(Barr);
+#endif
+
+	inf = LAPACKE_getrs(LAPACK_ROW_MAJOR, CblasNoTrans, arows, bcols, (a_ws), lda, ipiv, CN_RAW_PTR(xarr), ldb);
+	assert(inf == 0);
+
+	CN_MATRIX_FREE(a_ws);
+	CN_MATRIX_FREE(ipiv);
+	return 0;
+}
+
+void cnSqRootSymmetric(const CnMat *srcarr, CnMat *dstarr) {
+    assert(srcarr->rows == srcarr->cols);
+    assert(dstarr->rows == dstarr->cols);
+    assert(dstarr->rows == srcarr->cols);
+
+    cnCopy(srcarr, dstarr, 0);
+    int info = LAPACKE_potrf(LAPACK_ROW_MAJOR, 'L', srcarr->cols, dstarr->data, dstarr->step);
+    for(int i = 0;i < dstarr->cols;i++) {
+        for(int j = i + 1;j < dstarr->cols;j++) {
+            cnMatrixSet(dstarr, i, j, 0);
+        }
+    }
+    assert(info >= 0);
+}
+
+static inline int cnSolve_SVD(const CnMat *Aarr, const CnMat *Barr, CnMat *xarr) {
+	lapack_int arows = Aarr->rows;
+	lapack_int acols = Aarr->cols;
+	lapack_int xcols = Barr->cols;
+
+	bool xLargerThanB = Barr->rows > acols;
+	CnMat *xCpy = 0;
+	if (xLargerThanB) {
+		CN_CLONE_MAT_ALLOCA(xCpyStack, Barr);
+		xCpy = xCpyStack;
+	} else {
+		xCpy = xarr;
+		memcpy(CN_RAW_PTR(xarr), CN_RAW_PTR(Barr), mat_size_bytes(Barr));
+	}
+
+	// CnMat *aCpy = cnCloneMat(Aarr);
+	FLT *aCpy = CN_MATRIX_ALLOC(mat_size_bytes(Aarr));
+	memcpy(aCpy, CN_RAW_PTR(Aarr), mat_size_bytes(Aarr));
+
+	FLT *S = CN_MATRIX_ALLOC(sizeof(FLT) * MIN(arows, acols));
+	// FLT *S = malloc(sizeof(FLT) * MIN(arows, acols));
+	FLT rcond = -1;
+	lapack_int *rank = CN_MATRIX_ALLOC(sizeof(lapack_int) * MIN(arows, acols));
+	lapack_int inf =
+		LAPACKE_gelss(LAPACK_ROW_MAJOR, arows, acols, xcols, (aCpy), acols, CN_RAW_PTR(xCpy), xcols, S, rcond, rank);
+	assert(xarr->rows == acols);
+	assert(xarr->cols == xCpy->cols);
+
+	if (xLargerThanB) {
+		xCpy->rows = acols;
+		cnCopy(xCpy, xarr, 0);
+		// cnReleaseMat(&xCpy);
+	}
+
+	CN_MATRIX_FREE(rank);
+	CN_MATRIX_FREE(aCpy);
+	CN_MATRIX_FREE(S);
+	if (xLargerThanB) {
+		CN_MAT_ALLOCA_FREE(xCpy);
+	}
+
+	assert(inf == 0);
+	if (inf != 0)
+		return inf;
+	return 0;
+}
+
+SURVIVE_LOCAL_ONLY int cnSolve(const CnMat *Aarr, const CnMat *Barr, CnMat *xarr, enum cnInvertMethod method) {
+	if (method == CN_INVERT_METHOD_LU) {
+		return cnSolve_LU(Aarr, Barr, xarr);
+	} else if (method == CN_INVERT_METHOD_SVD || method == CN_INVERT_METHOD_QR) {
+		return cnSolve_SVD(Aarr, Barr, xarr);
+	} else {
+		assert("Unknown method to solve" && 0);
+	}
+	return -1;
+}
+
+SURVIVE_LOCAL_ONLY void cnTranspose(const CnMat *M, CnMat *dst) {
+	bool inPlace = M == dst || CN_RAW_PTR(M) == CN_RAW_PTR(dst);
+	FLT *src = CN_RAW_PTR(M);
+
+	if (inPlace) {
+		src = alloca(mat_size_bytes(M));
+		memcpy(src, CN_RAW_PTR(M), mat_size_bytes(M));
+	} else {
+		assert(M->rows == dst->cols);
+		assert(M->cols == dst->rows);
+	}
+
+	for (unsigned i = 0; i < M->rows; i++) {
+		for (unsigned j = 0; j < M->cols; j++) {
+			CN_RAW_PTR(dst)[j * M->rows + i] = src[i * M->cols + j];
+		}
+	}
+}
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define MEMORY_SANITIZER_IGNORE __attribute__((no_sanitize("memory")))
+#endif
+#endif
+#ifndef MEMORY_SANITIZER_IGNORE
+#define MEMORY_SANITIZER_IGNORE
+#endif
+
+#define CALLOCA(size) memset(alloca(size), 0, size)
+
+SURVIVE_LOCAL_ONLY void cnSVD(CnMat *aarr, CnMat *warr, CnMat *uarr, CnMat *varr, enum cnSVDFlags flags) {
+	char jobu = 'A';
+	char jobvt = 'A';
+
+	lapack_int inf;
+
+	if ((flags & CN_SVD_MODIFY_A) == 0) {
+		aarr = cnCloneMat(aarr);
+	}
+
+	if (uarr == 0)
+		jobu = 'N';
+	if (varr == 0)
+		jobvt = 'N';
+
+	lapack_int arows = aarr->rows, acols = aarr->cols;
+
+	FLT *pw = warr ? CN_RAW_PTR(warr) : (FLT *)CALLOCA(sizeof(FLT) * arows * acols);
+	FLT *pu = uarr ? CN_RAW_PTR(uarr) : (FLT *)CALLOCA(sizeof(FLT) * arows * arows);
+	FLT *pv = varr ? CN_RAW_PTR(varr) : (FLT *)CALLOCA(sizeof(FLT) * acols * acols);
+
+	lapack_int ulda = uarr ? uarr->step : aarr->step;
+	lapack_int plda = varr ? varr->step : aarr->step;
+
+	FLT *superb = CALLOCA(sizeof(FLT) * MIN(arows, acols));
+	inf = LAPACKE_gesvd_static_alloc(LAPACK_ROW_MAJOR, jobu, jobvt, arows, acols, CN_RAW_PTR(aarr), acols, pw, pu, ulda,
+									 pv, plda, superb);
+
+	switch (inf) {
+	case -6:
+		assert(false && "matrix has NaNs");
+		break;
+	case 0:
+		break;
+	default:
+		assert(inf == 0);
+	}
+
+	if (uarr && (flags & CN_SVD_U_T)) {
+		cnTranspose(uarr, uarr);
+	}
+
+	if (varr && (flags & CN_SVD_V_T) == 0) {
+		cnTranspose(varr, varr);
+	}
+
+	if ((flags & CN_SVD_MODIFY_A) == 0) {
+	    free(aarr->data);
+		cnReleaseMat(&aarr);
+	}
+}
+
+const char* cnMatrixBackend() {
+	return "BLAS";
+}
+
+SURVIVE_LOCAL_ONLY double cnDet(const CnMat *M) {
+	assert(M->rows == M->cols);
+	assert(M->rows <= 3 && "cnDet unimplemented for matrices >3");
+
+	FLT *m = CN_RAW_PTR(M);
+
+	switch (M->rows) {
+	case 1:
+		return m[0];
+	case 2: {
+		return m[0] * m[3] - m[1] * m[2];
+	}
+	case 3: {
+		FLT m00 = m[0], m01 = m[1], m02 = m[2], m10 = m[3], m11 = m[4], m12 = m[5], m20 = m[6], m21 = m[7], m22 = m[8];
+
+		return m00 * (m11 * m22 - m12 * m21) - m01 * (m10 * m22 - m12 * m20) + m02 * (m10 * m21 - m11 * m20);
+	}
+	default:
+		abort();
+	}
+}
+
+// https://fossies.org/linux/OpenBLAS/USAGE.md
+extern void __attribute__ ((__weak__)) openblas_set_num_threads(int);
+void __attribute__((constructor)) force_thread_count() {
+  if(openblas_set_num_threads) {
+    openblas_set_num_threads(1);
+  }
+}
diff --git a/libs/cnmatrix/src/cn_matrix.c b/libs/cnmatrix/src/cn_matrix.c
new file mode 100644
index 00000000..290048e7
--- /dev/null
+++ b/libs/cnmatrix/src/cn_matrix.c
@@ -0,0 +1,224 @@
+#include "cnmatrix/cn_matrix.h"
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifdef _WIN32
+#define CN_LOCAL_ONLY
+#include <malloc.h>
+#define alloca _alloca
+#else
+#define CN_LOCAL_ONLY __attribute__((visibility("hidden")))
+#endif
+
+#define CN_Error(code, msg) assert(0 && msg); // cv::error( code, msg, CN_Func, __FILE__, __LINE__ )
+CN_LOCAL_ONLY CnMat *cnCloneMat(const CnMat *mat) {
+	CnMat *rtn = cnCreateMat(mat->rows, mat->cols);
+	cnCopy(mat, rtn, 0);
+	return rtn;
+}
+
+static size_t mat_size_bytes(const CnMat *mat) { return (size_t)sizeof(FLT) * mat->cols * mat->rows; }
+
+FLT cnDistance(const CnMat *a, const CnMat *b) {
+    assert(a->cols == b->cols);
+    assert(a->rows == b->rows);
+    FLT r = 0;
+    for (int i = 0; i < a->cols * a->rows; i++) {
+        FLT dx = a->data[i] - b->data[i];
+        r += dx*dx;
+    }
+    return FLT_SQRT(r);
+}
+FLT cnNorm2(const CnMat *s) {
+  FLT r = 0;
+  const FLT* in = s->data;
+  for (int i = 0; i < s->cols * s->rows; i++) {
+    r += in[i] * in[i];
+  }
+  return r;
+}
+FLT cnNorm(const CnMat *s) { return FLT_SQRT(cnNorm2(s)); }
+
+#define ITER_MATRIX(a, b, expr) \
+	assert(a->cols * a->rows == b->cols * b->rows);\
+	for (int i = 0; i < a->rows; i++) \
+		for (int j = 0; j < a->cols; j++) {              \
+							  FLT A = cnMatrixGet(a, i,j);                 \
+							  FLT B = cnMatrixGet(b, i,j);                 \
+			expr;\
+		}\
+
+#define SET_MATRIX(dst, a, b, expr) \
+	assert(a->cols * a->rows == dest->cols * dest->rows);\
+	assert(b->cols * b->rows == dest->cols * dest->rows);\
+	for (int i = 0; i < dest->rows; i++) \
+		for (int j = 0; j < dest->cols; j++) {              \
+							  FLT A = cnMatrixGet(a, i,j);                 \
+							  FLT B = cnMatrixGet(b, i,j);                 \
+			cnMatrixSet(dst, i, j, expr);\
+		}\
+
+#define SET_MATRIX_UNARY(dst, a, expr) \
+	assert(a->cols * a->rows == dest->cols * dest->rows);\
+	for (int i = 0; i < dest->rows; i++) \
+		for (int j = 0; j < dest->cols; j++) {              \
+							  FLT A = cnMatrixGet(a, i,j);                 \
+			cnMatrixSet(dst, i, j, expr);\
+		}\
+
+#define SET_MATRIX_EXPR(dest, expr) \
+	for (int i = 0; i < dest->rows; i++) \
+		for (int j = 0; j < dest->cols; j++) {              \
+			cnMatrixSet(dest, i, j, expr);\
+		}\
+
+void cnSub(CnMat *dest, const CnMat *a, const CnMat *b) {
+	SET_MATRIX(dest, a, b, A - B);
+}
+void cnAdd(CnMat *dest, const CnMat *a, const CnMat *b) {
+	SET_MATRIX(dest, a, b, A + B)
+}
+void cnAddScaled(CnMat *dest, const CnMat *a, FLT as, const CnMat *b, FLT bs) {
+	SET_MATRIX(dest, a, b, A * as + B * bs)
+}
+void cnScale(CnMat *dest, const CnMat *a, FLT s) {
+	SET_MATRIX_UNARY(dest, a, A * s);
+}
+void cnElementwiseMultiply(CnMat *dest, const CnMat *a, const CnMat *b) {
+	SET_MATRIX(dest, a, b, A * B)
+}
+
+FLT cnDot(const CnMat* a, const CnMat* b) {
+  FLT rtn = 0;
+  ITER_MATRIX(a, b, rtn += A * B);
+  return rtn;
+}
+
+static inline int linmath_imin(int x, int y) { return x < y ? x : y; }
+
+void cnCopy(const CnMat *src, CnMat *dest, const CnMat *mask) {
+  assert(mask == 0 && "This isn't implemented yet");
+  if (src->rows == dest->rows && src->cols == dest->cols && src->cols == src->step && dest->cols == dest->step) {
+    memcpy(CN_RAW_PTR(dest), CN_RAW_PTR(src), mat_size_bytes(src));
+  } else {
+    for (int i = 0; i < linmath_imin(src->rows, dest->rows); i++)
+      for (int j = 0; j < linmath_imin(src->cols, dest->cols); j++)
+	cnMatrixSet(dest, i, j, cnMatrixGet(src, i, j));
+  }       
+}
+
+static FLT linmath_normrand(FLT mu, FLT sigma) {
+    static const double epsilon = 0.0000001;
+
+    static double z1= NAN;
+    static bool generate = false;
+    generate = !generate;
+
+    if (!generate && false)
+        return z1 * sigma + mu;
+
+    double u1, u2;
+    do {
+        u1 = rand() * (1.0 / RAND_MAX);
+        u2 = rand() * (1.0 / RAND_MAX);
+    } while (u1 <= epsilon);
+
+    double z0 = sqrt(-2.0 * log(u1)) * cos(M_PI * 2. * u2);
+    z1 = sqrt(-2.0 * log(u1)) * sin(M_PI * 2. * u2);
+    return z0 * sigma + mu;
+}
+
+void cnRand(CnMat *arr, FLT mu, FLT sigma) {
+	SET_MATRIX_EXPR(arr, linmath_normrand(mu, sigma));
+}
+
+CN_LOCAL_ONLY void cnSetZero(CnMat *arr) {
+	SET_MATRIX_EXPR(arr, 0);
+}
+CN_LOCAL_ONLY void cvSetIdentity(CnMat *arr) {
+	SET_MATRIX_EXPR(arr, i == j);
+}
+
+CN_LOCAL_ONLY void cnReleaseMat(CnMat **mat) {
+	free(*mat);
+	*mat = 0;
+}
+
+/* the alignment of all the allocated buffers */
+#define CN_MALLOC_ALIGN 16
+CN_LOCAL_ONLY void *cnAlloc(size_t size) { return malloc(size); }
+
+CN_LOCAL_ONLY void cnCreateData(CnMat *arr) {
+  size_t step = arr->step;
+	CnMat *mat = (CnMat *)arr;
+
+	if (mat->rows == 0 || mat->cols == 0)
+		return;
+
+	if (CN_FLT_PTR(mat) != 0)
+		CN_Error(CN_StsError, "Data is already allocated");
+    assert(step != 0);
+
+	int64_t total_size = (int64_t)step * mat->rows * sizeof(FLT);
+	mat->data = (FLT *)cnAlloc(total_size);
+}
+
+CnMat *cnInitMatHeader(CnMat *arr, int rows, int cols) {
+	assert(!(rows < 0 || cols < 0));
+
+	arr->step = cols;
+	arr->rows = rows;
+	arr->cols = cols;
+	arr->data = 0;
+
+	return arr;
+}
+
+CN_LOCAL_ONLY CnMat *cnCreateMatHeader(int rows, int cols) {
+	return cnInitMatHeader((CnMat *)cnAlloc(sizeof(CnMat)), rows, cols);
+}
+CN_LOCAL_ONLY CnMat *cnCreateMat(int height, int width) {
+	CnMat *arr = cnCreateMatHeader(height, width);
+	cnCreateData(arr);
+
+	return arr;
+}
+
+inline void cn_ABAt_add(struct CnMat *out, const struct CnMat *A, const struct CnMat *B, const struct CnMat *C) {
+	CN_CREATE_STACK_MAT(tmp, A->rows, B->cols);
+	cnGEMM(A, B, 1, 0, 0, &tmp, 0);
+    cnGEMM(&tmp, A, 1, C, 1, out, CN_GEMM_FLAG_B_T);
+	CN_FREE_STACK_MAT(tmp);
+}
+
+void cnCopyROI(const CnMat *src, CnMat *dest, int start_i, int start_j) {
+    for (int i = 0; i < src->rows; i++) {
+        for (int j = 0; j < src->cols; j++) {
+            cnMatrixSet(dest, start_i + i, start_j + j, cnMatrixGet(src, i, j));
+        }
+    }
+}
+void cn_print_mat(const CnMat *M) {
+	bool newlines = M->cols > 1;
+	char term = newlines ? '\n' : ' ';
+	if (!M) {
+		fprintf(stdout, "null%c", term);
+		return;
+	}
+	fprintf(stdout, "%2d x %2d:%c", M->rows, M->cols, term);
+
+	for (unsigned i = 0; i < M->rows; i++) {
+		for (unsigned j = 0; j < M->cols; j++) {
+			FLT v = cnMatrixGet(M, i, j);
+			//                         "+1.4963228e-10,
+			if (v == 0)
+				fprintf(stdout, "             0, ");
+			else
+				fprintf(stdout, "%+7.7e, ", v);
+		}
+		if (newlines && M->cols > 1)
+			fprintf(stdout, "\n");
+	}
+	fprintf(stdout, "\n");
+}
diff --git a/libs/cnmatrix/src/eigen/core.cpp b/libs/cnmatrix/src/eigen/core.cpp
new file mode 100644
index 00000000..5fd07539
--- /dev/null
+++ b/libs/cnmatrix/src/eigen/core.cpp
@@ -0,0 +1,53 @@
+//#define EIGEN_RUNTIME_NO_MALLOC
+#include "internal.h"
+const char* cnMatrixBackend() {
+	return "Eigen";
+}
+
+void cnSqRootSymmetric(const CnMat *srcarr, CnMat *dstarr) {
+    auto src = CONVERT_TO_EIGEN_PTR(srcarr);
+    auto dst = CONVERT_TO_EIGEN_PTR(dstarr);
+
+    EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(false);
+    dst.noalias() = Eigen::LLT<MatrixType>(src).matrixL().toDenseMatrix();
+}
+
+void cnMulTransposed(const CnMat *src, CnMat *dst, int order, const CnMat *delta, double scale) {
+	auto srcEigen = CONVERT_TO_EIGEN_PTR(src);
+	auto dstEigen = CONVERT_TO_EIGEN_PTR(dst);
+
+	int drows = order == 0 ? dst->rows : dst->cols;
+	assert(drows == dst->cols);
+	assert(order == 1 ? (dst->cols == src->cols) : (dst->cols == src->rows));
+
+	if (delta) {
+		auto deltaEigen = CONVERT_TO_EIGEN_PTR(delta);
+		if (order == 0)
+			dstEigen.noalias() = scale * (srcEigen - deltaEigen) * (srcEigen - deltaEigen).transpose();
+		else
+			dstEigen.noalias() = scale * (srcEigen - deltaEigen).transpose() * (src - delta);
+	} else {
+		if (order == 0)
+			dstEigen.noalias() = scale * srcEigen * srcEigen.transpose();
+		else
+			dstEigen.noalias() = scale * srcEigen.transpose() * srcEigen;
+	}
+}
+
+void cnTranspose(const CnMat *M, CnMat *dst) {
+	auto src = CONVERT_TO_EIGEN_PTR(M);
+	auto dstEigen = CONVERT_TO_EIGEN_PTR(dst);
+	if (CN_FLT_PTR(M) == CN_FLT_PTR(dst))
+		dstEigen = src.transpose().eval();
+	else
+		dstEigen.noalias() = src.transpose();
+}
+
+void print_mat(const CnMat *M);
+
+double cnDet(const CnMat *M) {
+	EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(false);
+	auto MEigen = CONVERT_TO_EIGEN_PTR(M);
+	return MEigen.determinant();
+}
+
diff --git a/libs/cnmatrix/src/eigen/gemm.cpp b/libs/cnmatrix/src/eigen/gemm.cpp
new file mode 100644
index 00000000..5c344c12
--- /dev/null
+++ b/libs/cnmatrix/src/eigen/gemm.cpp
@@ -0,0 +1,66 @@
+#include "internal.h"
+
+extern "C" void cnGEMM(const CnMat *_src1tmp, const CnMat *_src2tmp, double alpha, const CnMat *_src3tmp, double beta,
+					   CnMat *_dst, enum cnGEMMFlags tABC) {
+    CnMat _src1 = *_src1tmp;
+    CnMat _src2 = *_src2tmp;
+    CnMat _src3 = { };
+	if (_src3tmp) {
+        _src3 = *_src3tmp;
+		//assert(_src3->data != _src2->data);
+		//assert(_src3->data != _src1->data);
+        CNMATRIX_LOCAL_COPY_IF_ALIAS(_src3, _dst);
+		//assert(_src3->data != _dst->data);
+	}
+	//assert(_src2->data != _src1->data);
+	//assert(_src2->data != _dst->data);
+	//assert(_src1->data != _dst->data);
+
+    CNMATRIX_LOCAL_COPY_IF_ALIAS(_src1, _dst);
+    CNMATRIX_LOCAL_COPY_IF_ALIAS(_src2, _dst);
+
+	int rows1 = (tABC & CN_GEMM_FLAG_A_T) ? _src1.cols : _src1.rows;
+	int cols1 = (tABC & CN_GEMM_FLAG_A_T) ? _src1.rows : _src1.cols;
+
+	int rows2 = (tABC & CN_GEMM_FLAG_B_T) ? _src2.cols : _src2.rows;
+	int cols2 = (tABC & CN_GEMM_FLAG_B_T) ? _src2.rows : _src2.cols;
+
+	if (_src3.data) {
+		int rows3 = (tABC & CN_GEMM_FLAG_C_T) ? _src3.cols : _src3.rows;
+		int cols3 = (tABC & CN_GEMM_FLAG_C_T) ? _src3.rows : _src3.cols;
+		assert(rows3 == _dst->rows);
+		assert(cols3 == _dst->cols);
+	}
+
+	// assert(src3 == 0 || beta != 0);
+	assert(cols1 == rows2);
+	assert(rows1 == _dst->rows);
+	assert(cols2 == _dst->cols);
+
+	auto src1 = CONVERT_TO_EIGEN(_src1);
+	auto src2 = CONVERT_TO_EIGEN(_src2);
+
+	auto dst = CONVERT_TO_EIGEN_PTR(_dst);
+
+	EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(false);
+	if (tABC & CN_GEMM_FLAG_A_T)
+		if (tABC & CN_GEMM_FLAG_B_T)
+			dst.noalias() = alpha * src1.transpose() * src2.transpose();
+		else
+			dst.noalias() = alpha * src1.transpose() * src2;
+	else {
+		if (tABC & CN_GEMM_FLAG_B_T)
+			dst.noalias() = alpha * src1 * src2.transpose();
+		else
+			dst.noalias() = alpha * src1 * src2;
+	}
+
+	if (_src3.data) {
+		auto src3 = CONVERT_TO_EIGEN(_src3);
+		if (tABC & CN_GEMM_FLAG_C_T)
+			dst.noalias() += beta * src3.transpose();
+		else
+			dst.noalias() += beta * src3;
+	}
+	//assert(cn_is_finite(_dst));
+}
diff --git a/libs/cnmatrix/src/eigen/internal.h b/libs/cnmatrix/src/eigen/internal.h
new file mode 100644
index 00000000..9d450351
--- /dev/null
+++ b/libs/cnmatrix/src/eigen/internal.h
@@ -0,0 +1,23 @@
+#define EIGEN_NO_DEBUG
+#pragma GCC optimize ("O3")
+
+#include "cnmatrix/cn_matrix.h"
+#include <Eigen/Core>
+#include <Eigen/Dense>
+
+#ifdef EIGEN_RUNTIME_NO_MALLOC
+#define EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(v) EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(v)
+#else
+#define EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(v)
+#endif
+
+#ifdef cn_MATRIX_IS_COL_MAJOR
+typedef Eigen::Matrix<FLT, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor, 50, 50> MatrixType;
+#else
+typedef Eigen::Matrix<FLT, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor, 50, 50> MatrixType;
+#endif
+typedef Eigen::OuterStride<> StrideType;
+typedef Eigen::Map<MatrixType, 0, StrideType> MapType;
+
+#define CONVERT_TO_EIGEN(A) MapType(CN_FLT_PTR(&A), (A).rows, (A).cols, StrideType((A).step))
+#define CONVERT_TO_EIGEN_PTR(A) (A ? CONVERT_TO_EIGEN(*A) : MapType(0, 0, 0, StrideType(0)))
diff --git a/libs/cnmatrix/src/eigen/svd.cpp b/libs/cnmatrix/src/eigen/svd.cpp
new file mode 100644
index 00000000..895acd2e
--- /dev/null
+++ b/libs/cnmatrix/src/eigen/svd.cpp
@@ -0,0 +1,80 @@
+#include "internal.h"
+#include <Eigen/LU>
+#include <Eigen/SVD>
+#include <Eigen/QR>
+
+
+double cnInvert(const CnMat *srcarr, CnMat *dstarr, enum cnInvertMethod method) {
+	auto src = CONVERT_TO_EIGEN_PTR(srcarr);
+	auto dst = CONVERT_TO_EIGEN_PTR(dstarr);
+
+	assert(srcarr->rows == dstarr->cols);
+	assert(srcarr->cols == dstarr->rows);
+
+	EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(false);
+	if (method == CN_INVERT_METHOD_LU) {
+		assert(srcarr->rows == srcarr->cols);
+		dst.noalias() = src.inverse();
+	} else {
+		dst.noalias() = src.completeOrthogonalDecomposition().pseudoInverse();
+	}
+	return 0;
+}
+
+extern "C" int cnSolve(const CnMat *_Aarr, const CnMat *_Barr, CnMat *_xarr, enum cnInvertMethod method) {
+	auto Aarr = CONVERT_TO_EIGEN_PTR(_Aarr);
+	auto Barr = CONVERT_TO_EIGEN_PTR(_Barr);
+	auto xarr = CONVERT_TO_EIGEN_PTR(_xarr);
+
+	if (method == CN_INVERT_METHOD_LU) {
+		xarr.noalias() = Aarr.partialPivLu().solve(Barr);
+	} else if (method == CN_INVERT_METHOD_QR) {
+		xarr.noalias() = Aarr.colPivHouseholderQr().solve(Barr);
+	} else {
+		EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(true);
+		auto cnd = Aarr.jacobiSvd(
+			Eigen::ComputeFullU |
+			Eigen::ComputeFullV); 
+		EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(false);
+		xarr.noalias() = cnd.solve(Barr);
+	}
+	return 0;
+}
+
+extern "C" void cnSVD(CnMat *aarr, CnMat *warr, CnMat *uarr, CnMat *varr, enum cnSVDFlags flags) {
+	auto aarrEigen = CONVERT_TO_EIGEN_PTR(aarr);
+	auto warrEigen = CONVERT_TO_EIGEN_PTR(warr);
+
+	int options = 0;
+	if (uarr)
+		options |= Eigen::ComputeFullU;
+	if (varr)
+		options |= Eigen::ComputeFullV;
+	EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(true);
+	auto cnd = aarrEigen.jacobiSvd(options);
+	EIGEN_RUNTIME_SET_IS_MALLOC_ALLOWED(false);
+
+	if (warrEigen.cols() == 1) {
+		warrEigen.noalias() = cnd.singularValues();
+	} else if (warrEigen.rows() == 1) {
+		warrEigen.noalias() = cnd.singularValues().transpose();
+	} else {
+		warrEigen.diagonal().noalias() = cnd.singularValues();
+	}
+
+	if (uarr) {
+		auto uarrEigen = CONVERT_TO_EIGEN_PTR(uarr);
+		if (flags & CN_SVD_U_T)
+			uarrEigen.noalias() = cnd.matrixU().transpose();
+		else
+			uarrEigen.noalias() = cnd.matrixU();
+	}
+
+	if (varr) {
+		auto varrEigen = CONVERT_TO_EIGEN_PTR(varr);
+		if (flags & CN_SVD_V_T)
+			varrEigen.noalias() = cnd.matrixV().transpose();
+		else
+			varrEigen.noalias() = cnd.matrixV();
+	}
+}
diff --git a/libs/cnmatrix/tests/CMakeLists.txt b/libs/cnmatrix/tests/CMakeLists.txt
new file mode 100644
index 00000000..1dffe933
--- /dev/null
+++ b/libs/cnmatrix/tests/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories(../include)
+
+add_executable(cnmatrix_test cn_matrixtest.c)
+target_link_libraries(cnmatrix_test cnmatrix)
+
+add_test(NAME cnmatrix_test COMMAND cnmatrix_test)
diff --git a/libs/cnmatrix/tests/cn_matrixtest.c b/libs/cnmatrix/tests/cn_matrixtest.c
new file mode 100644
index 00000000..44707e89
--- /dev/null
+++ b/libs/cnmatrix/tests/cn_matrixtest.c
@@ -0,0 +1,356 @@
+#include "cnmatrix/cn_matrix.h"
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+void print_mat(const CnMat *M) {
+	for (int i = 0; i < M->rows; i++) {
+		for (int j = 0; j < M->cols; j++) {
+			printf("%f,\t", cnMatrixGet(M, i, j));
+		}
+		printf("\n");
+	}
+	printf("\n");
+}
+
+bool assertFLTEquals(FLT a, FLT b) { return fabs(a - b) < 0.0001; }
+
+int assertFLTAEquals(FLT *a, FLT *b, int length) {
+	for (int i = 0; i < length; i++) {
+		if (assertFLTEquals(a[i], b[i]) != true) {
+			return i;
+		}
+	}
+	return -1;
+}
+
+int checkMatFLTAEquals(const struct CnMat *a, const FLT *b) {
+	for (int i = 0; i < a->rows; i++) {
+		for (int j = 0; j < a->cols; j++) {
+			if (assertFLTEquals(cnMatrixGet(a, i, j), b[i * a->cols + j]) == false)
+				return i * a->cols + j;
+		}
+	}
+	return -1;
+}
+
+#define PRINT_MAT(name)                                                                                                \
+	printf(#name "\n");                                                                                                \
+	print_mat(&name);
+
+void test_gemm() {
+	FLT _2x3[2 * 3] = {1, 2, 3, 4, 5, 6};
+	CnMat m2x3 = cnMat_from_row_major(2, 3, _2x3);
+
+	FLT _3x2[2 * 3] = {1, 3, 5, 2, 4, 6};
+	CnMat m3x2 = cnMat_from_col_major(3, 2, _3x2);
+
+	FLT _2x2[2 * 2] = {0};
+	CnMat m2x2 = cnMat_from_row_major(2, 2, _2x2);
+
+	FLT _3x3[3 * 3] = {0};
+	CnMat m3x3 = cnMat_from_row_major(3, 3, _3x3);
+
+	cnGEMM(&m2x3, &m3x2, 1, 0, 0, &m2x2, 0);
+	cnGEMM(&m3x2, &m2x3, 1, 0, 0, &m3x3, 0);
+
+	PRINT_MAT(m2x3);
+	PRINT_MAT(m3x2);
+	PRINT_MAT(m3x3);
+	PRINT_MAT(m2x2);
+
+	{
+		FLT m2x2_gt[] = {
+			22.000000,
+			28.000000,
+			49.000000,
+			64.000000,
+		};
+		assert(checkMatFLTAEquals(&m2x2, m2x2_gt) < 0);
+	}
+
+	FLT m2x3_gt[] = {
+		1.000000, 2.000000, 3.000000, 4.000000, 5.000000, 6.000000,
+	};
+	assert(checkMatFLTAEquals(&m2x3, m2x3_gt) < 0);
+
+	cnGEMM(&m2x3, &m2x3, 1, 0, 0, &m3x3, CN_GEMM_FLAG_A_T);
+	cnGEMM(&m2x3, &m2x3, 1, 0, 0, &m2x2, CN_GEMM_FLAG_B_T);
+	PRINT_MAT(m3x3);
+	PRINT_MAT(m2x2);
+
+	{
+		FLT m3x3_gt[] = {
+			17.000000, 22.000000, 27.000000, 22.000000, 29.000000, 36.000000, 27.000000, 36.000000, 45.000000,
+		};
+		assert(checkMatFLTAEquals(&m3x3, m3x3_gt) < 0);
+	}
+
+	FLT m2x2_gt[] = {
+		14.000000,
+		32.000000,
+		32.000000,
+		77.000000,
+	};
+	assert(checkMatFLTAEquals(&m2x2, m2x2_gt) < 0);
+
+	cnGEMM(&m2x3, &m3x2, 1, 0, 0, &m3x3, CN_GEMM_FLAG_A_T | CN_GEMM_FLAG_B_T);
+	//  cvGEMM(&m3x2, &m2x3, 1, 0, 0, &m2x2, CN_GEMM_A_T | CN_GEMM_B_T);
+
+	FLT m3x3_gt[] = {
+		9.000000, 19.000000, 29.000000, 12.000000, 26.000000, 40.000000, 15.000000, 33.000000, 51.000000,
+	};
+	assert(checkMatFLTAEquals(&m3x3, m3x3_gt) < 0);
+    PRINT_MAT(m3x3);
+
+    FLT alias_gt[] = {1368,    2972,    4576, 2958,    6430,    9902, 4548,    9888,   15228};
+    cnGEMM(&m3x3, &m3x3, 3, &m3x3, 2, &m3x3, CN_GEMM_FLAG_A_T);
+    PRINT_MAT(m3x3);
+    assert(checkMatFLTAEquals(&m3x3, alias_gt) < 0);
+
+}
+
+static void test_solve() {
+	{
+		FLT _A[3] = {1, 2, 3};
+		FLT _B[3] = {4, 8, 12};
+		FLT _x[1] = {0};
+
+		CnMat A = cnMat_from_row_major(3, 1, _A);
+		CnMat B = cnMat_from_row_major(3, 1, _B);
+		CnMat x = cnMat_from_row_major(1, 1, _x);
+
+		cnSolve(&A, &B, &x, CN_INVERT_METHOD_SVD);
+
+		assert(fabs(_x[0] - 4) < .001);
+	}
+
+	{
+		FLT _A[3] = {1, 2, 3};
+		FLT _B[9] = {4, 5, 6, 7, 8, 9, 10, 11, 12};
+		FLT _x[3] = {0};
+
+		CnMat A = cnMat_from_row_major(3, 1, _A);
+		CnMat B = cnMat_from_row_major(3, 3, _B);
+		CnMat x = cnMat_from_row_major(1, 3, _x);
+
+		cnSolve(&A, &B, &x, CN_INVERT_METHOD_SVD);
+	}
+}
+
+static void test_invert() {
+	printf("Invert:\n");
+
+	FLT _3x3[3 * 3] = {1, 2, 3, 4, 5, 6, 7, 8, 12};
+	CnMat m3x3 = cnMat_from_row_major(3, 3, _3x3);
+
+	FLT _d3x3[3 * 3] = {0};
+	CnMat d3x3 = cnMat_from_row_major(3, 3, _d3x3);
+
+	FLT _i3x3[3 * 3] = {0};
+	CnMat i3x3 = cnMat_from_row_major(3, 3, _i3x3);
+
+	cnInvert(&m3x3, &d3x3, CN_INVERT_METHOD_LU);
+
+	cnGEMM(&m3x3, &d3x3, 1, 0, 0, &i3x3, 0);
+	print_mat(&d3x3);
+	print_mat(&i3x3);
+
+	FLT t = cn_trace(&i3x3);
+	FLT s = cn_sum(&i3x3);
+
+	assertFLTEquals(t, 3.);
+	assertFLTEquals(s - t, 0);
+}
+static void test_invert2() {
+    FLT _P[] = {+1.4192384e-03, +8.9898875e-06,              0, +9.7868476e-03, +4.2227069e-05,              0, +4.3565721e-10, +7.8105213e-09,
+                +8.9898875e-06, +1.4821856e-03, +1.4099495e-09, +4.2254865e-05, +1.0083015e-02,              0,              0, +7.9221930e-06,
+                0, +1.4099495e-09, +1.0021303e-07,              0, +2.1108031e-08, +9.9999912e-08,              0, +3.1559669e-07,
+                +9.7868476e-03, +4.2254865e-05,              0, +1.4500066e-01, +2.9311173e-04,              0, +9.9479136e-08, +8.8850986e-08,
+                +4.2227069e-05, +1.0083015e-02, +2.1108031e-08, +2.9311173e-04, +1.4712435e-01,              0,              0, +4.5619949e-04,
+                0,              0, +9.9999912e-08,              0,              0, +1.0001000e-01,              0, +1.7626956e-10,
+                +4.3565721e-10,              0,              0, +9.9479136e-08,              0,              0, +1.0009992e-02,              0,
+                +7.8105213e-09, +7.9221930e-06, +3.1559669e-07, +8.8850986e-08, +4.5619949e-04, +1.7626956e-10,              0, +2.7158585e-02,};
+    CnMat P = cnMat_from_row_major(8, 8, _P);
+    CN_CREATE_STACK_MAT(iP, 8, 8);
+    cnInvert(&P, &iP, CN_INVERT_METHOD_SVD);
+    CN_CREATE_STACK_MAT(I, 8,8);
+    cn_set_diag_val(&I, 1);
+
+    CN_CREATE_STACK_MAT(should_be_I, 8, 8);
+    cnGEMM(&P, &iP, 1, 0, 0, &should_be_I, 0);
+
+    assert(checkMatFLTAEquals(&I, should_be_I.data));
+
+    print_mat(&I);
+    print_mat(&P);
+    print_mat(&iP);
+}
+static void test_invert3() {
+    FLT _P[] = {+1.4192384e-03, +8.9898875e-06,              0, +9.7868476e-03, +4.2227069e-05,              0, +4.3565721e-10, +7.8105213e-09,
+                +8.9898875e-06, +1.4821856e-03, +1.4099495e-09, +4.2254865e-05, +1.0083015e-02,              0,              0, +7.9221930e-06,
+                0, +1.4099495e-09, +1.0021303e-07,              0, +2.1108031e-08, +9.9999912e-08,              0, +3.1559669e-07,
+                +9.7868476e-03, +4.2254865e-05,              0, +1.4500066e-01, +2.9311173e-04,              0, +9.9479136e-08, +8.8850986e-08,
+                +4.2227069e-05, +1.0083015e-02, +2.1108031e-08, +2.9311173e-04, +1.4712435e-01,              0,              0, +4.5619949e-04,
+                0,              0, +9.9999912e-08,              0,              0, +1.0001000e-01,              0, +1.7626956e-10,
+                +4.3565721e-10,              0,              0, +9.9479136e-08,              0,              0, +1.0009992e-02,              0,
+                +7.8105213e-09, +7.9221930e-06, +3.1559669e-07, +8.8850986e-08, +4.5619949e-04, +1.7626956e-10,              0, +2.7158585e-02,};
+
+    CnMat LittleP = cnMat_from_row_major(8, 8, _P);
+    CN_CREATE_STACK_MAT(BigP, 16, 16);
+    CnMat P = cnMatView(8, 8, &BigP, 8, 8);
+    cnCopy(&LittleP, &P, 0);
+    assertFLTEquals(_P[0], cnMatrixGet(&BigP, 8, 8));
+
+    CnMat iP = cnMatView(8, 8, &BigP, 0, 0);
+    cnInvert(&P, &iP, CN_INVERT_METHOD_SVD);
+    CN_CREATE_STACK_MAT(I, 8,8);
+    cn_set_diag_val(&I, 1);
+
+    CnMat should_be_I = cnMatView(8, 8, &BigP, 8, 0);
+    cnGEMM(&P, &iP, 1, 0, 0, &should_be_I, 0);
+
+    assert(checkMatFLTAEquals(&I, should_be_I.data));
+
+    print_mat(&I);
+    print_mat(&P);
+    print_mat(&iP);
+}
+static void test_svd(CnMat* m3x3, CnMat*w,CnMat*u,CnMat*v) {
+	printf("SVD:\n");
+
+	cnSVD(m3x3, w, u, v, CN_SVD_U_T);
+
+	PRINT_MAT(*w);
+	PRINT_MAT(*u);
+	PRINT_MAT(*v);
+
+	CN_CREATE_STACK_MAT(fS, w->rows, w->rows);
+	cn_set_diag(&fS, cn_as_const_vector(w));
+	CN_CREATE_STACK_MAT(us, w->rows, w->rows);
+	CN_CREATE_STACK_MAT(usvt, w->rows, w->rows);
+	cnGEMM(u, &fS, 1, 0, 0, &us, CN_GEMM_FLAG_A_T);
+	cnGEMM(&us, v, 1, 0, 0, &usvt, CN_GEMM_FLAG_B_T);
+
+    print_mat(&us);
+    print_mat(&usvt);
+    assert(checkMatFLTAEquals(&usvt, m3x3->data));
+	/*
+	 * 0.16489968788789366, 5.6995693010571894e-05, -1.2222549954843702e-05,
+  5.6995693010571894e-05, 0.035984545614046543, -5.7946162138306271e-05, -1.2222549954843702e-05,
+  -5.7946162138306271e-05, 0.0046282902518506412
+	 */
+}
+static void test_svd0() {
+	FLT _3x3[3 * 3] = {1, 2, 3, 4, 5, 6, 7, 8, 12};
+	CnMat m3x3 = cnMat_from_row_major(3, 3, _3x3);
+
+	CN_CREATE_STACK_MAT(w, 3, 1);
+	CN_CREATE_STACK_MAT(u, 3, 3);
+	CN_CREATE_STACK_MAT(v, 3, 3);
+	test_svd(&m3x3, &w, &u, &v);
+
+	FLT wgt[] = {18.626945, 0.840495, 0.574865};
+	assertFLTAEquals(cn_as_vector(&w), wgt, 3);
+}
+static void test_svd1() {
+	FLT _3x3[3 * 3] = {0.16489968788789366, 5.6995693010571894e-05, -1.2222549954843702e-05,
+					   5.6995693010571894e-05, 0.035984545614046543, -5.7946162138306271e-05, -1.2222549954843702e-05,
+					   -5.7946162138306271e-05, 0.0046282902518506412};
+    CnMat m3x3 = cnMat_from_row_major(3, 3, _3x3);
+
+	CN_CREATE_STACK_MAT(w, 3, 1);
+	CN_CREATE_STACK_MAT(u, 3, 3);
+	CN_CREATE_STACK_MAT(v, 3, 3);
+	test_svd(&m3x3, &w, &u, &v);
+
+
+	FLT wgt[] = {0.16489971402272774, 0.035984627479101181, 0.0046281822519620109};
+	assertFLTAEquals(cn_as_vector(&w), wgt, 3);
+}
+static void test_svd2() {
+    FLT _P[] = {+1.4192384e-03, +8.9898875e-06,              0, +9.7868476e-03, +4.2227069e-05,              0, +4.3565721e-10, +7.8105213e-09,
+                +8.9898875e-06, +1.4821856e-03, +1.4099495e-09, +4.2254865e-05, +1.0083015e-02,              0,              0, +7.9221930e-06,
+                0, +1.4099495e-09, +1.0021303e-07,              0, +2.1108031e-08, +9.9999912e-08,              0, +3.1559669e-07,
+                +9.7868476e-03, +4.2254865e-05,              0, +1.4500066e-01, +2.9311173e-04,              0, +9.9479136e-08, +8.8850986e-08,
+                +4.2227069e-05, +1.0083015e-02, +2.1108031e-08, +2.9311173e-04, +1.4712435e-01,              0,              0, +4.5619949e-04,
+                0,              0, +9.9999912e-08,              0,              0, +1.0001000e-01,              0, +1.7626956e-10,
+                +4.3565721e-10,              0,              0, +9.9479136e-08,              0,              0, +1.0009992e-02,              0,
+                +7.8105213e-09, +7.9221930e-06, +3.1559669e-07, +8.8850986e-08, +4.5619949e-04, +1.7626956e-10,              0, +2.7158585e-02};
+    CnMat P = cnMat_from_row_major(8, 8, _P);
+
+
+    CN_CREATE_STACK_MAT(w, 8, 1);
+    CN_CREATE_STACK_MAT(u, 8, 8);
+    CN_CREATE_STACK_MAT(v, 8, 8);
+    test_svd(&P, &w, &u, &v);
+
+    FLT wgt[] = {1.4786e-01,
+            1.4562e-01,
+            1.0001e-01,
+            2.7157e-02,
+            1.0010e-02,
+            7.8805e-04,
+            1.0021e-07};
+    assertFLTAEquals(cn_as_vector(&w), wgt, 8);
+}
+static void test_multrans() {
+	FLT _A[3] = {1, 2, 3};
+	CnMat A = cnMat(3, 1, _A);
+
+	FLT _B[9];
+	CnMat B = cnMat(3, 3, _B);
+
+	FLT _C[1];
+	CnMat C = cnMat(1, 1, _C);
+
+	cnMulTransposed(&A, &B, 0, 0, 1);
+	PRINT_MAT(B);
+	FLT ans[] = {1, 2, 3, 2, 4, 6, 3, 6, 9};
+	for (int i = 0; i < 9; i++) {
+		assert(ans[i] == _B[i]);
+	}
+
+	cnMulTransposed(&A, &C, 1, 0, 10);
+	PRINT_MAT(C);
+	assert(_C[0] == 140);
+}
+
+static void test_sym_sqrt() {
+    FLT _A[3 * 3] = {
+            4, 12, -16,
+            12, 37, -43,
+            -16, -43, 98
+    };
+    CnMat A = cnMat_from_row_major(3, 3, _A);
+
+    FLT _B[3 * 3] = {
+            2, 0, 0,
+            6, 1, 0,
+            -8, 5, 3
+    };
+    CnMat B = cnMat_from_row_major(3, 3, _B);
+
+    CN_CREATE_STACK_MAT(C, 3, 3);
+
+    cnSqRootSymmetric(&A, &C);
+    assertFLTAEquals(_B, _C, 3 * 3);
+}
+
+int main()
+{
+    test_svd2();
+	test_invert();
+    test_invert2();
+    test_invert3();
+	test_gemm();
+	test_solve();
+	test_svd0();
+	test_svd1();
+
+	test_multrans();
+    test_sym_sqrt();
+	return 0;
+}
+