diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..777fc05 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,166 @@ +# cmake compatibility issues +CMAKE_MINIMUM_REQUIRED(VERSION 2.8) + +# project name +PROJECT(FourierConvolutionCUDALib CXX) +if(NOT(${CMAKE_VERSION} VERSION_LESS "3.0.0")) +cmake_policy(SET CMP0042 NEW) +endif() + +if(${CMAKE_VERSION} VERSION_GREATER "3.1") +cmake_policy(SET CMP0054 NEW) +endif() + + +# version number +SET (FOURIERCONVOLUTIONCUDALIB_NAME "CUDA FOURIER CONVOLUTION LIBRARY") +SET (FOURIERCONVOLUTIONCUDALIB_CODENAME "${PROJECT_NAME}") +SET (FOURIERCONVOLUTIONCUDALIB_COPYRIGHT_YEARS "2013") +SET (FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR 2) +SET (FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR 5) +SET (FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH 0) +SET (FOURIERCONVOLUTIONCUDALIB_VERSION_TYPE SNAPSHOT) +SET (FOURIERCONVOLUTIONCUDALIB_VERSION_STRING "${FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH}-${CMAKE_BUILD_TYPE}") +SET (FOURIERCONVOLUTIONCUDALIB_VERSION "${FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH}") +SET (FOURIERCONVOLUTIONCUDALIB_VENDOR_ID "mpi cbg") +SET (FOURIERCONVOLUTIONCUDALIB_VENDOR_NAME "Max Planck Institute of Molecular Cell Biology and Genetics ") +SET (FOURIERCONVOLUTIONCUDALIB_VENDOR_URL "www.mpi-cbg.de") +SET (FOURIERCONVOLUTIONCUDALIB_ID "${FOURIERCONVOLUTIONCUDALIB_VENDOR_ID}.${PROJECT_NAME}") + +# trying to setup paths so this package can be picked up +set(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries") +set(INSTALL_INCLUDE_DIR include CACHE PATH "Installation directory for header files") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release +RelWithDebInfo MinSizeRel." + FORCE) +endif(NOT CMAKE_BUILD_TYPE) +MESSAGE(">> Setting up ${CMAKE_BUILD_TYPE} build") + +# shared path is architecture independent for now, TODO extend this to lib/bin/include +IF(UNIX) + IF(APPLE) + set(INSTALL_SHARE_DIR ${PROJECT_NAME}.app/Contents/Resources/ CACHE PATH "Installation directory for shared files") + #the following was tested with OSX 10.8.5 and Xcode 5.0.2 + #seems to me that under apple the rpath is not stripped automatically when doing the install + #under linux it is + SET(CMAKE_SKIP_RPATH ON) + ELSE(APPLE) + set(INSTALL_SHARE_DIR lib/CMake/${PROJECT_NAME} CACHE PATH "Installation directory for shared files") + ENDIF(APPLE) +ELSE(UNIX) + IF(WIN32 AND NOT CYGWIN) + set(INSTALL_SHARE_DIR CMake CACHE PATH "Installation directory for shared files") + ELSE() + MESSAGE(FATAL_ERROR ">> UNKNOWN ARCHITECTURE .. 
unable to set share dir") + ENDIF() +ENDIF(UNIX) + +# Make relative paths absolute (needed later on) +foreach(p LIB INCLUDE SHARE) + set(var INSTALL_${p}_DIR) + if(NOT IS_ABSOLUTE "${${var}}") + set(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}") + endif() +endforeach() + +######################################################################################################### +## CUDA related +# project options +OPTION(INCLUDE_CUDA "Set to OFF to not search for CUDA" ON) +# find project dependencies +# find cuda +IF(INCLUDE_CUDA) + FIND_PACKAGE(CUDA) + IF(CUDA_FOUND) + SET(CUDA_VERBOSE_BUILD ON) + + set(CUDA_NVCC_FLAGS -gencode arch=compute_10,code=sm_10;-gencode arch=compute_20,code=sm_20) + SET(CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") + IF(APPLE) + IF(${CUDA_HOST_COMPILER} MATCHES "/usr/bin/.*cc" OR EXISTS "/usr/bin/llvm-g++") + MESSAGE(">> adapting CUDA_HOST_COMPILER (${CUDA_HOST_COMPILER}) to match a CUDA supported compiler (/usr/bin/llvm-g++-4.2)") + SET(CUDA_HOST_COMPILER "/usr/bin/llvm-g++") + SET(CMAKE_CXX_COMPILER ${CUDA_HOST_COMPILER}) + SET(CMAKE_C_COMPILER "/usr/bin/llvm-gcc") + SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libstdc++") + + SET(CUDA_PROPAGATE_HOST_FLAGS OFF) + ELSE() + MESSAGE(WARNING ">> unknown CUDA_HOST_COMPILER (${CUDA_HOST_COMPILER}) or /usr/bin/llvm-g++-4.2 does not exist, cuda host compiler remains set to default") + ENDIF() + ENDIF(APPLE) + IF("${CUDA_VERSION}" VERSION_GREATER "5" OR "${CUDA_VERSION}" VERSION_EQUAL "5") + MESSAGE(">> compiling for Compute Capability 2.x, 3.0 and 3.5 only ") + set(CUDA_NVCC_FLAGS "-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35") + ELSE() + MESSAGE(">> CUDA less than version 5.0 detected, compiling for Compute Capability 2.x only ") + set(CUDA_NVCC_FLAGS "-gencode arch=compute_20,code=sm_20;-gencode arch=compute_10,code=sm_10") + ENDIF() + set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE};-O2;--use_fast_math) + set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};-g;-G) + ELSE(CUDA_FOUND) + MESSAGE(FATAL_ERROR ">> CUDA not found. Exiting ...") + ENDIF(CUDA_FOUND) + + IF(WIN32) + IF(${CUDA_VERSION} VERSION_GREATER "7" OR ${CUDA_VERSION} VERSION_EQUAL "7") + IF(${CMAKE_SIZEOF_VOID_P} EQUAL 4) + message(FATAL_ERROR ">> CUDA version 7 or higher does not support 32bit builds on Windows (found CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})") + ELSE() + message(">> [Windows] 64bit build detected, size of void pointer ${CMAKE_SIZEOF_VOID_P}") + ENDIF() + + ENDIF() + SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) + + ENDIF(WIN32) +ENDIF(INCLUDE_CUDA) + + +# add subdirectories +ADD_SUBDIRECTORY(src) + +FIND_PACKAGE (Boost 1.42 COMPONENTS system filesystem unit_test_framework thread QUIET) +IF(Boost_FOUND) +ADD_SUBDIRECTORY(tests) +enable_testing() +include("CTestLists.txt") +ELSE() +MESSAGE(">> Boost libraries not found, skipping building setting up tests") +ENDIF() + +# Export the package for use from the build-tree +# (this registers the build-tree with a global CMake-registry) +# v = binary v = library + +export(PACKAGE ${PROJECT_NAME}) + +# Create the fourierconvolutioncudalib-config.cmake and fourierconvolutioncudalib-config-version files +file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_SHARE_DIR}" + "${INSTALL_INCLUDE_DIR}") +# ... 
for the build tree
+set(CONF_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}" "${PROJECT_BINARY_DIR}")
+configure_file(fourierconvolutioncudalib-config.cmake.in
+ "${PROJECT_BINARY_DIR}/fourierconvolutioncudalib-config.cmake" @ONLY)
+# ... for the install tree
+set(CONF_INCLUDE_DIRS "\${FOURIERCONVOLUTIONCUDALIB}/${REL_INCLUDE_DIR}")
+configure_file(fourierconvolutioncudalib-config.cmake.in
+ "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/fourierconvolutioncudalib-config.cmake" @ONLY)
+# ... for both
+configure_file(fourierconvolutioncudalib-config-version.cmake.in
+ "${PROJECT_BINARY_DIR}/fourierconvolutioncudalib-config-version.cmake" @ONLY)
+
+# Install the fourierconvolutioncudalib-config.cmake and fourierconvolutioncudalib-config-version.cmake
+install(FILES
+ "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/fourierconvolutioncudalib-config.cmake"
+ "${PROJECT_BINARY_DIR}/fourierconvolutioncudalib-config-version.cmake"
+ DESTINATION "${INSTALL_SHARE_DIR}" COMPONENT dev)
+
+# Install the export set for use with the install-tree
+install(EXPORT fourierconvolutioncudalib-targets
+ DESTINATION "${INSTALL_SHARE_DIR}" COMPONENT dev)
+
+
diff --git a/CTestLists.txt b/CTestLists.txt
new file mode 100644
index 0000000..30dd760
--- /dev/null
+++ b/CTestLists.txt
@@ -0,0 +1,10 @@
+# include(BoostTestTargets)
+# add_boost_test(Independent
+# SOURCES
+# test_Independent.cpp
+# TESTS
+# Independent_suite)
+
+
+add_test(NAME gpu_convolve COMMAND test_gpu_convolve)
+
diff --git a/README.md b/README.md
index a0ae362..ff631a5 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,10 @@ FourierConvolutionCUDALib
 Implementation of 3d non-separable convolution using CUDA & FFT Convolution, originally implemented by Fernando Amat for our Nature Methods paper (http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2929.html).
-To compile it under Linux/Mac/Windows I suggest NSight. Clone this repository into your cuda-workspace directory. Then make a new shared library project with the same name as the directory.
+Windows Build Instructions
+--------------------------
+
+To compile it under Windows, NSight (available from the CUDA SDK) is suggested. Clone this repository into your cuda-workspace directory. Then make a new shared library project with the same name as the directory.
 Under Project > Properties > Build > Settings > Tool Settings > NVCC Linker add -lcufft and -lcuda to the command line pattern so that it looks like this:
@@ -11,3 +14,47 @@ ${COMMAND} ${FLAGS} -lcufft -lcuda ${OUTPUT_FLAG} ${OUTPUT_PREFIX} ${OUTPUT} ${I
 Now build the .so/.dll library and put it into the Fiji directory.
+The cmake build system is also supported under Windows 7 64bit now!
+
+```bash
+$ cd X:\path\to\repo
+$ mkdir build
+$ cd build
+$ cmake.exe -G "Visual Studio 12 2013 Win64" -DCMAKE_INSTALL_PREFIX=C:/msys64/home/steinbac/ftmp -DBOOST_LIBRARYDIR=C:/boost_1_58_0/msvc-12-x86_64/lib64-msvc-12.0 -DBOOST_ROOT=C:/boost_1_58_0/msvc-12-x86_64 ..
+$ cmake.exe --build . --target ALL_BUILD --config Release
+$ ctest.exe -C Release #(optional) the above builds in Release mode
+$ cmake.exe --build . --target install --config Release
+```
+
+OSX/Linux Build Instructions
+----------------------------
+
+First, make sure that CUDA is available through `PATH`, `LD_LIBRARY_PATH`, or an equivalent mechanism. The build system is based on cmake, so please install version 2.8 or higher.
+
+```bash
+$ cd /path/to/repo
+$ mkdir build
+$ cd build
+$ cmake -DCMAKE_INSTALL_PREFIX=/directory/of/your/choice ..
#default is /usr/lib/ or /usr/lib64 depending on your OS +$ make +$ ctest #(optional) in case you'd like to make sure the library does what it should on your system +$ make install +``` + + +Dependencies +------------ + +The unit tests that come with the library require boost (version 1.42 or higher) to be available to the cmake build system. If cmake is unable to detect them, try: + +``` +$ cmake -DCMAKE_INSTALL_PREFIX=/directory/of/your/choice -DBOOST_ROOT=/path/to/boost/root .. #default is /usr/lib/ or /usr/lib64 depending on your OS +``` + +Here, ```/path/to/boost/root``` should contain the boost libraries and the boost headers. + + +How to get Help +=============== + +In case something does not work, do not hesitate to open a ticket on our github page: https://github.com/StephanPreibisch/FourierConvolutionCUDALib diff --git a/fourierconvolutioncudalib-config-version.cmake.in b/fourierconvolutioncudalib-config-version.cmake.in new file mode 100644 index 0000000..8b80f8e --- /dev/null +++ b/fourierconvolutioncudalib-config-version.cmake.in @@ -0,0 +1,11 @@ +set(PACKAGE_VERSION "@FOURIERCONVOLUTIONCUDALIB_VERSION@") + +# Check whether the requested PACKAGE_FIND_VERSION is compatible +if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_COMPATIBLE FALSE) +else() + set(PACKAGE_VERSION_COMPATIBLE TRUE) + if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_EXACT TRUE) + endif() +endif() diff --git a/fourierconvolutioncudalib-config.cmake.in b/fourierconvolutioncudalib-config.cmake.in new file mode 100644 index 0000000..ae9abaf --- /dev/null +++ b/fourierconvolutioncudalib-config.cmake.in @@ -0,0 +1,33 @@ +# - Config file for the FooBar package +# It defines the following variables +# FOURIERCONVOLUTIONCUDALIB_INCLUDE_DIRS - include directories for fourierconvolutioncudalib +# FOURIERCONVOLUTIONCUDALIB_LIBRARIES - libraries to link against +# FOURIERCONVOLUTIONCUDALIB_EXECUTABLE - the bar executable + +# Compute paths +get_filename_component(FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + + +# Our library dependencies (contains definitions for IMPORTED targets) +include("${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/fourierconvolutioncudalib-targets.cmake") + +# These are IMPORTED targets created by fourierconvolutioncudalib-targets.cmake +IF(UNIX) + IF(APPLE) + get_filename_component(FOURIERCONVOLUTIONCUDALIB_INCLUDE_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../include" ABSOLUTE) + get_filename_component(FOURIERCONVOLUTIONCUDALIB_LIB_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../lib" ABSOLUTE) + get_filename_component(FOURIERCONVOLUTIONCUDALIB_BIN_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../bin" ABSOLUTE) + ELSE(APPLE) + get_filename_component(FOURIERCONVOLUTIONCUDALIB_INCLUDE_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../../include" ABSOLUTE) + get_filename_component(FOURIERCONVOLUTIONCUDALIB_LIB_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../../lib" ABSOLUTE) + get_filename_component(FOURIERCONVOLUTIONCUDALIB_BIN_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../../bin" ABSOLUTE) + ENDIF(APPLE) +ENDIF(UNIX) + +set(FOURIERCONVOLUTIONCUDALIB_LIBRARIES "${FOURIERCONVOLUTIONCUDALIB_LIB_DIRS}/libFourierConvolutionCUDALib.so") + +set(FOURIERCONVOLUTIONCUDALIB_VERSION "@FOURIERCONVOLUTIONCUDALIB_VERSION@") +set(FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR "@FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR@") +set(FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR "@FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR@") 
+set(FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH "@FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH@") +set(FOURIERCONVOLUTIONCUDALIB_BUILD_TYPE "@CMAKE_BUILD_TYPE@") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..f4a275d --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,39 @@ +# include directories +# src directory +INCLUDE_DIRECTORIES(.) + +# build and link +FIND_PACKAGE(CUDA QUIET) +CUDA_ADD_LIBRARY(${PROJECT_NAME} convolution3Dfft.cu standardCUDAfunctions.cu SHARED) +CUDA_ADD_LIBRARY(${PROJECT_NAME}_static convolution3Dfft.cu standardCUDAfunctions.cu) +CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME}) +CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME}_static) + +IF(WIN32) +IF(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") +include (GenerateExportHeader) +GENERATE_EXPORT_HEADER( ${PROJECT_NAME} + BASE_NAME ${PROJECT_NAME} + EXPORT_MACRO_NAME ${PROJECT_NAME}_EXPORT + EXPORT_FILE_NAME ${PROJECT_NAME}_Export.h + STATIC_DEFINE ${PROJECT_NAME}_BUILT_AS_STATIC +) +set_target_properties(${PROJECT_NAME}_static PROPERTIES COMPILE_FLAGS "-D${PROJECT_NAME}_BUILT_AS_STATIC") +INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR}/src) +ENDIF() +ENDIF() + +TARGET_LINK_LIBRARIES(${PROJECT_NAME}_static ${CUDA_CUDA_LIBRARY}) +TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY}) + +SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h;${PROJECT_NAME}_Export.h") +SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h;${PROJECT_NAME}_Export.h") + +INSTALL(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + EXPORT fourierconvolutioncudalib-targets + LIBRARY DESTINATION ${INSTALL_LIB_DIR} + ARCHIVE DESTINATION ${INSTALL_LIB_DIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDE_DIR} + ) + + diff --git a/src/FourierConvolutionCUDALib_Export.h b/src/FourierConvolutionCUDALib_Export.h new file mode 100644 index 0000000..f8f1e38 --- /dev/null +++ b/src/FourierConvolutionCUDALib_Export.h @@ -0,0 +1,41 @@ + +#ifndef FourierConvolutionCUDALib_EXPORT_H +#define FourierConvolutionCUDALib_EXPORT_H + +#ifdef FourierConvolutionCUDALib_BUILT_AS_STATIC +# define FourierConvolutionCUDALib_EXPORT +# define FOURIERCONVOLUTIONCUDALIB_NO_EXPORT +#else +# ifndef FourierConvolutionCUDALib_EXPORT +# ifdef FourierConvolutionCUDALib_EXPORTS + /* We are building this library */ +# define FourierConvolutionCUDALib_EXPORT __declspec(dllexport) +# else + /* We are using this library */ +# define FourierConvolutionCUDALib_EXPORT __declspec(dllimport) +# endif +# endif + +# ifndef FOURIERCONVOLUTIONCUDALIB_NO_EXPORT +# define FOURIERCONVOLUTIONCUDALIB_NO_EXPORT +# endif +#endif + +#ifndef FOURIERCONVOLUTIONCUDALIB_DEPRECATED +# define FOURIERCONVOLUTIONCUDALIB_DEPRECATED __declspec(deprecated) +#endif + +#ifndef FOURIERCONVOLUTIONCUDALIB_DEPRECATED_EXPORT +# define FOURIERCONVOLUTIONCUDALIB_DEPRECATED_EXPORT FourierConvolutionCUDALib_EXPORT FOURIERCONVOLUTIONCUDALIB_DEPRECATED +#endif + +#ifndef FOURIERCONVOLUTIONCUDALIB_DEPRECATED_NO_EXPORT +# define FOURIERCONVOLUTIONCUDALIB_DEPRECATED_NO_EXPORT FOURIERCONVOLUTIONCUDALIB_NO_EXPORT FOURIERCONVOLUTIONCUDALIB_DEPRECATED +#endif + +#define DEFINE_NO_DEPRECATED 0 +#if DEFINE_NO_DEPRECATED +# define FOURIERCONVOLUTIONCUDALIB_NO_DEPRECATED +#endif + +#endif diff --git a/src/convolution3Dfft.cu b/src/convolution3Dfft.cu index e6f022f..c759b84 100644 --- a/src/convolution3Dfft.cu +++ b/src/convolution3Dfft.cu @@ -1,541 +1,548 @@ -#include "convolution3Dfft.h" -#include "book.h" -#include 
"cuda.h" -#include "cufft.h" -#include -#include - - -__device__ static const float PI_2 = 6.28318530717958620f; -__device__ static const float PI_1 = 3.14159265358979310f; - -//////////////////////////////////////////////////////////////////////////////// -// Modulate Fourier image of padded data by Fourier image of padded kernel -// and normalize by FFT size -//////////////////////////////////////////////////////////////////////////////// -//Adapted from CUDA SDK examples -__device__ void mulAndScale(cufftComplex& a, const cufftComplex& b, const float& c) -{ - cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)}; - a = t; -}; - -__global__ void __launch_bounds__(MAX_THREADS_CUDA) modulateAndNormalize_kernel(cufftComplex *d_Dst, cufftComplex *d_Src, long long int dataSize,float c) -{ - long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x; - long long int offset = (long long int)blockDim.x * (long long int)gridDim.x; - while(i < dataSize) - { - - cufftComplex a = d_Src[i]; - cufftComplex b = d_Dst[i]; - - mulAndScale(a, b, c); - d_Dst[i] = a; - - i += offset; - } -}; - -//we use nearest neighbor interpolation to access FFT coefficients in the kernel -__global__ void __launch_bounds__(MAX_THREADS_CUDA) modulateAndNormalizeSubsampled_kernel(cufftComplex *d_Dst, cufftComplex *d_Src,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2,long long int datasize,float c) -{ - - float r_0 = ((float)kernelDim_0) / ((float)imDim_0); //ratio between image size and kernel size to calculate access - float r_1 = ((float)kernelDim_1) / ((float)imDim_1); - float r_2 = ((float)kernelDim_2) / ((float)imDim_2); - - long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x; - long long int offset = (long long int)blockDim.x * (long long int)gridDim.x; - int k_0,k_1,k_2; - int aux; - float auxExp, auxSin,auxCos; - while(i < datasize) - { - //for each dimension we need to access k_i*r_i i=0, 1, 2 - aux = 1 + imDim_2/2; - k_2 = i % aux; - aux = (i - k_2) / aux; - k_1 = aux % imDim_1; - k_0 = (aux - k_1) / imDim_1; - - cufftComplex b = d_Dst[i]; - - //apply shift in fourier domain since we did not apply fftshift to kernel (so we could use the trick of assuming the kernel is padded with zeros and then just subsample FFT) - /* This is how we would do it in Matlab (linear phase change) - auxExp = k_0 * r_0; - auxExp += k_1 * r_1; - auxExp += k_2 * r_2; - auxExp *= PI_1; - auxSin = sin(auxExp); - auxCos = cos(auxExp); - auxExp = b.x * auxCos - b.y * auxSin; - - b.y = b.x * auxSin + b.y * auxCos; - b.x = auxExp; - */ - - //add the ratio to each dimension and apply nearest neighbor interpolation - //k_2 = min((int)(r_2*(float)k_2 + 0.5f),kernelDim_2-1);//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access - //k_1 = min((int)(r_1*(float)k_1 + 0.5f),kernelDim_1-1); - //k_0 = min((int)(r_0*(float)k_0 + 0.5f),kernelDim_0-1); - k_2 = ((int)(r_2*(float)k_2 + 0.5f)) % kernelDim_2;//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access - k_1 = ((int)(r_1*(float)k_1 + 0.5f)) % kernelDim_1; - k_0 = ((int)(r_0*(float)k_0 + 0.5f)) % kernelDim_0; - //calculate new coordinate relative to kernel size - aux = 1 + kernelDim_2/2; - cufftComplex a = d_Src[k_2 + aux *(k_1 + kernelDim_1 * k_0)]; - - if( (k_0 + k_1 + k_2) % 2 == 1 )//after much debugging it seems the phase shift is 0 or 
Pi (nothing in between). In Matlab is a nice linear change as programmed above - { - a.x = -a.x; - a.y = -a.y; - } - mulAndScale(a, b, c); - - //__syncthreads();//this actually slows down the code by a lot (0.1 sec for 512x512x512) - d_Dst[i] = a; - - i += offset; - } -}; - -//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) -__global__ void __launch_bounds__(MAX_THREADS_CUDA) fftShiftKernel(imageType* kernelCUDA,imageType* kernelPaddedCUDA,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2) -{ - int kernelSize = kernelDim_0 * kernelDim_1 * kernelDim_2; - - int tid = blockDim.x * blockIdx.x + threadIdx.x; - - if(tid>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT - - //inverse FFT - cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; - - //copy result to host - HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); - - //release memory - ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; - ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; - HANDLE_ERROR( cudaFree( imCUDA)); - HANDLE_ERROR( cudaFree( kernelCUDA)); - - return convResult; -} - -//===================================================================== -//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) -//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor -imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA) -{ - imageType* convResult = NULL; - imageType* imCUDA = NULL; - imageType* kernelCUDA = NULL; - imageType* kernelPaddedCUDA = NULL; - - - cufftHandle fftPlanFwd, fftPlanInv; - - - HANDLE_ERROR( cudaSetDevice( devCUDA ) ); - - long long int imSize = 1; - long long int kernelSize = 1; - for(int ii=0;ii>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL; - - - //make sure GPU finishes before we launch two different streams - HANDLE_ERROR(cudaDeviceSynchronize()); - - //printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]); - cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; - cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility - cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL; - cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; - - //transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place) - //NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform. - //NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward. 
- cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL; - //transforming image - cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL; - - - //multiply image and kernel in fourier space (and normalize) - //NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements. - numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers - numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads)); - modulateAndNormalize_kernel<<>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT - - //inverse FFT - cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; - - //copy result to host - HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); - - //release memory - ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; - ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; - HANDLE_ERROR( cudaFree( imCUDA)); - HANDLE_ERROR( cudaFree( kernelCUDA)); - HANDLE_ERROR( cudaFree( kernelPaddedCUDA)); - - return convResult; -} - -//===================================================================== -//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) -//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor -void convolution3DfftCUDAInPlace(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA) -{ - imageType* imCUDA = NULL; - imageType* kernelCUDA = NULL; - imageType* kernelPaddedCUDA = NULL; - - - cufftHandle fftPlanFwd, fftPlanInv; - - - HANDLE_ERROR( cudaSetDevice( devCUDA ) ); - - long long int imSize = 1; - long long int kernelSize = 1; - for(int ii=0;ii>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL; - - - //make sure GPU finishes - HANDLE_ERROR(cudaDeviceSynchronize()); - - //printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]); - cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; - cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility - //cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;//we wait to conserve memory - //cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; - - //transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place) - //NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform. - //NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward. 
- cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL; - //transforming image - cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL; - - //int fftCUDAdims[dimsImage] ={imDim[0],imDim[1],1+imDim[2]/2}; - //writeOutCUDAfft("E:/temp/fftCUDAgood.bin",kernelPaddedCUDA,fftCUDAdims); - - //multiply image and kernel in fourier space (and normalize) - //NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements. - numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers - numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads)); - modulateAndNormalize_kernel<<>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL;//last parameter is the size of the FFT - - - //we destroy memory first so we can perform larger block convoutions - ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; - HANDLE_ERROR( cudaFree( kernelCUDA)); - HANDLE_ERROR( cudaFree( kernelPaddedCUDA)); - - //inverse FFT - cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL; - cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; - cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; - - - //copy result to host and overwrite image - HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); - - //release memory - ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; - //( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; - HANDLE_ERROR( cudaFree( imCUDA)); - //HANDLE_ERROR( cudaFree( kernelCUDA)); - //HANDLE_ERROR( cudaFree( kernelPaddedCUDA)); - -} - - -//===================================================================== -//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) -//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor -/* -void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA) -{ - imageType* imCUDA = NULL; - imageType* kernelCUDA = NULL; - //imageType* kernelFFTCUDA = NULL; //I need a duplicate in order to perform FFTshift (I can not do it in place) - - cufftHandle fftPlanFwd, fftPlanInv; - - - HANDLE_ERROR( cudaSetDevice( devCUDA ) ); - - long long int imSize = 1; - long long int kernelSize = 1; - for(int ii=0;ii>>(kernelCUDA,kernelFFTCUDA,kernelDim[0],kernelDim[1],kernelDim[2],kernelDim[0],kernelDim[1],kernelDim[2]);HANDLE_ERROR_KERNEL; - - - //make sure GPU finishes - HANDLE_ERROR(cudaDeviceSynchronize()); - - //printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]); - cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; - cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility - //transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! 
so imCUDAfft should also be out of place) - //NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform. - //NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward. - cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL; - - - //transforming kernel - cufftDestroy(fftPlanFwd); HANDLE_ERROR_KERNEL; - cufftPlan3d(&fftPlanFwd, kernelDim[0], kernelDim[1], kernelDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; - cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility - cufftExecR2C(fftPlanFwd, kernelCUDA, (cufftComplex *)kernelCUDA);HANDLE_ERROR_KERNEL; - - - //int fftCUDAdims[dimsImage] ={kernelDim[0],kernelDim[1],1+kernelDim[2]/2}; - //writeOutCUDAfft("E:/temp/fftCUDAbad.bin",kernelCUDA,fftCUDAdims); - - //multiply image and kernel in fourier space (and normalize) - //NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements. - int numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers - int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads)); - //we multiply two FFT of different sizes (but one was just padded with zeros) - modulateAndNormalizeSubsampled_kernel<<>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2], imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL; - - - //we destroy memory first so we can perform larger block convoutions - cufftDestroy(fftPlanFwd) ;HANDLE_ERROR_KERNEL; - HANDLE_ERROR( cudaFree( kernelCUDA)); - //HANDLE_ERROR( cudaFree( kernelFFTCUDA)); - - //inverse FFT - cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL; - cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; - cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; - - - //copy result to host and overwrite image - HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); - - //release memory - ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; - HANDLE_ERROR( cudaFree( imCUDA)); - -} -*/ +#include "convolution3Dfft.h" +#include "book.h" +#include "cuda.h" +#include "cufft.h" +#include +#include +#include + +//__device__ static const float PI_2 = 6.28318530717958620f; +//__device__ static const float PI_1 = 3.14159265358979310f; + +//////////////////////////////////////////////////////////////////////////////// +// Modulate Fourier image of padded data by Fourier image of padded kernel +// and normalize by FFT size +//////////////////////////////////////////////////////////////////////////////// +//Adapted from CUDA SDK examples +__device__ void mulAndScale(cufftComplex& a, const cufftComplex& b, const float& c) +{ + cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)}; + a = t; +}; + +__global__ void __launch_bounds__(MAX_THREADS_CUDA) modulateAndNormalize_kernel(cufftComplex *d_Dst, cufftComplex *d_Src, long long int dataSize,float 
c) +{ + long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x; + long long int offset = (long long int)blockDim.x * (long long int)gridDim.x; + while(i < dataSize) + { + + cufftComplex a = d_Src[i]; + cufftComplex b = d_Dst[i]; + + mulAndScale(a, b, c); + d_Dst[i] = a; + + i += offset; + } +}; + +//we use nearest neighbor interpolation to access FFT coefficients in the kernel +__global__ void __launch_bounds__(MAX_THREADS_CUDA) modulateAndNormalizeSubsampled_kernel(cufftComplex *d_Dst, cufftComplex *d_Src,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2,long long int datasize,float c) +{ + + float r_0 = ((float)kernelDim_0) / ((float)imDim_0); //ratio between image size and kernel size to calculate access + float r_1 = ((float)kernelDim_1) / ((float)imDim_1); + float r_2 = ((float)kernelDim_2) / ((float)imDim_2); + + long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x; + long long int offset = (long long int)blockDim.x * (long long int)gridDim.x; + int k_0,k_1,k_2; + int aux; + // float auxExp, auxSin,auxCos; + while(i < datasize) + { + //for each dimension we need to access k_i*r_i i=0, 1, 2 + aux = 1 + imDim_2/2; + k_2 = i % aux; + aux = (i - k_2) / aux; + k_1 = aux % imDim_1; + k_0 = (aux - k_1) / imDim_1; + + cufftComplex b = d_Dst[i]; + + //apply shift in fourier domain since we did not apply fftshift to kernel (so we could use the trick of assuming the kernel is padded with zeros and then just subsample FFT) + /* This is how we would do it in Matlab (linear phase change) + auxExp = k_0 * r_0; + auxExp += k_1 * r_1; + auxExp += k_2 * r_2; + auxExp *= PI_1; + auxSin = sin(auxExp); + auxCos = cos(auxExp); + auxExp = b.x * auxCos - b.y * auxSin; + + b.y = b.x * auxSin + b.y * auxCos; + b.x = auxExp; + */ + + //add the ratio to each dimension and apply nearest neighbor interpolation + //k_2 = min((int)(r_2*(float)k_2 + 0.5f),kernelDim_2-1);//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access + //k_1 = min((int)(r_1*(float)k_1 + 0.5f),kernelDim_1-1); + //k_0 = min((int)(r_0*(float)k_0 + 0.5f),kernelDim_0-1); + k_2 = ((int)(r_2*(float)k_2 + 0.5f)) % kernelDim_2;//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access + k_1 = ((int)(r_1*(float)k_1 + 0.5f)) % kernelDim_1; + k_0 = ((int)(r_0*(float)k_0 + 0.5f)) % kernelDim_0; + //calculate new coordinate relative to kernel size + aux = 1 + kernelDim_2/2; + cufftComplex a = d_Src[k_2 + aux *(k_1 + kernelDim_1 * k_0)]; + + if( (k_0 + k_1 + k_2) % 2 == 1 )//after much debugging it seems the phase shift is 0 or Pi (nothing in between). In Matlab is a nice linear change as programmed above + { + a.x = -a.x; + a.y = -a.y; + } + mulAndScale(a, b, c); + + //__syncthreads();//this actually slows down the code by a lot (0.1 sec for 512x512x512) + d_Dst[i] = a; + + i += offset; + } +}; + +//WARNING: for cuFFT the fastest running index is z direction!!! 
so pos = z + imDim[2] * (y + imDim[1] * x) +__global__ void __launch_bounds__(MAX_THREADS_CUDA) fftShiftKernel(imageType* kernelCUDA,imageType* kernelPaddedCUDA,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2) +{ + int kernelSize = kernelDim_0 * kernelDim_1 * kernelDim_2; + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if(tid>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT + + //inverse FFT + cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; + + //copy result to host + HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); + + //release memory + ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; + ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; + HANDLE_ERROR( cudaFree( imCUDA)); + HANDLE_ERROR( cudaFree( kernelCUDA)); + + return convResult; +} + +//===================================================================== +//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) +//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor + imageType* convolution3DfftCUDA(imageType* im, + int* imDim, + imageType* kernel, + int* kernelDim, + int devCUDA) +{ + imageType* convResult = NULL; + imageType* imCUDA = NULL; + imageType* kernelCUDA = NULL; + imageType* kernelPaddedCUDA = NULL; + + + cufftHandle fftPlanFwd, fftPlanInv; + + + HANDLE_ERROR( cudaSetDevice( devCUDA ) ); + + long long int imSize = 1; + long long int kernelSize = 1; + for(int ii=0;ii>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL; + + + //make sure GPU finishes before we launch two different streams + HANDLE_ERROR(cudaDeviceSynchronize()); + + //printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]); + cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; + cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility + cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL; + cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; + + //transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place) + //NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform. + //NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward. + cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL; + //transforming image + cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL; + + + //multiply image and kernel in fourier space (and normalize) + //NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements. 
+ numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers + numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads)); + modulateAndNormalize_kernel<<>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT + + //inverse FFT + cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; + + //copy result to host + HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); + + //release memory + ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; + ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; + HANDLE_ERROR( cudaFree( imCUDA)); + HANDLE_ERROR( cudaFree( kernelCUDA)); + HANDLE_ERROR( cudaFree( kernelPaddedCUDA)); + + return convResult; +} + +//===================================================================== +//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) +//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor + void convolution3DfftCUDAInPlace(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA) +{ + imageType* imCUDA = NULL; + imageType* kernelCUDA = NULL; + imageType* kernelPaddedCUDA = NULL; + + + cufftHandle fftPlanFwd, fftPlanInv; + + + HANDLE_ERROR( cudaSetDevice( devCUDA ) ); + + long long int imSize = 1; + long long int kernelSize = 1; + for(int ii=0;ii>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL; + + + //make sure GPU finishes + HANDLE_ERROR(cudaDeviceSynchronize()); + + //printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]); + cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; + cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility + //cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;//we wait to conserve memory + //cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; + + //transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place) + //NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform. + //NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward. 
+ cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL; + //transforming image + cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL; + + //int fftCUDAdims[dimsImage] ={imDim[0],imDim[1],1+imDim[2]/2}; + //writeOutCUDAfft("E:/temp/fftCUDAgood.bin",kernelPaddedCUDA,fftCUDAdims); + + //multiply image and kernel in fourier space (and normalize) + //NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements. + numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers + numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads)); + modulateAndNormalize_kernel<<>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL;//last parameter is the size of the FFT + + + //we destroy memory first so we can perform larger block convoutions + ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; + HANDLE_ERROR( cudaFree( kernelCUDA)); + HANDLE_ERROR( cudaFree( kernelPaddedCUDA)); + + //inverse FFT + cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL; + cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; + cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; + + + //copy result to host and overwrite image + HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); + + //release memory + ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; + //( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL; + HANDLE_ERROR( cudaFree( imCUDA)); + //HANDLE_ERROR( cudaFree( kernelCUDA)); + //HANDLE_ERROR( cudaFree( kernelPaddedCUDA)); + +} + + +//===================================================================== +//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) +//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor +/* +void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA) +{ + imageType* imCUDA = NULL; + imageType* kernelCUDA = NULL; + //imageType* kernelFFTCUDA = NULL; //I need a duplicate in order to perform FFTshift (I can not do it in place) + + cufftHandle fftPlanFwd, fftPlanInv; + + + HANDLE_ERROR( cudaSetDevice( devCUDA ) ); + + long long int imSize = 1; + long long int kernelSize = 1; + for(int ii=0;ii>>(kernelCUDA,kernelFFTCUDA,kernelDim[0],kernelDim[1],kernelDim[2],kernelDim[0],kernelDim[1],kernelDim[2]);HANDLE_ERROR_KERNEL; + + + //make sure GPU finishes + HANDLE_ERROR(cudaDeviceSynchronize()); + + //printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]); + cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; + cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility + //transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! 
so imCUDAfft should also be out of place) + //NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform. + //NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward. + cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL; + + + //transforming kernel + cufftDestroy(fftPlanFwd); HANDLE_ERROR_KERNEL; + cufftPlan3d(&fftPlanFwd, kernelDim[0], kernelDim[1], kernelDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL; + cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility + cufftExecR2C(fftPlanFwd, kernelCUDA, (cufftComplex *)kernelCUDA);HANDLE_ERROR_KERNEL; + + + //int fftCUDAdims[dimsImage] ={kernelDim[0],kernelDim[1],1+kernelDim[2]/2}; + //writeOutCUDAfft("E:/temp/fftCUDAbad.bin",kernelCUDA,fftCUDAdims); + + //multiply image and kernel in fourier space (and normalize) + //NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements. + int numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers + int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads)); + //we multiply two FFT of different sizes (but one was just padded with zeros) + modulateAndNormalizeSubsampled_kernel<<>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2], imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL; + + + //we destroy memory first so we can perform larger block convoutions + cufftDestroy(fftPlanFwd) ;HANDLE_ERROR_KERNEL; + HANDLE_ERROR( cudaFree( kernelCUDA)); + //HANDLE_ERROR( cudaFree( kernelFFTCUDA)); + + //inverse FFT + cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL; + cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; + cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL; + + + //copy result to host and overwrite image + HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost)); + + //release memory + ( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL; + HANDLE_ERROR( cudaFree( imCUDA)); + +} +*/ diff --git a/src/convolution3Dfft.h b/src/convolution3Dfft.h index 95c161a..fb21067 100644 --- a/src/convolution3Dfft.h +++ b/src/convolution3Dfft.h @@ -1,14 +1,18 @@ #ifndef __CONVOLUTION_3D_FFT_H__ #define __CONVOLUTION_3D_FFT_H__ -#include "standardCUDAfunctions.h" - #ifdef _WIN32 -#define FUNCTION_PREFIX extern "C" __declspec(dllexport) +//auto-generated macro file for MSVC libraries on Win7 + #include "FourierConvolutionCUDALib_Export.h" + #define FUNCTION_PREFIX extern "C" FourierConvolutionCUDALib_EXPORT + #else -#define FUNCTION_PREFIX extern "C" + + #define FUNCTION_PREFIX extern "C" + #endif + //define constants typedef float imageType;//the kind sof images we are working with (you will need to recompile to work with other types) static const int MAX_THREADS_CUDA = 1024; //adjust it for your GPU. 
This is correct for a 2.0 architecture @@ -22,7 +26,10 @@ static const int dimsImage = 3;//so thing can be set at co0mpile time WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x) NOTE: we assume kernel size is the same as image (it has been padded appropriately) */ -imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int devCUDA); +FUNCTION_PREFIX imageType* convolution3DfftCUDA_test(imageType* im, + int* imDim, + imageType* kernel, + int devCUDA); /* @@ -31,7 +38,11 @@ WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDi TODO: pad data with imSize+kernelSize-1 (kernelSize/2 on each side) to impose the boundary conditions that you want: mirror, zero, etc...). Look at conv2DFFT in the CUDA SDK examples */ -FUNCTION_PREFIX imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA); +FUNCTION_PREFIX imageType* convolution3DfftCUDA(imageType* im, + int* imDim, + imageType* kernel, + int* kernelDim, + int devCUDA); /* @@ -52,4 +63,11 @@ WARNING: If imSize > 5*kernelSize the maximum relative error in the convolution */ //FUNCTION_PREFIX void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA); +FUNCTION_PREFIX int selectDeviceWithHighestComputeCapability(); +FUNCTION_PREFIX int getCUDAcomputeCapabilityMinorVersion(int devCUDA); +FUNCTION_PREFIX int getCUDAcomputeCapabilityMajorVersion(int devCUDA); +FUNCTION_PREFIX int getNumDevicesCUDA(); +FUNCTION_PREFIX void getNameDeviceCUDA(int devCUDA, char *name); +FUNCTION_PREFIX long long int getMemDeviceCUDA(int devCUDA); + #endif //__CONVOLUTION_3D_FFT_H__ diff --git a/src/standardCUDAfunctions.cu b/src/standardCUDAfunctions.cu index 23c8c92..307b449 100644 --- a/src/standardCUDAfunctions.cu +++ b/src/standardCUDAfunctions.cu @@ -6,17 +6,41 @@ */ #include "book.h" #include "cuda.h" -#include "standardCUDAfunctions.h" + +#include "convolution3Dfft.h" //============================================== -int getCUDAcomputeCapabilityMajorVersion(int devCUDA) + int selectDeviceWithHighestComputeCapability() { + + int numDevices = 0; + HANDLE_ERROR(cudaGetDeviceCount(&numDevices)); + int computeCapability = 0; + int meta = 0; + int value = -1; + int major = 0; + int minor = 0; + + for (short devIdx = 0; devIdx < numDevices; ++devIdx) { + cuDeviceComputeCapability(&major, &minor, devIdx); + meta = 10 * major + minor; + if (meta > computeCapability) { + computeCapability = meta; + value = devIdx; + } + } + + return value; +} + + int getCUDAcomputeCapabilityMajorVersion(int devCUDA) { int major = 0, minor = 0; cuDeviceComputeCapability(&major, &minor,devCUDA); return major; } -int getCUDAcomputeCapabilityMinorVersion(int devCUDA) + + int getCUDAcomputeCapabilityMinorVersion(int devCUDA) { int major = 0, minor = 0; cuDeviceComputeCapability(&major, &minor,devCUDA); @@ -24,20 +48,22 @@ int getCUDAcomputeCapabilityMinorVersion(int devCUDA) return minor; } -int getNumDevicesCUDA() + int getNumDevicesCUDA() { int count = 0; HANDLE_ERROR(cudaGetDeviceCount ( &count )); return count; } -void getNameDeviceCUDA(int devCUDA, char* name) + + void getNameDeviceCUDA(int devCUDA, char* name) { cudaDeviceProp prop; HANDLE_ERROR( cudaGetDeviceProperties(&prop, devCUDA)); memcpy(name,prop.name,sizeof(char)*256); } -long long int getMemDeviceCUDA(int devCUDA) + + long long int getMemDeviceCUDA(int devCUDA) { cudaDeviceProp prop; HANDLE_ERROR( cudaGetDeviceProperties(&prop, 
devCUDA)); diff --git a/src/standardCUDAfunctions.h b/src/standardCUDAfunctions.h deleted file mode 100644 index 376fe46..0000000 --- a/src/standardCUDAfunctions.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * standardCUDAfunctions.h - * - * Created on: Jul 24, 2014 - * Author: preibisch - */ - -#ifndef STANDARDCUDAFUNCTIONS_H_ -#define STANDARDCUDAFUNCTIONS_H_ - -#ifdef _WIN32 -#define FUNCTION_PREFIX extern "C" __declspec(dllexport) -#else -#define FUNCTION_PREFIX extern "C" -#endif -//----------------------------------functions to decide whhich GPU to use------------------------------- - -FUNCTION_PREFIX int getCUDAcomputeCapabilityMinorVersion(int devCUDA); -FUNCTION_PREFIX int getCUDAcomputeCapabilityMajorVersion(int devCUDA); -FUNCTION_PREFIX int getNumDevicesCUDA(); -FUNCTION_PREFIX void getNameDeviceCUDA(int devCUDA, char *name); -FUNCTION_PREFIX long long int getMemDeviceCUDA(int devCUDA); - -#endif /* STANDARDCUDAFUNCTIONS_H_ */ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..108a901 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,44 @@ +INCLUDE_DIRECTORIES(.) + +SET(Boost_USE_MULTITHREADED ON) +SET(Boost_USE_STATIC_LIBS ON) +FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem unit_test_framework REQUIRED) +IF(Boost_FOUND) +INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) +LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) +ENDIF() + +FIND_PACKAGE(CUDA) + +IF(CUDA_FOUND) +INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src) +LINK_DIRECTORIES(${PROJECT_BINARY_DIR}/src) + +CUDA_ADD_EXECUTABLE(test_gpu_convolve test_gpu_convolve.cpp image_stack_utils.cpp) + +IF(Boost_FOUND) +target_link_libraries(test_gpu_convolve ${Boost_LIBRARIES} ${PROJECT_NAME}_static) +set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-D${PROJECT_NAME}_BUILT_AS_STATIC") +MESSAGE(">> static Boost UTF: ${Boost_LIBRARIES}") +ELSE(Boost_FOUND) + +SET(Boost_USE_STATIC_LIBS OFF) +SET(Boost_USE_MULTITHREADED ON) +FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem unit_test_framework REQUIRED) +IF(Boost_FOUND) +MESSAGE(">> dynamic Boost UTF ${Boost_LIBRARIES}") +INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) +LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) + +target_link_libraries(test_gpu_convolve ${Boost_LIBRARIES} ${PROJECT_NAME}) +set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK") +ELSE() +MESSAGE(">> dynamic Boost UTF NOT FOUND") +ENDIF() +ENDIF(Boost_FOUND) + +ELSE(CUDA_FOUND) +MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!") +ENDIF(CUDA_FOUND) + + diff --git a/tests/image_stack_utils.cpp b/tests/image_stack_utils.cpp new file mode 100644 index 0000000..55a66ce --- /dev/null +++ b/tests/image_stack_utils.cpp @@ -0,0 +1,68 @@ +#define _IMAGE_STACK_UTILS_CPP_ +#include +#include "image_stack_utils.h" + +namespace multiviewnative { + +/** + \brief function to print an image_stack + + \param[in] _cout input ostream + \param[in] _marray input multi-dim array, _marray is expected to be defined + in c_storage_order + that means, if one has a stack of + width = 2 (number of columns, x dimension), + height = 3 (number of rows, y dimension), + depth = 4 (number of frames, z dimension) + then this function expects that it can access the array as + _marray[z][y][x] + similarly, the dimensions located at _marray.shape() are expected to be + _marray.shape()[0] = depth + _marray.shape()[1] = height + _marray.shape()[2] = height + \return + \retval + +*/ +std::ostream& operator<<(std::ostream& _cout, const image_stack& _marray) { + + if 
(image_stack::dimensionality != 3) { + _cout << "dim!=3\n"; + return _cout; + } + + if (_marray.empty()) { + _cout << "size=0\n"; + return _cout; + } + + int precision = _cout.precision(); + _cout << std::setprecision(4); + const size_t* shapes = _marray.shape(); + + _cout << std::setw(9) << "x = "; + for (size_t x_index = 0; x_index < (shapes[2]); ++x_index) { + _cout << std::setw(8) << x_index << " "; + } + _cout << "\n"; + _cout << std::setfill('-') << std::setw((shapes[2] + 1) * 9) << " " + << std::setfill(' ') << std::endl; + + for (size_t z_index = 0; z_index < (shapes[0]); ++z_index) { + _cout << "z[" << std::setw(5) << z_index << "] \n"; + for (size_t y_index = 0; y_index < (shapes[1]); ++y_index) { + _cout << "y[" << std::setw(5) << y_index << "] "; + + for (size_t x_index = 0; x_index < (shapes[2]); ++x_index) { + _cout << std::setw(8) << _marray[z_index][y_index][x_index] << " "; + } + + _cout << "\n"; + } + _cout << "\n"; + } + + _cout << std::setprecision(precision); + return _cout; +} +} diff --git a/tests/image_stack_utils.h b/tests/image_stack_utils.h new file mode 100644 index 0000000..2a10964 --- /dev/null +++ b/tests/image_stack_utils.h @@ -0,0 +1,77 @@ +#ifndef _IMAGE_STACK_UTILS_H_ +#define _IMAGE_STACK_UTILS_H_ +#include +#include +#include "boost/multi_array.hpp" + +namespace multiviewnative { + + typedef boost::multi_array image_stack; + typedef boost::multi_array_ref image_stack_ref; + typedef boost::const_multi_array_ref image_stack_cref; + typedef boost::multi_array image_frame; + typedef boost::multi_array_ref image_frame_ref; + typedef boost::const_multi_array_ref image_frame_cref; + typedef image_stack::array_view<3>::type image_stack_view; + typedef image_stack::array_view<2>::type image_stack_frame; + typedef image_stack::array_view<1>::type image_stack_line; + typedef boost::multi_array_types::index_range range; + typedef boost::general_storage_order<3> storage; + + std::ostream& operator<<(std::ostream&, const image_stack&); + + template + void adapt_extents_for_fftw_inplace(const DimT& _extent, ODimT& _value, + const storage& _storage = + boost::c_storage_order()) { + + std::fill(_value.begin(), _value.end(), 0); + + std::vector storage_order(_extent.size()); + for (size_t i = 0; i < _extent.size(); ++i) + storage_order[i] = _storage.ordering(i); + + size_t lowest_storage_index = + std::min_element(storage_order.begin(), storage_order.end()) - + storage_order.begin(); + + for (size_t i = 0; i < _extent.size(); ++i) + _value[i] = + (lowest_storage_index == i) ? 2 * (_extent[i] / 2 + 1) : _extent[i]; + } + + template + void adapt_shape_for_fftw_inplace(const DimT& _extent, + ODimT& _value, + const StorageT& _storage) { + + std::fill(_value.begin(), _value.end(), 0); + + size_t lowest_storage_index = + std::min_element(_storage.begin(), _storage.end()) - _storage.begin(); + + for (size_t i = 0; i < _extent.size(); ++i) + _value[i] = + (lowest_storage_index == i) ? 
2 * (_extent[i] / 2 + 1) : _extent[i]; + } + + + +template +void adapt_extents_for_cufft_inplace(const DimT& _extent, ODimT& _value, + const storage& _storage = + boost::c_storage_order()) { + + adapt_extents_for_fftw_inplace(_extent, _value, _storage); +} + +template +void adapt_shape_for_cufft_inplace(const DimT& _extent, + ODimT& _value, + const StorageT& _storage) { + + adapt_shape_for_fftw_inplace(_extent, _value, _storage); +} + +} +#endif /* _IMAGE_STACK_UTILS_H_ */ diff --git a/tests/padd_utils.h b/tests/padd_utils.h new file mode 100644 index 0000000..a2f6000 --- /dev/null +++ b/tests/padd_utils.h @@ -0,0 +1,169 @@ +#ifndef _PADD_UTILS_H_ +#define _PADD_UTILS_H_ +#include +#include +//#include +#include +#include "boost/multi_array.hpp" + +namespace multiviewnative { + + +template +struct add_minus_1 { + + template + outT operator()(const any_type& _first, const any_type& _second) { + return _first + _second - 1; + } +}; + +template +struct minus_1_div_2 { + + outT operator()(const inT& _first) { return (_first - 1) / 2; } +}; + +template +struct no_padd { + + typedef typename ImageStackT::value_type value_type; + typedef typename ImageStackT::size_type size_type; + typedef typename ImageStackT::template array_view< + ImageStackT::dimensionality>::type image_stack_view; + typedef boost::multi_array_types::index_range range; + + std::vector extents_; + std::vector offsets_; + + no_padd() + : extents_(ImageStackT::dimensionality, 0), + offsets_(ImageStackT::dimensionality, 0) {} + + template + no_padd(T* _image_shape, U* _kernel_shape) + : extents_(_image_shape, _image_shape + ImageStackT::dimensionality), + offsets_(ImageStackT::dimensionality, 0) { + // static_assert( + // std::numeric_limits::is_integer, + // "[no_padd] didn't receive integer type as image shape descriptor"); + // static_assert( + // std::numeric_limits::is_integer, + // "[no_padd] didn't receive integer type as kernel shape descriptor"); + } + + template + void insert_at_offsets(const ImageStackRefT& _source, OtherStackT& _target) { + _target = _source; + } + + + + const size_type* offsets() const { return &offsets_[0]; } + + const size_type* extents() const { return &extents_[0]; } +}; + +template +struct zero_padd { + + typedef typename ImageStackT::value_type value_type; + typedef typename ImageStackT::size_type size_type; + typedef typename ImageStackT::template array_view< + ImageStackT::dimensionality>::type image_stack_view; + typedef boost::multi_array_types::index_range range; + + std::vector extents_; + std::vector offsets_; + + zero_padd() + : extents_(ImageStackT::dimensionality, 0), + offsets_(ImageStackT::dimensionality, 0) {} + + zero_padd(const zero_padd& _other) + : extents_(_other.extents_), offsets_(_other.offsets_) {} + + template + zero_padd(T* _image, U* _kernel) + : extents_(ImageStackT::dimensionality, 0), + offsets_(ImageStackT::dimensionality, 0) { + // static_assert( + // std::numeric_limits::is_integer == true, + // "[zero_padd] didn't receive integer type as image shape descriptor"); + // static_assert( + // std::numeric_limits::is_integer, + // "[zero_padd] didn't receive integer type as kernel shape descriptor"); + + std::transform(_image, _image + ImageStackT::dimensionality, _kernel, + extents_.begin(), add_minus_1()); + + std::transform(_kernel, _kernel + ImageStackT::dimensionality, + offsets_.begin(), minus_1_div_2()); + } + + zero_padd& operator=(const zero_padd& _other) { + if (this != &_other) { + extents_ = _other.extents_; + offsets_ = _other.offsets_; + } + return *this; 
+ } + + /** + \brief function that inserts _source in _target given limits by vector + offsets, + _target is expected to have dimensions + (source.shape()[0]+2*offsets_[0])x(source.shape()[1]+2*offsets_[1])x... + _target is hence expected to have dimensions extents_[0]xextents_[1]x... + + as an example: + _source = + 1 1 + 1 1 + given a kernel of 3x3x3 + would expecte a _target of + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + and the result would look like + 0 0 0 0 + 0 1 1 0 + 0 1 1 0 + 0 0 0 0 + + \param[in] _source image stack to embed into target + \param[in] _target image stack of size extents_[0]xextents_[1]x... + + \return + \retval + + */ + template + void insert_at_offsets(const ImageStackRefT& _source, OtherStackT& _target) { + + if (std::lexicographical_compare( + _target.shape(), _target.shape() + OtherStackT::dimensionality, + extents_.begin(), extents_.end())) + throw std::runtime_error( + "multiviewnative::zero_padd::insert_at_offsets]\ttarget image stack " + "is smaller or equal in size than source\n"); + + image_stack_view subview_padded_image = _target + [boost::indices[range(offsets_[0], offsets_[0] + _source.shape()[0])] + [range(offsets_[1], offsets_[1] + _source.shape()[1])] + [range(offsets_[2], offsets_[2] + _source.shape()[2])]]; + subview_padded_image = _source; + } + + + + const size_type* offsets() const { return &offsets_[0]; } + + const size_type* extents() const { return &extents_[0]; } + + virtual ~zero_padd() {}; +}; +} +#endif /* _PADD_UTILS_H_ */ diff --git a/tests/test_algorithms.hpp b/tests/test_algorithms.hpp new file mode 100644 index 0000000..0c79143 --- /dev/null +++ b/tests/test_algorithms.hpp @@ -0,0 +1,165 @@ +#ifndef _TEST_ALGORITHMS_H_ +#define _TEST_ALGORITHMS_H_ + +#include +#include "image_stack_utils.h" + +namespace multiviewnative { + +template +void convolve(const ImageStackT& _image, const ImageStackT& _kernel, + ImageStackT& _result, const std::vector& _offset) { + + if (!_image.num_elements()) return; + + std::vector half_kernel(3); + for (unsigned i = 0; i < 3; ++i) half_kernel[i] = _kernel.shape()[i] / 2; + + float image_value = 0; + float kernel_value = 0; + float value = 0; + + for (int image_z = _offset[0]; image_z < int(_image.shape()[0] - _offset[0]); + ++image_z) { + for (int image_y = _offset[1]; + image_y < int(_image.shape()[1] - _offset[1]); ++image_y) { + for (int image_x = _offset[2]; + image_x < int(_image.shape()[2] - _offset[2]); ++image_x) { + + _result[image_z][image_y][image_x] = 0.f; + + image_value = 0; + kernel_value = 0; + value = 0; + + // TODO: adapt loops to storage order for the sake of speed (if so, + // prefetch line to use SIMD) + for (int kernel_z = 0; kernel_z < int(_kernel.shape()[0]); ++kernel_z) { + for (int kernel_y = 0; kernel_y < int(_kernel.shape()[1]); + ++kernel_y) { + for (int kernel_x = 0; kernel_x < int(_kernel.shape()[2]); + ++kernel_x) { + + kernel_value = _kernel[_kernel.shape()[0] - 1 - kernel_z] + [_kernel.shape()[1] - 1 - kernel_y] + [_kernel.shape()[2] - 1 - kernel_x]; + image_value = _image[image_z - half_kernel[0] + kernel_z] + [image_y - half_kernel[1] + kernel_y] + [image_x - half_kernel[2] + kernel_x]; + + value += kernel_value * image_value; + } + } + } + _result[image_z][image_y][image_x] = value; + } + } + } +} + +template +typename ImageStackT::element sum_from_offset( + const ImageStackT& _image, const std::vector& _offset) { + + typename ImageStackT::element value = 0.f; + + // TODO: adapt loops to storage order for the sake of speed (if so, prefetch + // line to use SIMD) + 
for (int image_z = _offset[0]; image_z < int(_image.shape()[0] - _offset[0]); + ++image_z) { + for (int image_y = _offset[1]; + image_y < int(_image.shape()[1] - _offset[1]); ++image_y) { + for (int image_x = _offset[2]; + image_x < int(_image.shape()[2] - _offset[2]); ++image_x) { + + value += _image[image_z][image_y][image_x]; + } + } + } + + return value; +} + +#ifdef _OPENMP +#include "omp.h" +#endif + +template +ValueT l2norm(ValueT* _first, ValueT* _second, const DimT& _size) { + + ValueT l2norm = 0.; + +#ifdef _OPENMP + int nthreads = omp_get_num_procs(); + #pragma omp parallel for num_threads(nthreads) shared(l2norm) +#endif + for (unsigned p = 0; p < _size; ++p) + l2norm += (_first[p] - _second[p]) * (_first[p] - _second[p]); + + return l2norm; +} + +template +float l2norm_within_limits(const ImageT& _first, const OtherT& _second, + const float& _rel_lower_limit_per_axis, + const float& _rel_upper_limit_per_axis) { + + if (_rel_upper_limit_per_axis < _rel_lower_limit_per_axis) { + std::cerr << "[l2norm_within_limits]\treceived weird interval [" + << _rel_lower_limit_per_axis << "," << _rel_upper_limit_per_axis + << "]\n"; + return 0; + } + + float l2norm = 0.; + +#pragma omp parallel for num_threads(omp_get_num_procs()) shared(l2norm) + for (int image_z = _first.shape()[0] * _rel_lower_limit_per_axis; + image_z < int(_first.shape()[0] * _rel_upper_limit_per_axis); + ++image_z) { + for (int image_y = _first.shape()[1] * _rel_lower_limit_per_axis; + image_y < int(_first.shape()[1] * _rel_upper_limit_per_axis); + ++image_y) { + for (int image_x = _first.shape()[2] * _rel_lower_limit_per_axis; + image_x < int(_first.shape()[2] * _rel_upper_limit_per_axis); + ++image_x) { + + float intermediate = _first[image_z][image_y][image_x] - + _second[image_z][image_y][image_x]; + l2norm += (intermediate) * (intermediate); + } + } + } + + return l2norm; +} + +template +ValueT l1norm(ValueT* _first, ValueT* _second, const DimT& _size) { + + ValueT l1norm = 0.; + +#pragma omp parallel for num_threads(omp_get_num_procs()) shared(l1norm) + for (unsigned p = 0; p < _size; ++p) + l1norm += std::fabs(_first[p] - _second[p]); + + return l1norm; +} + +template +ValueT max_value(ValueT* _data, const DimT& _size) { + + ValueT max = *std::max_element(_data, _data + _size); + + return max; +} + +template +ValueT min_value(ValueT* _data, const DimT& _size) { + + ValueT min = *std::min_element(_data, _data + _size); + + return min; +} +} +#endif /* _TEST_ALGORITHMS_H_ */ diff --git a/tests/test_fixtures.hpp b/tests/test_fixtures.hpp new file mode 100644 index 0000000..10e3956 --- /dev/null +++ b/tests/test_fixtures.hpp @@ -0,0 +1,304 @@ +#ifndef _TEST_FIXTURES_H_ +#define _TEST_FIXTURES_H_ +#include +#include +#include +#include +#include +#include +#include + +#include +#include "boost/multi_array.hpp" + +// http://www.boost.org/doc/libs/1_55_0/libs/multi_array/doc/user.html +// http://stackoverflow.com/questions/2168082/how-to-rewrite-array-from-row-order-to-column-order +#include "image_stack_utils.h" +#include "test_algorithms.hpp" + +namespace multiviewnative { + +template +struct convolutionFixture3D { + + static const int n_dim = 3; + static const unsigned halfKernel = KernelDimSize / 2u; + static const unsigned imageDimSize = ImageDimSize; + static const unsigned kernelDimSize = KernelDimSize; + + const unsigned image_size_; + std::vector image_dims_; + std::vector padded_image_dims_; + std::vector asymm_padded_image_dims_; + image_stack image_; + image_stack one_; + image_stack padded_image_; + 
image_stack padded_one_; + image_stack asymm_padded_image_; + image_stack asymm_padded_one_; + image_stack image_folded_by_horizontal_; + image_stack image_folded_by_vertical_; + image_stack image_folded_by_depth_; + image_stack image_folded_by_all1_; + image_stack one_folded_by_asymm_cross_kernel_; + image_stack one_folded_by_asymm_one_kernel_; + image_stack one_folded_by_asymm_identity_kernel_; + + const unsigned kernel_size_; + std::vector kernel_dims_; + std::vector asymm_kernel_dims_; + image_stack trivial_kernel_; + image_stack identity_kernel_; + image_stack vertical_kernel_; + image_stack horizont_kernel_; + image_stack depth_kernel_; + image_stack all1_kernel_; + image_stack asymm_cross_kernel_; + image_stack asymm_one_kernel_; + image_stack asymm_identity_kernel_; + + std::vector symm_offsets_; + std::vector symm_ranges_; + std::vector asymm_offsets_; + std::vector asymm_ranges_; + + BOOST_STATIC_ASSERT(KernelDimSize % 2 != 0); + + public: + convolutionFixture3D() + : image_size_((unsigned)std::pow(ImageDimSize, n_dim)), + image_dims_(n_dim, ImageDimSize), + padded_image_dims_(n_dim, ImageDimSize + 2 * (KernelDimSize / 2)), + asymm_padded_image_dims_(n_dim), + image_(boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + one_(boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + padded_image_(boost::extents[ImageDimSize + 2 * (KernelDimSize / 2)] + [ImageDimSize + 2 * (KernelDimSize / 2)] + [ImageDimSize + 2 * (KernelDimSize / 2)]), + padded_one_(boost::extents[ImageDimSize + 2 * (KernelDimSize / 2)] + [ImageDimSize + 2 * (KernelDimSize / 2)] + [ImageDimSize + 2 * (KernelDimSize / 2)]), + asymm_padded_image_(), + asymm_padded_one_(), + image_folded_by_horizontal_( + boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + image_folded_by_vertical_( + boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + image_folded_by_depth_( + boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + image_folded_by_all1_( + boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + one_folded_by_asymm_cross_kernel_( + boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + one_folded_by_asymm_one_kernel_( + boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + one_folded_by_asymm_identity_kernel_( + boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]), + kernel_size_((unsigned)std::pow(KernelDimSize, n_dim)), + kernel_dims_(n_dim, KernelDimSize), + asymm_kernel_dims_(n_dim, KernelDimSize), + trivial_kernel_( + boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]), + identity_kernel_( + boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]), + vertical_kernel_( + boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]), + horizont_kernel_( + boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]), + depth_kernel_( + boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]), + all1_kernel_( + boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]), + asymm_cross_kernel_(boost::extents[KernelDimSize + 1][KernelDimSize] + [KernelDimSize - 1]), + asymm_one_kernel_(boost::extents[KernelDimSize + 1][KernelDimSize] + [KernelDimSize - 1]), + asymm_identity_kernel_(boost::extents[KernelDimSize + 1][KernelDimSize] + [KernelDimSize - 1]), + symm_offsets_(n_dim, halfKernel), + symm_ranges_(n_dim, range(halfKernel, halfKernel + ImageDimSize)), + asymm_offsets_(n_dim, 0), + asymm_ranges_(n_dim) { + + // FILL KERNELS + + std::fill(trivial_kernel_.data(), trivial_kernel_.data() + kernel_size_, + 0.f); + 
std::fill(identity_kernel_.data(), identity_kernel_.data() + kernel_size_, + 0.f); + std::fill(vertical_kernel_.data(), vertical_kernel_.data() + kernel_size_, + 0.f); + std::fill(depth_kernel_.data(), depth_kernel_.data() + kernel_size_, 0.f); + std::fill(horizont_kernel_.data(), horizont_kernel_.data() + kernel_size_, + 0.f); + std::fill(all1_kernel_.data(), all1_kernel_.data() + kernel_size_, 1.f); + std::fill(asymm_cross_kernel_.data(), + asymm_cross_kernel_.data() + asymm_cross_kernel_.num_elements(), + 0.f); + std::fill(asymm_one_kernel_.data(), + asymm_one_kernel_.data() + asymm_one_kernel_.num_elements(), 0.f); + std::fill( + asymm_identity_kernel_.data(), + asymm_identity_kernel_.data() + asymm_identity_kernel_.num_elements(), + 0.f); + + identity_kernel_.data()[kernel_size_ / 2] = 1.; + + for (unsigned int index = 0; index < KernelDimSize; ++index) { + horizont_kernel_[halfKernel][halfKernel][index] = float(index + 1); + vertical_kernel_[halfKernel][index][halfKernel] = float(index + 1); + depth_kernel_[index][halfKernel][halfKernel] = float(index + 1); + } + + asymm_identity_kernel_[asymm_cross_kernel_.shape()[0] / 2] + [asymm_cross_kernel_.shape()[1] / 2] + [asymm_cross_kernel_.shape()[2] / 2] = 1.f; + + for (int z_index = 0; z_index < int(asymm_cross_kernel_.shape()[0]); + ++z_index) { + for (int y_index = 0; y_index < int(asymm_cross_kernel_.shape()[1]); + ++y_index) { + for (int x_index = 0; x_index < int(asymm_cross_kernel_.shape()[2]); + ++x_index) { + + if (z_index == (int)asymm_cross_kernel_.shape()[0] / 2 && + y_index == (int)asymm_cross_kernel_.shape()[1] / 2) { + asymm_cross_kernel_[z_index][y_index][x_index] = x_index + 1; + asymm_one_kernel_[z_index][y_index][x_index] = 1; + } + + if (x_index == (int)asymm_cross_kernel_.shape()[2] / 2 && + y_index == (int)asymm_cross_kernel_.shape()[1] / 2) { + asymm_cross_kernel_[z_index][y_index][x_index] = z_index + 101; + asymm_one_kernel_[z_index][y_index][x_index] = 1; + } + + if (x_index == (int)asymm_cross_kernel_.shape()[2] / 2 && + z_index == (int)asymm_cross_kernel_.shape()[0] / 2) { + asymm_cross_kernel_[z_index][y_index][x_index] = y_index + 11; + asymm_one_kernel_[z_index][y_index][x_index] = 1; + } + } + } + } + + // FILL IMAGES + unsigned padded_image_axis = ImageDimSize + 2 * halfKernel; + unsigned padded_image_size = std::pow(padded_image_axis, n_dim); + std::fill(image_.data(), image_.data() + image_size_, 0.f); + std::fill(one_.data(), one_.data() + image_size_, 0.f); + std::fill(padded_image_.data(), padded_image_.data() + padded_image_size, + 0.f); + std::fill(padded_one_.data(), padded_one_.data() + padded_image_size, 0.f); + padded_one_[padded_image_axis / 2][padded_image_axis / 2] + [padded_image_axis / 2] = 1.f; + one_[ImageDimSize / 2][ImageDimSize / 2][ImageDimSize / 2] = 1.f; + + unsigned image_index = 0; + for (int z_index = 0; z_index < int(image_dims_[0]); ++z_index) { + for (int y_index = 0; y_index < int(image_dims_[1]); ++y_index) { + for (int x_index = 0; x_index < int(image_dims_[2]); ++x_index) { + image_[z_index][y_index][x_index] = float(image_index++); + } + } + } + + // PADD THE IMAGE FOR CONVOLUTION + range axis_subrange = range(halfKernel, halfKernel + ImageDimSize); + image_stack_view padded_image_original = padded_image_ + [boost::indices[axis_subrange][axis_subrange][axis_subrange]]; + padded_image_original = image_; + + image_stack padded_image_folded_by_horizontal = padded_image_; + image_stack padded_image_folded_by_vertical = padded_image_; + image_stack padded_image_folded_by_depth 
= padded_image_; + image_stack padded_image_folded_by_all1 = padded_image_; + + // PREPARE ASYMM IMAGES + // std::vector asymm_padded_image_shape(n_dim); + + std::copy(asymm_cross_kernel_.shape(), + asymm_cross_kernel_.shape() + image_stack::dimensionality, + asymm_kernel_dims_.begin()); + + + for (int i = 0; i < n_dim; ++i) { + asymm_offsets_[i] = asymm_cross_kernel_.shape()[i]/2; + asymm_ranges_[i] = + range(asymm_offsets_[i], asymm_offsets_[i] + ImageDimSize); + asymm_padded_image_dims_[i] = ImageDimSize + 2 * asymm_offsets_[i]; + } + + asymm_padded_image_.resize(asymm_padded_image_dims_); + asymm_padded_one_.resize(asymm_padded_image_dims_); + std::fill(asymm_padded_image_.data(), + asymm_padded_image_.data() + asymm_padded_image_.num_elements(), + 0.f); + std::fill(asymm_padded_one_.data(), + asymm_padded_one_.data() + asymm_padded_one_.num_elements(), 0.f); + asymm_padded_one_[asymm_padded_one_.shape()[0] / 2] + [asymm_padded_one_.shape()[1] / 2] + [asymm_padded_one_.shape()[2] / 2] = 1.f; + + image_stack_view asymm_padded_image_original = asymm_padded_image_ + [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]][asymm_ranges_[2]]]; + asymm_padded_image_original = image_; + + image_stack asymm_padded_one_folded_by_asymm_cross_kernel = + asymm_padded_one_; + image_stack asymm_padded_one_folded_by_asymm_one_kernel = asymm_padded_one_; + image_stack asymm_padded_one_folded_by_asymm_identity_kernel = + asymm_padded_one_; + + // CONVOLVE + convolve(padded_image_, horizont_kernel_, padded_image_folded_by_horizontal, + symm_offsets_); + convolve(padded_image_, vertical_kernel_, padded_image_folded_by_vertical, + symm_offsets_); + convolve(padded_image_, depth_kernel_, padded_image_folded_by_depth, + symm_offsets_); + convolve(padded_image_, all1_kernel_, padded_image_folded_by_all1, + symm_offsets_); + + convolve(asymm_padded_one_, asymm_cross_kernel_, + asymm_padded_one_folded_by_asymm_cross_kernel, asymm_offsets_); + convolve(asymm_padded_one_, asymm_one_kernel_, + asymm_padded_one_folded_by_asymm_one_kernel, asymm_offsets_); + convolve(asymm_padded_one_, asymm_identity_kernel_, + asymm_padded_one_folded_by_asymm_identity_kernel, asymm_offsets_); + + // EXTRACT NON-PADDED CONTENT FROM CONVOLVED IMAGE STACKS + image_folded_by_horizontal_ = padded_image_folded_by_horizontal + [boost::indices[axis_subrange][axis_subrange][axis_subrange]]; + image_folded_by_vertical_ = padded_image_folded_by_vertical + [boost::indices[axis_subrange][axis_subrange][axis_subrange]]; + image_folded_by_depth_ = padded_image_folded_by_depth + [boost::indices[axis_subrange][axis_subrange][axis_subrange]]; + image_folded_by_all1_ = padded_image_folded_by_all1 + [boost::indices[axis_subrange][axis_subrange][axis_subrange]]; + + one_folded_by_asymm_cross_kernel_ = + asymm_padded_one_folded_by_asymm_cross_kernel + [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]] + [asymm_ranges_[2]]]; + one_folded_by_asymm_one_kernel_ = + asymm_padded_one_folded_by_asymm_one_kernel + [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]] + [asymm_ranges_[2]]]; + one_folded_by_asymm_identity_kernel_ = + asymm_padded_one_folded_by_asymm_identity_kernel + [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]] + [asymm_ranges_[2]]]; + } + + virtual ~convolutionFixture3D() {}; + + static const unsigned image_axis_size = ImageDimSize; + static const unsigned kernel_axis_size = KernelDimSize; +}; + +typedef convolutionFixture3D<> default_3D_fixture; + +} // namespace + +#endif diff --git a/tests/test_gpu_convolve.cpp b/tests/test_gpu_convolve.cpp 
new file mode 100644 index 0000000..26fd485 --- /dev/null +++ b/tests/test_gpu_convolve.cpp @@ -0,0 +1,195 @@ +#define BOOST_TEST_MODULE GPU_CONVOLUTION +#include "boost/test/unit_test.hpp" +#include "test_fixtures.hpp" +#include "padd_utils.h" +#include + +#include "convolution3Dfft.h" + + + +BOOST_FIXTURE_TEST_SUITE(legacy_convolution, + multiviewnative::default_3D_fixture) + +BOOST_AUTO_TEST_CASE(trivial_convolve) { + + float* image = image_.data(); + std::vector kernel(kernel_size_,0); + + convolution3DfftCUDAInPlace(image, &image_dims_[0], + &kernel[0], &kernel_dims_[0], + selectDeviceWithHighestComputeCapability()); + + float sum = std::accumulate(image, image + image_size_, 0.f); + BOOST_CHECK_CLOSE(sum, 0.f, .00001); + +} + + +BOOST_AUTO_TEST_CASE(identity_convolve) { + + using namespace multiviewnative; + + float sum_expected = std::accumulate( + image_.data(), image_.data() + image_.num_elements(), 0.f); + + zero_padd padder(image_.shape(), identity_kernel_.shape()); + image_stack padded_image(padder.extents_, image_.storage_order()); + padder.insert_at_offsets(image_, padded_image); + + std::vector extents_as_int(padder.extents_.size()); + std::copy(padder.extents_.begin(), padder.extents_.end(), + extents_as_int.begin()); + + convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0], + identity_kernel_.data(), &kernel_dims_[0], + selectDeviceWithHighestComputeCapability()); + + float sum = std::accumulate(image_.data(), + image_.data() + image_.num_elements(), 0.f); + BOOST_CHECK_CLOSE(sum, sum_expected, .00001); +} + +BOOST_AUTO_TEST_CASE(horizontal_convolve) { + using namespace multiviewnative; + + float sum_expected = + std::accumulate(image_folded_by_horizontal_.data(), + image_folded_by_horizontal_.data() + + image_folded_by_horizontal_.num_elements(), + 0.f); + + zero_padd padder(image_.shape(), horizont_kernel_.shape()); + image_stack padded_image(padder.extents_, image_.storage_order()); + + padder.insert_at_offsets(image_, padded_image); + + std::vector extents_as_int(padder.extents_.size()); + std::copy(padder.extents_.begin(), padder.extents_.end(), + extents_as_int.begin()); + + convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0], + horizont_kernel_.data(), &kernel_dims_[0], + selectDeviceWithHighestComputeCapability()); + + image_ = padded_image + [boost::indices + [range(padder.offsets()[0], padder.offsets()[0] + image_dims_[0])] + [range(padder.offsets()[1], padder.offsets()[1] + image_dims_[1])] + [range(padder.offsets()[2], padder.offsets()[2] + image_dims_[2])]]; + + float sum = std::accumulate(image_.data(), + image_.data() + image_.num_elements(), 0.f); + + BOOST_REQUIRE_CLOSE(sum, sum_expected, .00001); +} + +BOOST_AUTO_TEST_CASE(vertical_convolve) { + + multiviewnative::zero_padd padder( + image_.shape(), vertical_kernel_.shape()); + multiviewnative::image_stack padded_image(padder.extents_, + image_.storage_order()); + + padder.insert_at_offsets(image_, padded_image); + + std::vector extents_as_int(padder.extents_.size()); + std::copy(padder.extents_.begin(), padder.extents_.end(), + extents_as_int.begin()); + + convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0], + vertical_kernel_.data(), &kernel_dims_[0], + selectDeviceWithHighestComputeCapability()); + + float sum_expected = + std::accumulate(image_folded_by_vertical_.data(), + image_folded_by_vertical_.data() + + image_folded_by_vertical_.num_elements(), + 0.f); + + image_ = padded_image + [boost::indices + [multiviewnative::range(padder.offsets()[0], + 
padder.offsets()[0] + image_dims_[0])] + [multiviewnative::range(padder.offsets()[1], + padder.offsets()[1] + image_dims_[1])] + [multiviewnative::range(padder.offsets()[2], + padder.offsets()[2] + image_dims_[2])]]; + + float sum = std::accumulate(image_.data(), + image_.data() + image_.num_elements(), 0.f); + BOOST_CHECK_CLOSE(sum, sum_expected, .00001); +} + +BOOST_AUTO_TEST_CASE(depth_convolve) { + + multiviewnative::zero_padd padder( + image_.shape(), depth_kernel_.shape()); + multiviewnative::image_stack padded_image(padder.extents_, + image_.storage_order()); + + padder.insert_at_offsets(image_, padded_image); + + std::vector extents_as_int(padder.extents_.size()); + std::copy(padder.extents_.begin(), padder.extents_.end(), + extents_as_int.begin()); + + convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0], + depth_kernel_.data(), &kernel_dims_[0], + selectDeviceWithHighestComputeCapability()); + + float sum_expected = std::accumulate( + image_folded_by_depth_.data(), + image_folded_by_depth_.data() + image_folded_by_depth_.num_elements(), + 0.f); + + image_ = padded_image + [boost::indices + [multiviewnative::range(padder.offsets()[0], + padder.offsets()[0] + image_dims_[0])] + [multiviewnative::range(padder.offsets()[1], + padder.offsets()[1] + image_dims_[1])] + [multiviewnative::range(padder.offsets()[2], + padder.offsets()[2] + image_dims_[2])]]; + + float sum = std::accumulate(image_.data(), + image_.data() + image_.num_elements(), 0.f); + BOOST_CHECK_CLOSE(sum, sum_expected, .00001); +} + +BOOST_AUTO_TEST_CASE(all1_convolve) { + + multiviewnative::zero_padd padder( + image_.shape(), all1_kernel_.shape()); + multiviewnative::image_stack padded_image(padder.extents_, + image_.storage_order()); + + padder.insert_at_offsets(image_, padded_image); + + std::vector extents_as_int(padder.extents_.size()); + std::copy(padder.extents_.begin(), padder.extents_.end(), + extents_as_int.begin()); + + convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0], + all1_kernel_.data(), &kernel_dims_[0], + selectDeviceWithHighestComputeCapability()); + + float sum_expected = std::accumulate( + image_folded_by_all1_.data(), + image_folded_by_all1_.data() + image_folded_by_all1_.num_elements(), 0.f); + + image_ = padded_image + [boost::indices + [multiviewnative::range(padder.offsets()[0], + padder.offsets()[0] + image_dims_[0])] + [multiviewnative::range(padder.offsets()[1], + padder.offsets()[1] + image_dims_[1])] + [multiviewnative::range(padder.offsets()[2], + padder.offsets()[2] + image_dims_[2])]]; + + float sum = std::accumulate(image_.data(), + image_.data() + image_.num_elements(), 0.f); + BOOST_CHECK_CLOSE(sum, sum_expected, .00001); +} + +BOOST_AUTO_TEST_SUITE_END()
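
For reference, the device-query helpers that are now exported through convolution3Dfft.h can be exercised from a small standalone host program. The sketch below uses only the signatures defined above in src/standardCUDAfunctions.cu; the 256-byte name buffer mirrors the memcpy in getNameDeviceCUDA, and the program itself (file layout, output format) is illustrative rather than part of the library.

#include <cstdio>

#include "convolution3Dfft.h"

int main() {
  const int num_devices = getNumDevicesCUDA();
  std::printf("found %d CUDA device(s)\n", num_devices);

  for (int dev = 0; dev < num_devices; ++dev) {
    char name[256] = {0};  // getNameDeviceCUDA copies 256 bytes of cudaDeviceProp::name
    getNameDeviceCUDA(dev, name);
    std::printf("[%d] %s  CC %d.%d  %lld bytes global memory\n", dev, name,
                getCUDAcomputeCapabilityMajorVersion(dev),
                getCUDAcomputeCapabilityMinorVersion(dev),
                getMemDeviceCUDA(dev));
  }

  // the tests run on the device with the highest compute capability (-1 if none found)
  std::printf("selected device: %d\n", selectDeviceWithHighestComputeCapability());
  return 0;
}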
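
The new tests all drive convolution3DfftCUDAInPlace the same way: zero-pad the image to imageExtent + kernelExtent - 1 per axis (padd_utils.h), convolve in place on the device returned by selectDeviceWithHighestComputeCapability(), then read back the central region. Below is a reduced sketch of that call sequence using plain std::vector buffers instead of boost::multi_array; it assumes imageType is float (as the Boost fixtures suggest) and the z-fastest layout documented in convolution3Dfft.h, pos = z + imDim[2] * (y + imDim[1] * x).

#include <vector>

#include "convolution3Dfft.h"

// z is the fastest running index (cuFFT layout): pos = z + imDim[2] * (y + imDim[1] * x)
static int flat_index(int x, int y, int z, const int* imDim) {
  return z + imDim[2] * (y + imDim[1] * x);
}

int main() {
  // a 14^3 payload zero-padded by kernel-1 to 16^3, as zero_padd does for a 3^3 kernel
  int imDim[3] = {16, 16, 16};
  int kernelDim[3] = {3, 3, 3};

  std::vector<float> image(16 * 16 * 16, 0.f);
  std::vector<float> kernel(3 * 3 * 3, 0.f);
  kernel[kernel.size() / 2] = 1.f;           // identity kernel: output should equal input
  image[flat_index(8, 8, 8, imDim)] = 42.f;  // single non-zero voxel near the centre

  // convolve in place, as tests/test_gpu_convolve.cpp does
  convolution3DfftCUDAInPlace(&image[0], imDim, &kernel[0], kernelDim,
                              selectDeviceWithHighestComputeCapability());

  // image[flat_index(8, 8, 8, imDim)] should still be close to 42.f afterwards
  return 0;
}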