From 18e2d8ee536a7dbadf827f227cd88841f4a4dcf9 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbach@scionics.de>
Date: Fri, 9 Jan 2015 11:26:01 +0100
Subject: [PATCH 01/24] cmake files installed, lib builds on Linux

---
 CMakeLists.txt                                | 136 ++++++++++++++++++
 ...convolutioncudalib-config-version.cmake.in |  11 ++
 fourierconvolutioncudalib-config.cmake.in     |  33 +++++
 src/CMakeLists.txt                            |  16 +++
 4 files changed, 196 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 fourierconvolutioncudalib-config-version.cmake.in
 create mode 100644 fourierconvolutioncudalib-config.cmake.in
 create mode 100644 src/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..d9431e3
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,136 @@
+# cmake compatibility issues
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+
+# project name
+PROJECT(FourierConvolutionCUDALib CXX)
+ 
+# version number
+SET (FOURIERCONVOLUTIONCUDALIB_NAME "CUDA FOURIER CONVOLUTION LIBRARY")
+SET (FOURIERCONVOLUTIONCUDALIB_CODENAME "${PROJECT_NAME}")
+SET (FOURIERCONVOLUTIONCUDALIB_COPYRIGHT_YEARS "2013")
+SET (FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR 2)
+SET (FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR 5)
+SET (FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH 0)
+SET (FOURIERCONVOLUTIONCUDALIB_VERSION_TYPE SNAPSHOT)
+SET (FOURIERCONVOLUTIONCUDALIB_VERSION_STRING "${FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH}-${CMAKE_BUILD_TYPE}")
+SET (FOURIERCONVOLUTIONCUDALIB_VERSION "${FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR}.${FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH}")
+SET (FOURIERCONVOLUTIONCUDALIB_VENDOR_ID "mpi cbg")
+SET (FOURIERCONVOLUTIONCUDALIB_VENDOR_NAME "Max Planck Institute of Molecular Cell Biology and Genetics ")
+SET (FOURIERCONVOLUTIONCUDALIB_VENDOR_URL "www.mpi-cbg.de")
+SET (FOURIERCONVOLUTIONCUDALIB_ID "${FOURIERCONVOLUTIONCUDALIB_VENDOR_ID}.${PROJECT_NAME}")
+
+# trying to setup paths so this package can be picked up
+set(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries")
+set(INSTALL_INCLUDE_DIR include CACHE PATH "Installation directory for header files")
+
+# shared path is architecture independent for now, TODO extend this to lib/bin/include
+IF(UNIX)
+  IF(APPLE)
+    set(INSTALL_SHARE_DIR ${PROJECT_NAME}.app/Contents/Resources/ CACHE PATH "Installation directory for shared files")
+    #the following was tested with OSX 10.8.5 and Xcode 5.0.2
+    #seems to me that under apple the rpath is not stripped automatically when doing the install
+    #under linux it is
+    SET(CMAKE_SKIP_RPATH ON)
+  ELSE(APPLE)
+    set(INSTALL_SHARE_DIR lib/CMake/${PROJECT_NAME} CACHE PATH "Installation directory for shared files")
+  ENDIF(APPLE)
+ELSE(UNIX)
+  IF(WIN32 AND NOT CYGWIN)
+    set(INSTALL_SHARE_DIR CMake CACHE PATH "Installation directory for shared files")
+  ELSE(WIN32)
+    MESSAGE(FATAL_ERROR ">> UNKNOWN ARCHITECTURE .. unable to set share dir")
+  ENDIF(WIN32)
+ENDIF(UNIX)
+ 
+# Make relative paths absolute (needed later on)
+foreach(p LIB INCLUDE SHARE)
+  set(var INSTALL_${p}_DIR)
+  if(NOT IS_ABSOLUTE "${${var}}")
+    set(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+  endif()
+endforeach()
+
+# set project build type
+IF(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
+   SET(CMAKE_BUILD_TYPE Release)
+ENDIF(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
+MESSAGE(">> Setting up ${CMAKE_BUILD_TYPE} build")
+
+#########################################################################################################
+## CUDA related
+# project options
+OPTION(INCLUDE_CUDA "Set to OFF to not search for CUDA" ON)
+# find project dependencies
+# find cuda
+IF(INCLUDE_CUDA)
+    FIND_PACKAGE(CUDA)
+    IF(CUDA_FOUND)
+	SET(CUDA_VERBOSE_BUILD ON)
+	
+	set(CUDA_NVCC_FLAGS -gencode arch=compute_10,code=sm_10;-gencode arch=compute_20,code=sm_20)
+	SET(CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
+	IF(APPLE)
+	  IF(${CUDA_HOST_COMPILER} MATCHES "/usr/bin/.*cc" OR EXISTS "/usr/bin/llvm-g++")
+	    MESSAGE(">> adapting CUDA_HOST_COMPILER (${CUDA_HOST_COMPILER}) to match a CUDA supported compiler (/usr/bin/llvm-g++-4.2)")
+	    SET(CUDA_HOST_COMPILER "/usr/bin/llvm-g++")
+	    SET(CMAKE_CXX_COMPILER ${CUDA_HOST_COMPILER})
+	    SET(CMAKE_C_COMPILER "/usr/bin/llvm-gcc")
+	    #gcc-4.2 is quite old and usually does not support 
+	    STRING(REPLACE "${HOST_COMPILER_RELEASE_FLAGS}" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+	    STRING(REPLACE "${HOST_COMPILER_RELEASE_FLAGS}" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+	    SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
+	  ELSE()
+	    MESSAGE(WARNING ">> unknown CUDA_HOST_COMPILER (${CUDA_HOST_COMPILER}) or /usr/bin/llvm-g++-4.2 does not exist, cuda host compiler remains set to default")
+	  ENDIF()
+	ENDIF(APPLE)
+	IF("${CUDA_VERSION}" VERSION_GREATER "5" OR "${CUDA_VERSION}" VERSION_EQUAL "5")
+	  MESSAGE(">> compiling for Compute Capability 2.x, 3.0 and 3.5 only ")
+          set(CUDA_NVCC_FLAGS "-gencode arch=compute_20,code=sm_20;-gencode arch=compute_30,code=sm_30;-gencode arch=compute_35,code=sm_35")
+	ELSE()
+	  MESSAGE(">> CUDA less than version 5.0 detected, compiling for Compute Capability 2.x only ")
+          set(CUDA_NVCC_FLAGS "-gencode arch=compute_20,code=sm_20;-gencode arch=compute_10,code=sm_10")
+	ENDIF()
+	set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE};-O2;--use_fast_math)
+	set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG};-g;-G)
+    ELSE(CUDA_FOUND)
+        MESSAGE(FATAL_ERROR ">> CUDA not found. Exiting ...")
+    ENDIF(CUDA_FOUND)
+ENDIF(INCLUDE_CUDA)
+ 
+
+# add subdirectories
+ADD_SUBDIRECTORY(src)
+
+# Export the package for use from the build-tree
+# (this registers the build-tree with a global CMake-registry)
+#              v = binary    v = library
+
+export(PACKAGE ${PROJECT_NAME})
+
+# Create the fourierconvolutioncudalib-config.cmake and fourierconvolutioncudalib-config-version files
+file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_SHARE_DIR}"
+   "${INSTALL_INCLUDE_DIR}")
+# ... for the build tree
+set(CONF_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}" "${PROJECT_BINARY_DIR}")
+configure_file(fourierconvolutioncudalib-config.cmake.in
+  "${PROJECT_BINARY_DIR}/fourierconvolutioncudalib-config.cmake" @ONLY)
+# ... for the install tree
+set(CONF_INCLUDE_DIRS "\${FOURIERCONVOLUTIONCUDALIB}/${REL_INCLUDE_DIR}")
+configure_file(fourierconvolutioncudalib-config.cmake.in
+  "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/fourierconvolutioncudalib-config.cmake" @ONLY)
+# ... for both
+configure_file(fourierconvolutioncudalib-config-version.cmake.in
+  "${PROJECT_BINARY_DIR}/fourierconvolutioncudalib-config-version.cmake" @ONLY)
+
+# Install the fourierconvolutioncudalib-config.cmake and fourierconvolutioncudalib-config-version.cmake
+install(FILES
+  "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/fourierconvolutioncudalib-config.cmake"
+  "${PROJECT_BINARY_DIR}/fourierconvolutioncudalib-config-version.cmake"
+  DESTINATION "${INSTALL_SHARE_DIR}" COMPONENT dev)
+
+# Install the export set for use with the install-tree 
+install(EXPORT fourierconvolutioncudalib-targets 
+  DESTINATION "${INSTALL_SHARE_DIR}" COMPONENT dev)
+
+
+
diff --git a/fourierconvolutioncudalib-config-version.cmake.in b/fourierconvolutioncudalib-config-version.cmake.in
new file mode 100644
index 0000000..8b80f8e
--- /dev/null
+++ b/fourierconvolutioncudalib-config-version.cmake.in
@@ -0,0 +1,11 @@
+set(PACKAGE_VERSION "@FOURIERCONVOLUTIONCUDALIB_VERSION@")
+
+# Check whether the requested PACKAGE_FIND_VERSION is compatible
+if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}")
+  set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}")
+    set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
diff --git a/fourierconvolutioncudalib-config.cmake.in b/fourierconvolutioncudalib-config.cmake.in
new file mode 100644
index 0000000..ae9abaf
--- /dev/null
+++ b/fourierconvolutioncudalib-config.cmake.in
@@ -0,0 +1,33 @@
+# - Config file for the FooBar package
+# It defines the following variables
+#  FOURIERCONVOLUTIONCUDALIB_INCLUDE_DIRS - include directories for fourierconvolutioncudalib
+#  FOURIERCONVOLUTIONCUDALIB_LIBRARIES    - libraries to link against
+#  FOURIERCONVOLUTIONCUDALIB_EXECUTABLE   - the bar executable
+
+# Compute paths
+get_filename_component(FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+
+
+# Our library dependencies (contains definitions for IMPORTED targets)
+include("${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/fourierconvolutioncudalib-targets.cmake")
+
+# These are IMPORTED targets created by fourierconvolutioncudalib-targets.cmake
+IF(UNIX)
+  IF(APPLE)
+    get_filename_component(FOURIERCONVOLUTIONCUDALIB_INCLUDE_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../include" ABSOLUTE)
+    get_filename_component(FOURIERCONVOLUTIONCUDALIB_LIB_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../lib" ABSOLUTE)
+    get_filename_component(FOURIERCONVOLUTIONCUDALIB_BIN_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../bin" ABSOLUTE)
+    ELSE(APPLE)
+    get_filename_component(FOURIERCONVOLUTIONCUDALIB_INCLUDE_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../../include" ABSOLUTE)
+    get_filename_component(FOURIERCONVOLUTIONCUDALIB_LIB_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../../lib" ABSOLUTE)
+    get_filename_component(FOURIERCONVOLUTIONCUDALIB_BIN_DIRS "${FOURIERCONVOLUTIONCUDALIB_CMAKE_DIR}/../../../bin" ABSOLUTE)
+  ENDIF(APPLE)
+ENDIF(UNIX)
+
+set(FOURIERCONVOLUTIONCUDALIB_LIBRARIES  "${FOURIERCONVOLUTIONCUDALIB_LIB_DIRS}/libFourierConvolutionCUDALib.so")
+
+set(FOURIERCONVOLUTIONCUDALIB_VERSION  "@FOURIERCONVOLUTIONCUDALIB_VERSION@")
+set(FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR  "@FOURIERCONVOLUTIONCUDALIB_VERSION_MAJOR@")
+set(FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR  "@FOURIERCONVOLUTIONCUDALIB_VERSION_MINOR@")
+set(FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH  "@FOURIERCONVOLUTIONCUDALIB_VERSION_PATCH@")
+set(FOURIERCONVOLUTIONCUDALIB_BUILD_TYPE  "@CMAKE_BUILD_TYPE@")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..198adba
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,16 @@
+# include directories
+# src directory
+INCLUDE_DIRECTORIES(.)
+
+# build and link
+CUDA_ADD_LIBRARY(${PROJECT_NAME} convolution3Dfft.cu standardCUDAfunctions.cu SHARED)
+CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME})
+SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
+INSTALL(TARGETS ${PROJECT_NAME}
+  EXPORT fourierconvolutioncudalib-targets 
+  LIBRARY DESTINATION "${INSTALL_LIB_DIR}" COMPONENT shlib
+  PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" COMPONENT inc)
+
+
+
+

From 76ae2781e58b8cb86ac6c8cc6e03209d1f7cb6a7 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbach@scionics.de>
Date: Fri, 9 Jan 2015 11:33:43 +0100
Subject: [PATCH 02/24] added build instructions for Linux and Mac

---
 README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index f7023cb..20c91e2 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,15 @@ FourierConvolutionCUDALib
 
 Implementation of 3d non-separable convolution using CUDA &amp; FFT Convolution, originally implemented by Fernando Amat for our Nature Methods paper (http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2929.html).
 
-To compile it under Linux/Mac/Windows I suggest NSight. Clone this repository into your cuda-workspace directory. Then make a new shared library project with the same name as the directory.
+To compile it under Linux & OS X (cuda must be available through `PATH`, `LD_LIBRARY_PATH` or equivalent):
 
-Under Project > Properties > Build > Settings > Tool Settings > NVCC Linker add -lcufft and -lcuda to the command line pattern so that it looks like this:
-
-${COMMAND} ${FLAGS} -lcufft -lcuda ${OUTPUT_FLAG} ${OUTPUT_PREFIX} ${OUTPUT} ${INPUTS}
-
-Now build the .so/.dll library and put it into the Fiji directory.
+```bash
+$ cd /path/to/repo
+$ mkdir build
+$ cd build
+$ cmake -DCMAKE_INSTALL_PREFIX=/directory/of/your/choice .. #default is /usr/bin/ or similar
+$ make
+$ make install
+```
 
 NOTE: If you are compiling under Windows, you need to change all 'extern "C"' definitions to 'extern "C" __declspec(dllexport)' for all function calls in the convolution3Dfft.h and convolution3Dfft.cu.

From de24d017686fade6b6edcb5503d26f084c1ab558 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Mon, 12 Jan 2015 15:49:14 +0100
Subject: [PATCH 03/24] fixed build on OSX, need to validate

---
 CMakeLists.txt     | 8 +++++---
 src/CMakeLists.txt | 8 ++++++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9431e3..9e78c05 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,9 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 
 # project name
 PROJECT(FourierConvolutionCUDALib CXX)
+if(NOT(${CMAKE_VERSION} VERSION_LESS "3.0.0"))
+cmake_policy(SET CMP0042 NEW)
+endif()
  
 # version number
 SET (FOURIERCONVOLUTIONCUDALIB_NAME "CUDA FOURIER CONVOLUTION LIBRARY")
@@ -75,9 +78,8 @@ IF(INCLUDE_CUDA)
 	    SET(CUDA_HOST_COMPILER "/usr/bin/llvm-g++")
 	    SET(CMAKE_CXX_COMPILER ${CUDA_HOST_COMPILER})
 	    SET(CMAKE_C_COMPILER "/usr/bin/llvm-gcc")
-	    #gcc-4.2 is quite old and usually does not support 
-	    STRING(REPLACE "${HOST_COMPILER_RELEASE_FLAGS}" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-	    STRING(REPLACE "${HOST_COMPILER_RELEASE_FLAGS}" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+	    SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libstdc++")
+
 	    SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
 	  ELSE()
 	    MESSAGE(WARNING ">> unknown CUDA_HOST_COMPILER (${CUDA_HOST_COMPILER}) or /usr/bin/llvm-g++-4.2 does not exist, cuda host compiler remains set to default")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 198adba..ad52d9f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,8 +3,16 @@
 INCLUDE_DIRECTORIES(.)
 
 # build and link
+FIND_PACKAGE(CUDA QUIET)
 CUDA_ADD_LIBRARY(${PROJECT_NAME} convolution3Dfft.cu standardCUDAfunctions.cu SHARED)
 CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME})
+
+IF(APPLE)
+MESSAGE("adding ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib")
+   TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib)
+ ENDIF(APPLE)
+
+
 SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
 INSTALL(TARGETS ${PROJECT_NAME}
   EXPORT fourierconvolutioncudalib-targets 

From 3c29fb5b4680ece9cdf3395150e72f01431cb8f4 Mon Sep 17 00:00:00 2001
From: steinbac <steinbac@myers-pc-8.mpi-cbg.de>
Date: Tue, 7 Apr 2015 15:21:26 +0200
Subject: [PATCH 04/24] added libcuda dependency

---
 src/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ad52d9f..e0135b0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,6 +6,7 @@ INCLUDE_DIRECTORIES(.)
 FIND_PACKAGE(CUDA QUIET)
 CUDA_ADD_LIBRARY(${PROJECT_NAME} convolution3Dfft.cu standardCUDAfunctions.cu SHARED)
 CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME})
+TARGET_LINK_LIBRARIES(${PROJECT_NAME} cuda)
 
 IF(APPLE)
 MESSAGE("adding ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib")

From a3f71efd2116ed6dff5357a682e7a3258fe1f8c2 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 14 Apr 2015 11:37:42 +0200
Subject: [PATCH 05/24] pulled in updates from master

---
 src/convolution3Dfft.h      | 12 +++++++++---
 src/standardCUDAfunctions.h | 15 ++++++++++-----
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/convolution3Dfft.h b/src/convolution3Dfft.h
index eedff69..95c161a 100644
--- a/src/convolution3Dfft.h
+++ b/src/convolution3Dfft.h
@@ -3,6 +3,12 @@
 
 #include "standardCUDAfunctions.h"
 
+#ifdef _WIN32
+#define FUNCTION_PREFIX extern "C" __declspec(dllexport)
+#else
+#define FUNCTION_PREFIX extern "C"
+#endif
+
 //define constants
 typedef float imageType;//the kind sof images we are working with (you will need to recompile to work with other types)
 static const int MAX_THREADS_CUDA = 1024; //adjust it for your GPU. This is correct for a 2.0 architecture
@@ -25,7 +31,7 @@ WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDi
 
 TODO: pad data with imSize+kernelSize-1 (kernelSize/2 on each side) to impose the boundary conditions that you want: mirror, zero, etc...). Look at conv2DFFT in the CUDA SDK examples
 */
-extern "C" imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
+FUNCTION_PREFIX imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
 
 
 /*
@@ -36,7 +42,7 @@ WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDi
 
 TODO: pad data with imSize+kernelSize-1 (kernelSize/2 on each side) to impose the boundary conditions that you want: mirror, zero, etc...). Look at conv2DFFT in the CUDA SDK examples
 */
-extern "C" void convolution3DfftCUDAInPlace(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
+FUNCTION_PREFIX void convolution3DfftCUDAInPlace(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
 
 /*
 \brief Like convolution3DfftCUDAInPlace but we do not pad the kernel with zeros before convolution. We just do nearest neighbor interpolation of the small size kernel FFT. In preliminary tests we do not seem to lose significant precision in the results
@@ -44,6 +50,6 @@ extern "C" void convolution3DfftCUDAInPlace(imageType* im,int* imDim,imageType*
 NOTE: on a Tesla C2075 the maximum image size seems to be 1024 x  875 x 512 (in float). Approximately 1750 MB. It takes 2.5secs for this maximum size (mostly transferring memory back and forth to GPU)
 WARNING: If imSize > 5*kernelSize the maximum relative error in the convolution is 2% (SO WE SACRIFICE PRECISION FOR MEMORY AND LITTLE SPEED UP)
 */
-//extern "C" __declspec(dllexport) void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
+//FUNCTION_PREFIX void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
 
 #endif //__CONVOLUTION_3D_FFT_H__
diff --git a/src/standardCUDAfunctions.h b/src/standardCUDAfunctions.h
index 60c8414..376fe46 100644
--- a/src/standardCUDAfunctions.h
+++ b/src/standardCUDAfunctions.h
@@ -8,12 +8,17 @@
 #ifndef STANDARDCUDAFUNCTIONS_H_
 #define STANDARDCUDAFUNCTIONS_H_
 
+#ifdef _WIN32
+#define FUNCTION_PREFIX extern "C" __declspec(dllexport)
+#else
+#define FUNCTION_PREFIX extern "C"
+#endif
 //----------------------------------functions to decide whhich GPU to use-------------------------------
 
-extern "C" int getCUDAcomputeCapabilityMinorVersion(int devCUDA);
-extern "C" int getCUDAcomputeCapabilityMajorVersion(int devCUDA);
-extern "C" int getNumDevicesCUDA();
-extern "C" void getNameDeviceCUDA(int devCUDA, char *name);
-extern "C" long long int getMemDeviceCUDA(int devCUDA);
+FUNCTION_PREFIX int getCUDAcomputeCapabilityMinorVersion(int devCUDA);
+FUNCTION_PREFIX int getCUDAcomputeCapabilityMajorVersion(int devCUDA);
+FUNCTION_PREFIX int getNumDevicesCUDA();
+FUNCTION_PREFIX void getNameDeviceCUDA(int devCUDA, char *name);
+FUNCTION_PREFIX long long int getMemDeviceCUDA(int devCUDA);
 
 #endif /* STANDARDCUDAFUNCTIONS_H_ */

From f5f2cafd2e61c1e7703cd86356c30ce8d5c54097 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 14 Apr 2015 11:50:38 +0200
Subject: [PATCH 06/24] added documentation on how to build under Windows using
 Nsight and how to build on OSX/Linux using cmake

---
 README.md | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 20c91e2..450df30 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,21 @@ FourierConvolutionCUDALib
 
 Implementation of 3d non-separable convolution using CUDA &amp; FFT Convolution, originally implemented by Fernando Amat for our Nature Methods paper (http://www.nature.com/nmeth/journal/v11/n6/full/nmeth.2929.html).
 
-To compile it under Linux & OS X (cuda must be available through `PATH`, `LD_LIBRARY_PATH` or equivalent):
+Windows Build Instructions
+--------------------------
+
+To compile it under Windows, NSight available from the CUDA SDK is suggested. Clone this repository into your cuda-workspace directory. Then make a new shared library project with the same name as the directory.
+
+Under Project > Properties > Build > Settings > Tool Settings > NVCC Linker add -lcufft and -lcuda to the command line pattern so that it looks like this:
+
+${COMMAND} ${FLAGS} -lcufft -lcuda ${OUTPUT_FLAG} ${OUTPUT_PREFIX} ${OUTPUT} ${INPUTS}
+
+Now build the .so/.dll library and put it into the Fiji directory.
+
+OSX/Linux Build Instructions
+----------------------------
+
+First, make sure that CUDA must be available through `PATH`, `LD_LIBRARY_PATH` or equivalent. The build system is based on cmake, so please install this at any version higher or equal than 2.8.
 
 ```bash
 $ cd /path/to/repo
@@ -14,4 +28,4 @@ $ make
 $ make install
 ```
 
-NOTE: If you are compiling under Windows, you need to change all 'extern "C"' definitions to 'extern "C" __declspec(dllexport)' for all function calls in the convolution3Dfft.h and convolution3Dfft.cu.
+

From f487ef8151e231a117d42ac4301e4a659b9220cd Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Thu, 23 Apr 2015 10:32:06 +0200
Subject: [PATCH 07/24] * added unit tests to package (using boot unified test
 framework) * expanded standardCUDAfunctions.cu by a function that tries to
 find   the GPU on the host with the highest compute capability,   this
 function is used for the unit tests only   (it assumes that this devices
 comes with the highest DRAM) * tests compile and run on Linux

---
 CMakeLists.txt               |   9 ++
 CTestLists.txt               |  10 ++
 src/standardCUDAfunctions.cu |  22 +++
 src/standardCUDAfunctions.h  |   1 +
 tests/CMakeLists.txt         |  40 +++++
 tests/image_stack_utils.cpp  |  68 ++++++++
 tests/image_stack_utils.h    |  77 +++++++++
 tests/test_algorithms.hpp    | 165 +++++++++++++++++++
 tests/test_fixtures.hpp      | 304 +++++++++++++++++++++++++++++++++++
 tests/test_gpu_convolve.cpp  |  25 +++
 10 files changed, 721 insertions(+)
 create mode 100644 CTestLists.txt
 create mode 100644 tests/CMakeLists.txt
 create mode 100644 tests/image_stack_utils.cpp
 create mode 100644 tests/image_stack_utils.h
 create mode 100644 tests/test_algorithms.hpp
 create mode 100644 tests/test_fixtures.hpp
 create mode 100644 tests/test_gpu_convolve.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e78c05..b727fb9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,15 @@ ENDIF(INCLUDE_CUDA)
 # add subdirectories
 ADD_SUBDIRECTORY(src)
 
+FIND_PACKAGE (Boost 1.42 COMPONENTS system filesystem unit_test_framework thread REQUIRED)
+IF(Boost_FOUND)
+ADD_SUBDIRECTORY(tests)
+enable_testing()
+include("CTestLists.txt")
+ELSE()
+MESSAGE(WARNING ">> Boost libraries not found, unable to compile tests (skipping them)")
+ENDIF()
+
 # Export the package for use from the build-tree
 # (this registers the build-tree with a global CMake-registry)
 #              v = binary    v = library
diff --git a/CTestLists.txt b/CTestLists.txt
new file mode 100644
index 0000000..30dd760
--- /dev/null
+++ b/CTestLists.txt
@@ -0,0 +1,10 @@
+# include(BoostTestTargets)
+# add_boost_test(Independent
+#  SOURCES
+#  test_Independent.cpp
+#  TESTS
+#  Independent_suite)
+
+
+add_test(NAME gpu_convolve COMMAND test_gpu_convolve)
+
diff --git a/src/standardCUDAfunctions.cu b/src/standardCUDAfunctions.cu
index 23c8c92..3d4b0f7 100644
--- a/src/standardCUDAfunctions.cu
+++ b/src/standardCUDAfunctions.cu
@@ -9,6 +9,28 @@
 #include "standardCUDAfunctions.h"
 
 //==============================================
+int selectDeviceWithHighestComputeCapability() {
+
+  int numDevices = 0;
+  HANDLE_ERROR(cudaGetDeviceCount(&numDevices));
+  int computeCapability = 0;
+  int meta = 0;
+  int value = -1;
+  int major = 0;
+  int minor = 0;
+
+  for (short devIdx = 0; devIdx < numDevices; ++devIdx) {
+    cuDeviceComputeCapability(&major, &minor, devIdx);
+    meta = 10 * major + minor;
+    if (meta > computeCapability) {
+      computeCapability = meta;
+      value = devIdx;
+    }
+  }
+
+  return value;
+}
+
 int getCUDAcomputeCapabilityMajorVersion(int devCUDA)
 {
 	int major = 0, minor = 0;
diff --git a/src/standardCUDAfunctions.h b/src/standardCUDAfunctions.h
index 376fe46..f9bde7f 100644
--- a/src/standardCUDAfunctions.h
+++ b/src/standardCUDAfunctions.h
@@ -15,6 +15,7 @@
 #endif
 //----------------------------------functions to decide whhich GPU to use-------------------------------
 
+FUNCTION_PREFIX int selectDeviceWithHighestComputeCapability();
 FUNCTION_PREFIX int getCUDAcomputeCapabilityMinorVersion(int devCUDA);
 FUNCTION_PREFIX int getCUDAcomputeCapabilityMajorVersion(int devCUDA);
 FUNCTION_PREFIX int getNumDevicesCUDA();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..64ee942
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,40 @@
+INCLUDE_DIRECTORIES(.)
+
+SET(Boost_USE_MULTITHREADED ON)
+FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem unit_test_framework REQUIRED)
+IF(Boost_FOUND)
+INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS})
+LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) 
+ENDIF()
+
+#FIXME: OS dependent
+IF(DEFINED ${BOOST_ROOT} AND ${BOOST_ROOT} MATCHES ".*/.*")
+  SET(BOOST_LIB_DIR ${BOOST_ROOT}/lib)
+ELSE()
+  SET(BOOST_LIB_DIR ${Boost_LIBRARY_DIRS})
+ENDIF()
+
+add_library( ext_boost_system  SHARED IMPORTED )
+set_target_properties( ext_boost_system PROPERTIES IMPORTED_LOCATION ${BOOST_LIB_DIR}/libboost_system.so )
+
+add_library( ext_boost_filesystem  SHARED IMPORTED )
+set_target_properties( ext_boost_filesystem PROPERTIES IMPORTED_LOCATION ${BOOST_LIB_DIR}/libboost_filesystem.so )
+
+add_library( ext_boost_unit_test_framework  SHARED IMPORTED )
+set_target_properties( ext_boost_unit_test_framework PROPERTIES IMPORTED_LOCATION ${BOOST_LIB_DIR}/libboost_unit_test_framework.so )
+
+
+IF(CUDA_FOUND)
+
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-I${PROJECT_SOURCE_DIR}/src)
+
+
+CUDA_ADD_EXECUTABLE(test_gpu_convolve test_gpu_convolve.cpp image_stack_utils.cpp)
+target_link_libraries(test_gpu_convolve ext_boost_system ext_boost_filesystem ext_boost_unit_test_framework ${PROJECT_NAME})
+set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-I${PROJECT_SOURCE_DIR}/src")
+
+ELSE(CUDA_FOUND)
+MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!")
+ENDIF(CUDA_FOUND)
+
+
diff --git a/tests/image_stack_utils.cpp b/tests/image_stack_utils.cpp
new file mode 100644
index 0000000..55a66ce
--- /dev/null
+++ b/tests/image_stack_utils.cpp
@@ -0,0 +1,68 @@
+#define _IMAGE_STACK_UTILS_CPP_
+#include <iomanip>
+#include "image_stack_utils.h"
+
+namespace multiviewnative {
+
+/**
+   \brief function to print an image_stack
+
+   \param[in] _cout input ostream
+   \param[in] _marray input multi-dim array, _marray is expected to be defined
+   in c_storage_order
+   that means, if one has a stack of
+   width = 2 (number of columns, x dimension),
+   height = 3 (number of rows, y dimension),
+   depth = 4 (number of frames, z dimension)
+   then this function expects that it can access the array as
+   _marray[z][y][x]
+   similarly, the dimensions located at _marray.shape() are expected to be
+   _marray.shape()[0] = depth
+   _marray.shape()[1] = height
+   _marray.shape()[2] = height
+   \return
+   \retval
+
+*/
+std::ostream& operator<<(std::ostream& _cout, const image_stack& _marray) {
+
+  if (image_stack::dimensionality != 3) {
+    _cout << "dim!=3\n";
+    return _cout;
+  }
+
+  if (_marray.empty()) {
+    _cout << "size=0\n";
+    return _cout;
+  }
+
+  int precision = _cout.precision();
+  _cout << std::setprecision(4);
+  const size_t* shapes = _marray.shape();
+
+  _cout << std::setw(9) << "x = ";
+  for (size_t x_index = 0; x_index < (shapes[2]); ++x_index) {
+    _cout << std::setw(8) << x_index << " ";
+  }
+  _cout << "\n";
+  _cout << std::setfill('-') << std::setw((shapes[2] + 1) * 9) << " "
+        << std::setfill(' ') << std::endl;
+
+  for (size_t z_index = 0; z_index < (shapes[0]); ++z_index) {
+    _cout << "z[" << std::setw(5) << z_index << "] \n";
+    for (size_t y_index = 0; y_index < (shapes[1]); ++y_index) {
+      _cout << "y[" << std::setw(5) << y_index << "] ";
+
+      for (size_t x_index = 0; x_index < (shapes[2]); ++x_index) {
+        _cout << std::setw(8) << _marray[z_index][y_index][x_index] << " ";
+      }
+
+      _cout << "\n";
+    }
+    _cout << "\n";
+  }
+
+  _cout << std::setprecision(precision);
+  return _cout;
+}
+}
diff --git a/tests/image_stack_utils.h b/tests/image_stack_utils.h
new file mode 100644
index 0000000..2a10964
--- /dev/null
+++ b/tests/image_stack_utils.h
@@ -0,0 +1,77 @@
+#ifndef _IMAGE_STACK_UTILS_H_
+#define _IMAGE_STACK_UTILS_H_
+#include <vector>
+#include <iostream>
+#include "boost/multi_array.hpp"
+
+namespace multiviewnative {
+
+  typedef boost::multi_array<float, 3> image_stack;
+  typedef boost::multi_array_ref<float, 3> image_stack_ref;
+  typedef boost::const_multi_array_ref<float, 3> image_stack_cref;
+  typedef boost::multi_array<float, 2> image_frame;
+  typedef boost::multi_array_ref<float, 2> image_frame_ref;
+  typedef boost::const_multi_array_ref<float, 2> image_frame_cref;
+  typedef image_stack::array_view<3>::type image_stack_view;
+  typedef image_stack::array_view<2>::type image_stack_frame;
+  typedef image_stack::array_view<1>::type image_stack_line;
+  typedef boost::multi_array_types::index_range range;
+  typedef boost::general_storage_order<3> storage;
+
+  std::ostream& operator<<(std::ostream&, const image_stack&);
+
+  template <typename DimT, typename ODimT>
+  void adapt_extents_for_fftw_inplace(const DimT& _extent, ODimT& _value,
+				      const storage& _storage =
+				      boost::c_storage_order()) {
+
+    std::fill(_value.begin(), _value.end(), 0);
+
+    std::vector<int> storage_order(_extent.size());
+    for (size_t i = 0; i < _extent.size(); ++i)
+      storage_order[i] = _storage.ordering(i);
+
+    size_t lowest_storage_index =
+      std::min_element(storage_order.begin(), storage_order.end()) -
+      storage_order.begin();
+
+    for (size_t i = 0; i < _extent.size(); ++i)
+      _value[i] =
+        (lowest_storage_index == i) ? 2 * (_extent[i] / 2 + 1) : _extent[i];
+  }
+
+  template <typename StorageT, typename DimT, typename ODimT>
+  void adapt_shape_for_fftw_inplace(const DimT& _extent,
+				    ODimT& _value,
+				    const StorageT& _storage) {
+
+    std::fill(_value.begin(), _value.end(), 0);
+
+    size_t lowest_storage_index =
+      std::min_element(_storage.begin(), _storage.end()) - _storage.begin();
+
+    for (size_t i = 0; i < _extent.size(); ++i)
+      _value[i] =
+        (lowest_storage_index == i) ? 2 * (_extent[i] / 2 + 1) : _extent[i];
+  }
+
+
+
+template <typename DimT, typename ODimT>
+void adapt_extents_for_cufft_inplace(const DimT& _extent, ODimT& _value,
+				     const storage& _storage =
+				     boost::c_storage_order()) {
+
+  adapt_extents_for_fftw_inplace(_extent, _value, _storage);
+}
+
+template <typename StorageT, typename DimT, typename ODimT>
+void adapt_shape_for_cufft_inplace(const DimT& _extent,
+				   ODimT& _value,
+				   const StorageT& _storage) {
+
+  adapt_shape_for_fftw_inplace(_extent, _value, _storage);
+}
+
+}
+#endif /* _IMAGE_STACK_UTILS_H_ */
diff --git a/tests/test_algorithms.hpp b/tests/test_algorithms.hpp
new file mode 100644
index 0000000..0c79143
--- /dev/null
+++ b/tests/test_algorithms.hpp
@@ -0,0 +1,165 @@
+#ifndef _TEST_ALGORITHMS_H_
+#define _TEST_ALGORITHMS_H_
+
+#include <vector>
+#include "image_stack_utils.h"
+
+namespace multiviewnative {
+
+template <typename ImageStackT, typename DimT>
+void convolve(const ImageStackT& _image, const ImageStackT& _kernel,
+              ImageStackT& _result, const std::vector<DimT>& _offset) {
+
+  if (!_image.num_elements()) return;
+
+  std::vector<DimT> half_kernel(3);
+  for (unsigned i = 0; i < 3; ++i) half_kernel[i] = _kernel.shape()[i] / 2;
+
+  float image_value = 0;
+  float kernel_value = 0;
+  float value = 0;
+
+  for (int image_z = _offset[0]; image_z < int(_image.shape()[0] - _offset[0]);
+       ++image_z) {
+    for (int image_y = _offset[1];
+         image_y < int(_image.shape()[1] - _offset[1]); ++image_y) {
+      for (int image_x = _offset[2];
+           image_x < int(_image.shape()[2] - _offset[2]); ++image_x) {
+
+        _result[image_z][image_y][image_x] = 0.f;
+
+        image_value = 0;
+        kernel_value = 0;
+        value = 0;
+
+        // TODO: adapt loops to storage order for the sake of speed (if so,
+        // prefetch line to use SIMD)
+        for (int kernel_z = 0; kernel_z < int(_kernel.shape()[0]); ++kernel_z) {
+          for (int kernel_y = 0; kernel_y < int(_kernel.shape()[1]);
+               ++kernel_y) {
+            for (int kernel_x = 0; kernel_x < int(_kernel.shape()[2]);
+                 ++kernel_x) {
+
+              kernel_value = _kernel[_kernel.shape()[0] - 1 - kernel_z]
+                                    [_kernel.shape()[1] - 1 - kernel_y]
+                                    [_kernel.shape()[2] - 1 - kernel_x];
+              image_value = _image[image_z - half_kernel[0] + kernel_z]
+                                  [image_y - half_kernel[1] + kernel_y]
+                                  [image_x - half_kernel[2] + kernel_x];
+
+              value += kernel_value * image_value;
+            }
+          }
+        }
+        _result[image_z][image_y][image_x] = value;
+      }
+    }
+  }
+}
+
+template <typename ImageStackT, typename DimT>
+typename ImageStackT::element sum_from_offset(
+    const ImageStackT& _image, const std::vector<DimT>& _offset) {
+
+  typename ImageStackT::element value = 0.f;
+
+  // TODO: adapt loops to storage order for the sake of speed (if so, prefetch
+  // line to use SIMD)
+  for (int image_z = _offset[0]; image_z < int(_image.shape()[0] - _offset[0]);
+       ++image_z) {
+    for (int image_y = _offset[1];
+         image_y < int(_image.shape()[1] - _offset[1]); ++image_y) {
+      for (int image_x = _offset[2];
+           image_x < int(_image.shape()[2] - _offset[2]); ++image_x) {
+
+        value += _image[image_z][image_y][image_x];
+      }
+    }
+  }
+
+  return value;
+}
+
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+
+template <typename ValueT, typename DimT>
+ValueT l2norm(ValueT* _first, ValueT* _second, const DimT& _size) {
+
+  ValueT l2norm = 0.;
+
+#ifdef _OPENMP
+  int nthreads = omp_get_num_procs();
+  #pragma omp parallel for num_threads(nthreads) shared(l2norm)
+#endif
+  for (unsigned p = 0; p < _size; ++p)
+    l2norm += (_first[p] - _second[p]) * (_first[p] - _second[p]);
+
+  return l2norm;
+}
+
+template <typename ImageT, typename OtherT>
+float l2norm_within_limits(const ImageT& _first, const OtherT& _second,
+                           const float& _rel_lower_limit_per_axis,
+                           const float& _rel_upper_limit_per_axis) {
+
+  if (_rel_upper_limit_per_axis < _rel_lower_limit_per_axis) {
+    std::cerr << "[l2norm_within_limits]\treceived weird interval ["
+              << _rel_lower_limit_per_axis << "," << _rel_upper_limit_per_axis
+              << "]\n";
+    return 0;
+  }
+
+  float l2norm = 0.;
+
+#pragma omp parallel for num_threads(omp_get_num_procs()) shared(l2norm)
+  for (int image_z = _first.shape()[0] * _rel_lower_limit_per_axis;
+       image_z < int(_first.shape()[0] * _rel_upper_limit_per_axis);
+       ++image_z) {
+    for (int image_y = _first.shape()[1] * _rel_lower_limit_per_axis;
+         image_y < int(_first.shape()[1] * _rel_upper_limit_per_axis);
+         ++image_y) {
+      for (int image_x = _first.shape()[2] * _rel_lower_limit_per_axis;
+           image_x < int(_first.shape()[2] * _rel_upper_limit_per_axis);
+           ++image_x) {
+
+        float intermediate = _first[image_z][image_y][image_x] -
+                             _second[image_z][image_y][image_x];
+        l2norm += (intermediate) * (intermediate);
+      }
+    }
+  }
+
+  return l2norm;
+}
+
+template <typename ValueT, typename DimT>
+ValueT l1norm(ValueT* _first, ValueT* _second, const DimT& _size) {
+
+  ValueT l1norm = 0.;
+
+#pragma omp parallel for num_threads(omp_get_num_procs()) shared(l1norm)
+  for (unsigned p = 0; p < _size; ++p)
+    l1norm += std::fabs(_first[p] - _second[p]);
+
+  return l1norm;
+}
+
+template <typename ValueT, typename DimT>
+ValueT max_value(ValueT* _data, const DimT& _size) {
+
+  ValueT max = *std::max_element(_data, _data + _size);
+
+  return max;
+}
+
+template <typename ValueT, typename DimT>
+ValueT min_value(ValueT* _data, const DimT& _size) {
+
+  ValueT min = *std::min_element(_data, _data + _size);
+
+  return min;
+}
+}
+#endif /* _TEST_ALGORITHMS_H_ */
diff --git a/tests/test_fixtures.hpp b/tests/test_fixtures.hpp
new file mode 100644
index 0000000..10e3956
--- /dev/null
+++ b/tests/test_fixtures.hpp
@@ -0,0 +1,304 @@
+#ifndef _TEST_FIXTURES_H_
+#define _TEST_FIXTURES_H_
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <cmath>
+#include <numeric>
+#include <functional>
+#include <algorithm>
+
+#include <boost/static_assert.hpp>
+#include "boost/multi_array.hpp"
+
+// http://www.boost.org/doc/libs/1_55_0/libs/multi_array/doc/user.html
+// http://stackoverflow.com/questions/2168082/how-to-rewrite-array-from-row-order-to-column-order
+#include "image_stack_utils.h"
+#include "test_algorithms.hpp"
+
+namespace multiviewnative {
+
+template <unsigned short KernelDimSize = 3, unsigned ImageDimSize = 8>
+struct convolutionFixture3D {
+
+  static const int n_dim = 3;
+  static const unsigned halfKernel = KernelDimSize / 2u;
+  static const unsigned imageDimSize = ImageDimSize;
+  static const unsigned kernelDimSize = KernelDimSize;
+
+  const unsigned image_size_;
+  std::vector<int> image_dims_;
+  std::vector<int> padded_image_dims_;
+  std::vector<int> asymm_padded_image_dims_;
+  image_stack image_;
+  image_stack one_;
+  image_stack padded_image_;
+  image_stack padded_one_;
+  image_stack asymm_padded_image_;
+  image_stack asymm_padded_one_;
+  image_stack image_folded_by_horizontal_;
+  image_stack image_folded_by_vertical_;
+  image_stack image_folded_by_depth_;
+  image_stack image_folded_by_all1_;
+  image_stack one_folded_by_asymm_cross_kernel_;
+  image_stack one_folded_by_asymm_one_kernel_;
+  image_stack one_folded_by_asymm_identity_kernel_;
+
+  const unsigned kernel_size_;
+  std::vector<int> kernel_dims_;
+  std::vector<int> asymm_kernel_dims_;
+  image_stack trivial_kernel_;
+  image_stack identity_kernel_;
+  image_stack vertical_kernel_;
+  image_stack horizont_kernel_;
+  image_stack depth_kernel_;
+  image_stack all1_kernel_;
+  image_stack asymm_cross_kernel_;
+  image_stack asymm_one_kernel_;
+  image_stack asymm_identity_kernel_;
+
+  std::vector<int> symm_offsets_;
+  std::vector<range> symm_ranges_;
+  std::vector<int> asymm_offsets_;
+  std::vector<range> asymm_ranges_;
+
+  BOOST_STATIC_ASSERT(KernelDimSize % 2 != 0);
+
+ public:
+  convolutionFixture3D()
+      : image_size_((unsigned)std::pow(ImageDimSize, n_dim)),
+        image_dims_(n_dim, ImageDimSize),
+        padded_image_dims_(n_dim, ImageDimSize + 2 * (KernelDimSize / 2)),
+        asymm_padded_image_dims_(n_dim),
+        image_(boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        one_(boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        padded_image_(boost::extents[ImageDimSize + 2 * (KernelDimSize / 2)]
+                                    [ImageDimSize + 2 * (KernelDimSize / 2)]
+                                    [ImageDimSize + 2 * (KernelDimSize / 2)]),
+        padded_one_(boost::extents[ImageDimSize + 2 * (KernelDimSize / 2)]
+                                  [ImageDimSize + 2 * (KernelDimSize / 2)]
+                                  [ImageDimSize + 2 * (KernelDimSize / 2)]),
+        asymm_padded_image_(),
+        asymm_padded_one_(),
+        image_folded_by_horizontal_(
+            boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        image_folded_by_vertical_(
+            boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        image_folded_by_depth_(
+            boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        image_folded_by_all1_(
+            boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        one_folded_by_asymm_cross_kernel_(
+            boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        one_folded_by_asymm_one_kernel_(
+            boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        one_folded_by_asymm_identity_kernel_(
+            boost::extents[ImageDimSize][ImageDimSize][ImageDimSize]),
+        kernel_size_((unsigned)std::pow(KernelDimSize, n_dim)),
+        kernel_dims_(n_dim, KernelDimSize),
+        asymm_kernel_dims_(n_dim, KernelDimSize),
+        trivial_kernel_(
+            boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]),
+        identity_kernel_(
+            boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]),
+        vertical_kernel_(
+            boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]),
+        horizont_kernel_(
+            boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]),
+        depth_kernel_(
+            boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]),
+        all1_kernel_(
+            boost::extents[KernelDimSize][KernelDimSize][KernelDimSize]),
+        asymm_cross_kernel_(boost::extents[KernelDimSize + 1][KernelDimSize]
+                                          [KernelDimSize - 1]),
+        asymm_one_kernel_(boost::extents[KernelDimSize + 1][KernelDimSize]
+                                        [KernelDimSize - 1]),
+        asymm_identity_kernel_(boost::extents[KernelDimSize + 1][KernelDimSize]
+                                             [KernelDimSize - 1]),
+        symm_offsets_(n_dim, halfKernel),
+        symm_ranges_(n_dim, range(halfKernel, halfKernel + ImageDimSize)),
+        asymm_offsets_(n_dim, 0),
+        asymm_ranges_(n_dim) {
+
+    // FILL KERNELS
+
+    std::fill(trivial_kernel_.data(), trivial_kernel_.data() + kernel_size_,
+              0.f);
+    std::fill(identity_kernel_.data(), identity_kernel_.data() + kernel_size_,
+              0.f);
+    std::fill(vertical_kernel_.data(), vertical_kernel_.data() + kernel_size_,
+              0.f);
+    std::fill(depth_kernel_.data(), depth_kernel_.data() + kernel_size_, 0.f);
+    std::fill(horizont_kernel_.data(), horizont_kernel_.data() + kernel_size_,
+              0.f);
+    std::fill(all1_kernel_.data(), all1_kernel_.data() + kernel_size_, 1.f);
+    std::fill(asymm_cross_kernel_.data(),
+              asymm_cross_kernel_.data() + asymm_cross_kernel_.num_elements(),
+              0.f);
+    std::fill(asymm_one_kernel_.data(),
+              asymm_one_kernel_.data() + asymm_one_kernel_.num_elements(), 0.f);
+    std::fill(
+        asymm_identity_kernel_.data(),
+        asymm_identity_kernel_.data() + asymm_identity_kernel_.num_elements(),
+        0.f);
+
+    identity_kernel_.data()[kernel_size_ / 2] = 1.;
+
+    for (unsigned int index = 0; index < KernelDimSize; ++index) {
+      horizont_kernel_[halfKernel][halfKernel][index] = float(index + 1);
+      vertical_kernel_[halfKernel][index][halfKernel] = float(index + 1);
+      depth_kernel_[index][halfKernel][halfKernel] = float(index + 1);
+    }
+
+    asymm_identity_kernel_[asymm_cross_kernel_.shape()[0] / 2]
+                          [asymm_cross_kernel_.shape()[1] / 2]
+                          [asymm_cross_kernel_.shape()[2] / 2] = 1.f;
+
+    for (int z_index = 0; z_index < int(asymm_cross_kernel_.shape()[0]);
+         ++z_index) {
+      for (int y_index = 0; y_index < int(asymm_cross_kernel_.shape()[1]);
+           ++y_index) {
+        for (int x_index = 0; x_index < int(asymm_cross_kernel_.shape()[2]);
+             ++x_index) {
+
+          if (z_index == (int)asymm_cross_kernel_.shape()[0] / 2 &&
+              y_index == (int)asymm_cross_kernel_.shape()[1] / 2) {
+            asymm_cross_kernel_[z_index][y_index][x_index] = x_index + 1;
+            asymm_one_kernel_[z_index][y_index][x_index] = 1;
+          }
+
+          if (x_index == (int)asymm_cross_kernel_.shape()[2] / 2 &&
+              y_index == (int)asymm_cross_kernel_.shape()[1] / 2) {
+            asymm_cross_kernel_[z_index][y_index][x_index] = z_index + 101;
+            asymm_one_kernel_[z_index][y_index][x_index] = 1;
+          }
+
+          if (x_index == (int)asymm_cross_kernel_.shape()[2] / 2 &&
+              z_index == (int)asymm_cross_kernel_.shape()[0] / 2) {
+            asymm_cross_kernel_[z_index][y_index][x_index] = y_index + 11;
+            asymm_one_kernel_[z_index][y_index][x_index] = 1;
+          }
+        }
+      }
+    }
+
+    // FILL IMAGES
+    unsigned padded_image_axis = ImageDimSize + 2 * halfKernel;
+    unsigned padded_image_size = std::pow(padded_image_axis, n_dim);
+    std::fill(image_.data(), image_.data() + image_size_, 0.f);
+    std::fill(one_.data(), one_.data() + image_size_, 0.f);
+    std::fill(padded_image_.data(), padded_image_.data() + padded_image_size,
+              0.f);
+    std::fill(padded_one_.data(), padded_one_.data() + padded_image_size, 0.f);
+    padded_one_[padded_image_axis / 2][padded_image_axis / 2]
+               [padded_image_axis / 2] = 1.f;
+    one_[ImageDimSize / 2][ImageDimSize / 2][ImageDimSize / 2] = 1.f;
+
+    unsigned image_index = 0;
+    for (int z_index = 0; z_index < int(image_dims_[0]); ++z_index) {
+      for (int y_index = 0; y_index < int(image_dims_[1]); ++y_index) {
+        for (int x_index = 0; x_index < int(image_dims_[2]); ++x_index) {
+          image_[z_index][y_index][x_index] = float(image_index++);
+        }
+      }
+    }
+
+    // PADD THE IMAGE FOR CONVOLUTION
+    range axis_subrange = range(halfKernel, halfKernel + ImageDimSize);
+    image_stack_view padded_image_original = padded_image_
+        [boost::indices[axis_subrange][axis_subrange][axis_subrange]];
+    padded_image_original = image_;
+
+    image_stack padded_image_folded_by_horizontal = padded_image_;
+    image_stack padded_image_folded_by_vertical = padded_image_;
+    image_stack padded_image_folded_by_depth = padded_image_;
+    image_stack padded_image_folded_by_all1 = padded_image_;
+
+    // PREPARE ASYMM IMAGES
+    // std::vector<unsigned> asymm_padded_image_shape(n_dim);
+
+    std::copy(asymm_cross_kernel_.shape(),
+              asymm_cross_kernel_.shape() + image_stack::dimensionality,
+              asymm_kernel_dims_.begin());
+
+
+    for (int i = 0; i < n_dim; ++i) {
+      asymm_offsets_[i] = asymm_cross_kernel_.shape()[i]/2;
+      asymm_ranges_[i] =
+          range(asymm_offsets_[i], asymm_offsets_[i] + ImageDimSize);
+      asymm_padded_image_dims_[i] = ImageDimSize + 2 * asymm_offsets_[i];
+    }
+
+    asymm_padded_image_.resize(asymm_padded_image_dims_);
+    asymm_padded_one_.resize(asymm_padded_image_dims_);
+    std::fill(asymm_padded_image_.data(),
+              asymm_padded_image_.data() + asymm_padded_image_.num_elements(),
+              0.f);
+    std::fill(asymm_padded_one_.data(),
+              asymm_padded_one_.data() + asymm_padded_one_.num_elements(), 0.f);
+    asymm_padded_one_[asymm_padded_one_.shape()[0] / 2]
+                     [asymm_padded_one_.shape()[1] / 2]
+                     [asymm_padded_one_.shape()[2] / 2] = 1.f;
+
+    image_stack_view asymm_padded_image_original = asymm_padded_image_
+        [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]][asymm_ranges_[2]]];
+    asymm_padded_image_original = image_;
+
+    image_stack asymm_padded_one_folded_by_asymm_cross_kernel =
+        asymm_padded_one_;
+    image_stack asymm_padded_one_folded_by_asymm_one_kernel = asymm_padded_one_;
+    image_stack asymm_padded_one_folded_by_asymm_identity_kernel =
+        asymm_padded_one_;
+
+    // CONVOLVE
+    convolve(padded_image_, horizont_kernel_, padded_image_folded_by_horizontal,
+             symm_offsets_);
+    convolve(padded_image_, vertical_kernel_, padded_image_folded_by_vertical,
+             symm_offsets_);
+    convolve(padded_image_, depth_kernel_, padded_image_folded_by_depth,
+             symm_offsets_);
+    convolve(padded_image_, all1_kernel_, padded_image_folded_by_all1,
+             symm_offsets_);
+
+    convolve(asymm_padded_one_, asymm_cross_kernel_,
+             asymm_padded_one_folded_by_asymm_cross_kernel, asymm_offsets_);
+    convolve(asymm_padded_one_, asymm_one_kernel_,
+             asymm_padded_one_folded_by_asymm_one_kernel, asymm_offsets_);
+    convolve(asymm_padded_one_, asymm_identity_kernel_,
+             asymm_padded_one_folded_by_asymm_identity_kernel, asymm_offsets_);
+
+    // EXTRACT NON-PADDED CONTENT FROM CONVOLVED IMAGE STACKS
+    image_folded_by_horizontal_ = padded_image_folded_by_horizontal
+        [boost::indices[axis_subrange][axis_subrange][axis_subrange]];
+    image_folded_by_vertical_ = padded_image_folded_by_vertical
+        [boost::indices[axis_subrange][axis_subrange][axis_subrange]];
+    image_folded_by_depth_ = padded_image_folded_by_depth
+        [boost::indices[axis_subrange][axis_subrange][axis_subrange]];
+    image_folded_by_all1_ = padded_image_folded_by_all1
+        [boost::indices[axis_subrange][axis_subrange][axis_subrange]];
+
+    one_folded_by_asymm_cross_kernel_ =
+        asymm_padded_one_folded_by_asymm_cross_kernel
+            [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]]
+                           [asymm_ranges_[2]]];
+    one_folded_by_asymm_one_kernel_ =
+        asymm_padded_one_folded_by_asymm_one_kernel
+            [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]]
+                           [asymm_ranges_[2]]];
+    one_folded_by_asymm_identity_kernel_ =
+        asymm_padded_one_folded_by_asymm_identity_kernel
+            [boost::indices[asymm_ranges_[0]][asymm_ranges_[1]]
+                           [asymm_ranges_[2]]];
+  }
+
+  virtual ~convolutionFixture3D() {};
+
+  static const unsigned image_axis_size = ImageDimSize;
+  static const unsigned kernel_axis_size = KernelDimSize;
+};
+
+typedef convolutionFixture3D<> default_3D_fixture;
+
+}  // namespace
+
+#endif
diff --git a/tests/test_gpu_convolve.cpp b/tests/test_gpu_convolve.cpp
new file mode 100644
index 0000000..ea3f385
--- /dev/null
+++ b/tests/test_gpu_convolve.cpp
@@ -0,0 +1,25 @@
+#define BOOST_TEST_DYN_LINK
+#define BOOST_TEST_MODULE GPU_CONVOLUTION
+#include "boost/test/unit_test.hpp"
+#include "test_fixtures.hpp"
+#include <numeric>
+#include "convolution3Dfft.h"
+
+BOOST_FIXTURE_TEST_SUITE(legacy_convolution,
+                         multiviewnative::default_3D_fixture)
+
+BOOST_AUTO_TEST_CASE(trivial_convolve) {
+
+  float* image = image_.data();
+  float* kernel = new float[kernel_size_];
+  std::fill(kernel, kernel + kernel_size_, 0.f);
+
+  convolution3DfftCUDAInPlace(image, &image_dims_[0], kernel, &kernel_dims_[0],
+                              selectDeviceWithHighestComputeCapability());
+
+  float sum = std::accumulate(image, image + image_size_, 0.f);
+  BOOST_CHECK_CLOSE(sum, 0.f, .00001);
+
+  delete[] kernel;
+}
+BOOST_AUTO_TEST_SUITE_END()

From 3443bea7169969abbd169ff7679c1a63d39e8476 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Thu, 23 Apr 2015 10:46:56 +0200
Subject: [PATCH 08/24] added new file that contains padding utils, completed
 testing suite for GPU based convolution

---
 tests/padd_utils.h          | 169 ++++++++++++++++++++++++++++++++++
 tests/test_gpu_convolve.cpp | 176 +++++++++++++++++++++++++++++++++++-
 2 files changed, 341 insertions(+), 4 deletions(-)
 create mode 100644 tests/padd_utils.h

diff --git a/tests/padd_utils.h b/tests/padd_utils.h
new file mode 100644
index 0000000..a2f6000
--- /dev/null
+++ b/tests/padd_utils.h
@@ -0,0 +1,169 @@
+#ifndef _PADD_UTILS_H_
+#define _PADD_UTILS_H_
+#include <vector>
+#include <algorithm>
+//#include <type_traits>
+#include <limits>
+#include "boost/multi_array.hpp"
+
+namespace multiviewnative {
+
+
+template <typename outT>
+struct add_minus_1 {
+
+  template <typename any_type>
+  outT operator()(const any_type& _first, const any_type& _second) {
+    return _first + _second - 1;
+  }
+};
+
+template <typename inT, typename outT>
+struct minus_1_div_2 {
+
+  outT operator()(const inT& _first) { return (_first - 1) / 2; }
+};
+
+template <typename ImageStackT>
+struct no_padd {
+
+  typedef typename ImageStackT::value_type value_type;
+  typedef typename ImageStackT::size_type size_type;
+  typedef typename ImageStackT::template array_view<
+      ImageStackT::dimensionality>::type image_stack_view;
+  typedef boost::multi_array_types::index_range range;
+
+  std::vector<size_type> extents_;
+  std::vector<size_type> offsets_;
+
+  no_padd()
+      : extents_(ImageStackT::dimensionality, 0),
+        offsets_(ImageStackT::dimensionality, 0) {}
+
+  template <typename T, typename U>
+  no_padd(T* _image_shape, U* _kernel_shape)
+      : extents_(_image_shape, _image_shape + ImageStackT::dimensionality),
+        offsets_(ImageStackT::dimensionality, 0) {
+    // static_assert(
+    //     std::numeric_limits<T>::is_integer,
+    //     "[no_padd] didn't receive integer type as image shape descriptor");
+    // static_assert(
+    //     std::numeric_limits<U>::is_integer,
+    //     "[no_padd] didn't receive integer type as kernel shape descriptor");
+  }
+
+  template <typename ImageStackRefT, typename OtherStackT>
+  void insert_at_offsets(const ImageStackRefT& _source, OtherStackT& _target) {
+    _target = _source;
+  }
+
+ 
+
+  const size_type* offsets() const { return &offsets_[0]; }
+
+  const size_type* extents() const { return &extents_[0]; }
+};
+
+template <typename ImageStackT>
+struct zero_padd {
+
+  typedef typename ImageStackT::value_type value_type;
+  typedef typename ImageStackT::size_type size_type;
+  typedef typename ImageStackT::template array_view<
+      ImageStackT::dimensionality>::type image_stack_view;
+  typedef boost::multi_array_types::index_range range;
+
+  std::vector<size_type> extents_;
+  std::vector<size_type> offsets_;
+
+  zero_padd()
+      : extents_(ImageStackT::dimensionality, 0),
+        offsets_(ImageStackT::dimensionality, 0) {}
+
+  zero_padd(const zero_padd& _other)
+      : extents_(_other.extents_), offsets_(_other.offsets_) {}
+
+  template <typename T, typename U>
+  zero_padd(T* _image, U* _kernel)
+      : extents_(ImageStackT::dimensionality, 0),
+        offsets_(ImageStackT::dimensionality, 0) {
+    // static_assert(
+    //     std::numeric_limits<T>::is_integer == true,
+    //     "[zero_padd] didn't receive integer type as image shape descriptor");
+    // static_assert(
+    //     std::numeric_limits<U>::is_integer,
+    //     "[zero_padd] didn't receive integer type as kernel shape descriptor");
+
+    std::transform(_image, _image + ImageStackT::dimensionality, _kernel,
+                   extents_.begin(), add_minus_1<T>());
+
+    std::transform(_kernel, _kernel + ImageStackT::dimensionality,
+                   offsets_.begin(), minus_1_div_2<U, size_type>());
+  }
+
+  zero_padd& operator=(const zero_padd& _other) {
+    if (this != &_other) {
+      extents_ = _other.extents_;
+      offsets_ = _other.offsets_;
+    }
+    return *this;
+  }
+
+  /**
+     \brief function that inserts _source in _target given limits by vector
+     offsets,
+     _target is expected to have dimensions
+     (source.shape()[0]+2*offsets_[0])x(source.shape()[1]+2*offsets_[1])x...
+     _target is hence expected to have dimensions extents_[0]xextents_[1]x...
+
+     as an example:
+     _source =
+     1 1
+     1 1
+     given a kernel of 3x3x3
+     would expecte a _target of
+     0 0 0 0
+     0 0 0 0
+     0 0 0 0
+     0 0 0 0
+
+     and the result would look like
+     0 0 0 0
+     0 1 1 0
+     0 1 1 0
+     0 0 0 0
+
+     \param[in] _source image stack to embed into target
+     \param[in] _target image stack of size extents_[0]xextents_[1]x...
+
+     \return
+     \retval
+
+  */
+  template <typename ImageStackRefT, typename OtherStackT>
+  void insert_at_offsets(const ImageStackRefT& _source, OtherStackT& _target) {
+
+    if (std::lexicographical_compare(
+            _target.shape(), _target.shape() + OtherStackT::dimensionality,
+            extents_.begin(), extents_.end()))
+      throw std::runtime_error(
+          "multiviewnative::zero_padd::insert_at_offsets]\ttarget image stack "
+          "is smaller or equal in size than source\n");
+
+    image_stack_view subview_padded_image = _target
+        [boost::indices[range(offsets_[0], offsets_[0] + _source.shape()[0])]
+                       [range(offsets_[1], offsets_[1] + _source.shape()[1])]
+                       [range(offsets_[2], offsets_[2] + _source.shape()[2])]];
+    subview_padded_image = _source;
+  }
+
+
+
+  const size_type* offsets() const { return &offsets_[0]; }
+
+  const size_type* extents() const { return &extents_[0]; }
+
+  virtual ~zero_padd() {};
+};
+}
+#endif /* _PADD_UTILS_H_ */
diff --git a/tests/test_gpu_convolve.cpp b/tests/test_gpu_convolve.cpp
index ea3f385..7fa3a12 100644
--- a/tests/test_gpu_convolve.cpp
+++ b/tests/test_gpu_convolve.cpp
@@ -2,6 +2,7 @@
 #define BOOST_TEST_MODULE GPU_CONVOLUTION
 #include "boost/test/unit_test.hpp"
 #include "test_fixtures.hpp"
+#include "padd_utils.h"
 #include <numeric>
 #include "convolution3Dfft.h"
 
@@ -11,15 +12,182 @@ BOOST_FIXTURE_TEST_SUITE(legacy_convolution,
 BOOST_AUTO_TEST_CASE(trivial_convolve) {
 
   float* image = image_.data();
-  float* kernel = new float[kernel_size_];
-  std::fill(kernel, kernel + kernel_size_, 0.f);
+  std::vector<float> kernel(kernel_size_,0);
 
-  convolution3DfftCUDAInPlace(image, &image_dims_[0], kernel, &kernel_dims_[0],
+  convolution3DfftCUDAInPlace(image, &image_dims_[0], 
+			      &kernel[0], &kernel_dims_[0],
                               selectDeviceWithHighestComputeCapability());
 
   float sum = std::accumulate(image, image + image_size_, 0.f);
   BOOST_CHECK_CLOSE(sum, 0.f, .00001);
 
-  delete[] kernel;
 }
+
+
+BOOST_AUTO_TEST_CASE(identity_convolve) {
+
+  using namespace multiviewnative;
+
+  float sum_expected = std::accumulate(
+      image_.data(), image_.data() + image_.num_elements(), 0.f);
+
+  zero_padd<image_stack> padder(image_.shape(), identity_kernel_.shape());
+  image_stack padded_image(padder.extents_, image_.storage_order());
+  padder.insert_at_offsets(image_, padded_image);
+
+  std::vector<int> extents_as_int(padder.extents_.size());
+  std::copy(padder.extents_.begin(), padder.extents_.end(),
+            extents_as_int.begin());
+
+  convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0],
+                              identity_kernel_.data(), &kernel_dims_[0],
+                              selectDeviceWithHighestComputeCapability());
+
+  float sum = std::accumulate(image_.data(),
+                              image_.data() + image_.num_elements(), 0.f);
+  BOOST_CHECK_CLOSE(sum, sum_expected, .00001);
+}
+
+BOOST_AUTO_TEST_CASE(horizontal_convolve) {
+  using namespace multiviewnative;
+
+  float sum_expected =
+      std::accumulate(image_folded_by_horizontal_.data(),
+                      image_folded_by_horizontal_.data() +
+                          image_folded_by_horizontal_.num_elements(),
+                      0.f);
+
+  zero_padd<image_stack> padder(image_.shape(), horizont_kernel_.shape());
+  image_stack padded_image(padder.extents_, image_.storage_order());
+
+  padder.insert_at_offsets(image_, padded_image);
+
+  std::vector<int> extents_as_int(padder.extents_.size());
+  std::copy(padder.extents_.begin(), padder.extents_.end(),
+            extents_as_int.begin());
+
+  convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0],
+                              horizont_kernel_.data(), &kernel_dims_[0],
+                              selectDeviceWithHighestComputeCapability());
+
+  image_ = padded_image
+      [boost::indices
+           [range(padder.offsets()[0], padder.offsets()[0] + image_dims_[0])]
+           [range(padder.offsets()[1], padder.offsets()[1] + image_dims_[1])]
+           [range(padder.offsets()[2], padder.offsets()[2] + image_dims_[2])]];
+
+  float sum = std::accumulate(image_.data(),
+                              image_.data() + image_.num_elements(), 0.f);
+
+  BOOST_REQUIRE_CLOSE(sum, sum_expected, .00001);
+}
+
+BOOST_AUTO_TEST_CASE(vertical_convolve) {
+
+  multiviewnative::zero_padd<multiviewnative::image_stack> padder(
+      image_.shape(), vertical_kernel_.shape());
+  multiviewnative::image_stack padded_image(padder.extents_,
+                                            image_.storage_order());
+
+  padder.insert_at_offsets(image_, padded_image);
+
+  std::vector<int> extents_as_int(padder.extents_.size());
+  std::copy(padder.extents_.begin(), padder.extents_.end(),
+            extents_as_int.begin());
+
+  convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0],
+                              vertical_kernel_.data(), &kernel_dims_[0],
+                              selectDeviceWithHighestComputeCapability());
+
+  float sum_expected =
+      std::accumulate(image_folded_by_vertical_.data(),
+                      image_folded_by_vertical_.data() +
+                          image_folded_by_vertical_.num_elements(),
+                      0.f);
+
+  image_ = padded_image
+      [boost::indices
+           [multiviewnative::range(padder.offsets()[0],
+                                   padder.offsets()[0] + image_dims_[0])]
+           [multiviewnative::range(padder.offsets()[1],
+                                   padder.offsets()[1] + image_dims_[1])]
+           [multiviewnative::range(padder.offsets()[2],
+                                   padder.offsets()[2] + image_dims_[2])]];
+
+  float sum = std::accumulate(image_.data(),
+                              image_.data() + image_.num_elements(), 0.f);
+  BOOST_CHECK_CLOSE(sum, sum_expected, .00001);
+}
+
+BOOST_AUTO_TEST_CASE(depth_convolve) {
+
+  multiviewnative::zero_padd<multiviewnative::image_stack> padder(
+      image_.shape(), depth_kernel_.shape());
+  multiviewnative::image_stack padded_image(padder.extents_,
+                                            image_.storage_order());
+
+  padder.insert_at_offsets(image_, padded_image);
+
+  std::vector<int> extents_as_int(padder.extents_.size());
+  std::copy(padder.extents_.begin(), padder.extents_.end(),
+            extents_as_int.begin());
+
+  convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0],
+                              depth_kernel_.data(), &kernel_dims_[0],
+                              selectDeviceWithHighestComputeCapability());
+
+  float sum_expected = std::accumulate(
+      image_folded_by_depth_.data(),
+      image_folded_by_depth_.data() + image_folded_by_depth_.num_elements(),
+      0.f);
+
+  image_ = padded_image
+      [boost::indices
+           [multiviewnative::range(padder.offsets()[0],
+                                   padder.offsets()[0] + image_dims_[0])]
+           [multiviewnative::range(padder.offsets()[1],
+                                   padder.offsets()[1] + image_dims_[1])]
+           [multiviewnative::range(padder.offsets()[2],
+                                   padder.offsets()[2] + image_dims_[2])]];
+
+  float sum = std::accumulate(image_.data(),
+                              image_.data() + image_.num_elements(), 0.f);
+  BOOST_CHECK_CLOSE(sum, sum_expected, .00001);
+}
+
+BOOST_AUTO_TEST_CASE(all1_convolve) {
+
+  multiviewnative::zero_padd<multiviewnative::image_stack> padder(
+      image_.shape(), all1_kernel_.shape());
+  multiviewnative::image_stack padded_image(padder.extents_,
+                                            image_.storage_order());
+
+  padder.insert_at_offsets(image_, padded_image);
+
+  std::vector<int> extents_as_int(padder.extents_.size());
+  std::copy(padder.extents_.begin(), padder.extents_.end(),
+            extents_as_int.begin());
+
+  convolution3DfftCUDAInPlace(padded_image.data(), &extents_as_int[0],
+                              all1_kernel_.data(), &kernel_dims_[0],
+                              selectDeviceWithHighestComputeCapability());
+
+  float sum_expected = std::accumulate(
+      image_folded_by_all1_.data(),
+      image_folded_by_all1_.data() + image_folded_by_all1_.num_elements(), 0.f);
+
+  image_ = padded_image
+      [boost::indices
+           [multiviewnative::range(padder.offsets()[0],
+                                   padder.offsets()[0] + image_dims_[0])]
+           [multiviewnative::range(padder.offsets()[1],
+                                   padder.offsets()[1] + image_dims_[1])]
+           [multiviewnative::range(padder.offsets()[2],
+                                   padder.offsets()[2] + image_dims_[2])]];
+
+  float sum = std::accumulate(image_.data(),
+                              image_.data() + image_.num_elements(), 0.f);
+  BOOST_CHECK_CLOSE(sum, sum_expected, .00001);
+}
+
 BOOST_AUTO_TEST_SUITE_END()

From 9486df3348fe310d454f6b9b1101e340eb62a723 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Thu, 23 Apr 2015 10:54:44 +0200
Subject: [PATCH 09/24] added more information on using the tests to the
 README.md

---
 README.md | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 450df30..4e97960 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,8 @@ ${COMMAND} ${FLAGS} -lcufft -lcuda ${OUTPUT_FLAG} ${OUTPUT_PREFIX} ${OUTPUT} ${I
 
 Now build the .so/.dll library and put it into the Fiji directory.
 
+This build does not support unit tests.
+
 OSX/Linux Build Instructions
 ----------------------------
 
@@ -23,9 +25,26 @@ First, make sure that CUDA must be available through `PATH`, `LD_LIBRARY_PATH` o
 $ cd /path/to/repo
 $ mkdir build
 $ cd build
-$ cmake -DCMAKE_INSTALL_PREFIX=/directory/of/your/choice .. #default is /usr/bin/ or similar
+$ cmake -DCMAKE_INSTALL_PREFIX=/directory/of/your/choice .. #default is /usr/lib/ or /usr/lib64 depending on your OS
 $ make
+$ ctest #(optional) in case you'd like to make sure the library does what it should on your system
 $ make install
 ```
 
 
+Dependencies
+------------
+
+The unit tests that come with the library require boost (version 1.42 or higher) to be available to the cmake build system. If cmake is unable to detect them, try:
+
+```
+$ cmake -DCMAKE_INSTALL_PREFIX=/directory/of/your/choice -DBOOST_ROOT=/path/to/boost/root .. #default is /usr/lib/ or /usr/lib64 depending on your OS
+```
+
+Here, ```/path/to/boost/root``` should contain the boost libraries and the boost headers.
+
+
+How to get Help
+===============
+
+In case something does not work, do not hesitate to open a ticket on our github page: https://github.com/StephanPreibisch/FourierConvolutionCUDALib

From 5836a2d729687640c60d489e0a8fb657f27b19b6 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Thu, 23 Apr 2015 11:29:29 +0200
Subject: [PATCH 10/24] fixed cmake bugs for Win32 build, Win32 with Visual
 Studio 2013 runs through, Linking against cuda.lib fails

---
 CMakeLists.txt          |  6 +++---
 src/CMakeLists.txt      | 16 +++++++++++++---
 src/convolution3Dfft.cu |  8 ++++----
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b727fb9..64b81d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,9 +40,9 @@ IF(UNIX)
 ELSE(UNIX)
   IF(WIN32 AND NOT CYGWIN)
     set(INSTALL_SHARE_DIR CMake CACHE PATH "Installation directory for shared files")
-  ELSE(WIN32)
+  ELSE()
     MESSAGE(FATAL_ERROR ">> UNKNOWN ARCHITECTURE .. unable to set share dir")
-  ENDIF(WIN32)
+  ENDIF()
 ENDIF(UNIX)
  
 # Make relative paths absolute (needed later on)
@@ -103,7 +103,7 @@ ENDIF(INCLUDE_CUDA)
 # add subdirectories
 ADD_SUBDIRECTORY(src)
 
-FIND_PACKAGE (Boost 1.42 COMPONENTS system filesystem unit_test_framework thread REQUIRED)
+FIND_PACKAGE (Boost 1.42 COMPONENTS system filesystem unit_test_framework thread)
 IF(Boost_FOUND)
 ADD_SUBDIRECTORY(tests)
 enable_testing()
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e0135b0..5d2b5d1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,12 +13,22 @@ MESSAGE("adding ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib")
    TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib)
  ENDIF(APPLE)
 
-
+MESSAGE("INSTALL targets goind to ${INSTALL_LIB_DIR} ${INSTALL_INCLUDE_DIR}")
 SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
+IF(WIN32)
+INSTALL(TARGETS ${PROJECT_NAME}
+  EXPORT fourierconvolutioncudalib-targets 
+  LIBRARY DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
+  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
+  PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" #COMPONENT inc
+  )
+ELSE(WIN32)
 INSTALL(TARGETS ${PROJECT_NAME}
   EXPORT fourierconvolutioncudalib-targets 
-  LIBRARY DESTINATION "${INSTALL_LIB_DIR}" COMPONENT shlib
-  PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" COMPONENT inc)
+  LIBRARY DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
+  PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" #COMPONENT inc
+  )
+ENDIF(WIN32)
 
 
 
diff --git a/src/convolution3Dfft.cu b/src/convolution3Dfft.cu
index e6f022f..6b2a7aa 100644
--- a/src/convolution3Dfft.cu
+++ b/src/convolution3Dfft.cu
@@ -3,11 +3,11 @@
 #include "cuda.h"
 #include "cufft.h"
 #include <iostream>
-#include <math.h>
+#include <cmath>
+#include <algorithm>
 
-
-__device__ static const float PI_2 = 6.28318530717958620f;
-__device__ static const float PI_1 =  3.14159265358979310f;
+//__device__ static const float PI_2 = 6.28318530717958620f;
+//__device__ static const float PI_1 =  3.14159265358979310f;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Modulate Fourier image of padded data by Fourier image of padded kernel

From debbdcb24cd8826274b1c1fe38587e8443fd5ee3 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Thu, 23 Apr 2015 11:33:36 +0200
Subject: [PATCH 11/24] defined location of cuda.lib explicitely when
 generating VS 2013 solutions

---
 src/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5d2b5d1..c3d4e31 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,14 +6,18 @@ INCLUDE_DIRECTORIES(.)
 FIND_PACKAGE(CUDA QUIET)
 CUDA_ADD_LIBRARY(${PROJECT_NAME} convolution3Dfft.cu standardCUDAfunctions.cu SHARED)
 CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME})
+IF(WIN32)
+TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
+ELSE()
 TARGET_LINK_LIBRARIES(${PROJECT_NAME} cuda)
+ENDIF()
 
+# TODO: is this really needed
 IF(APPLE)
 MESSAGE("adding ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib")
    TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib)
  ENDIF(APPLE)
 
-MESSAGE("INSTALL targets goind to ${INSTALL_LIB_DIR} ${INSTALL_INCLUDE_DIR}")
 SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
 IF(WIN32)
 INSTALL(TARGETS ${PROJECT_NAME}

From 35dc7ee251dea01083792d32f21219c4268a7b27 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Thu, 23 Apr 2015 11:38:49 +0200
Subject: [PATCH 12/24] commented unused code out that generated warnings

---
 src/convolution3Dfft.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/convolution3Dfft.cu b/src/convolution3Dfft.cu
index e6f022f..92afcc4 100644
--- a/src/convolution3Dfft.cu
+++ b/src/convolution3Dfft.cu
@@ -6,8 +6,8 @@
 #include <math.h>
 
 
-__device__ static const float PI_2 = 6.28318530717958620f;
-__device__ static const float PI_1 =  3.14159265358979310f;
+//__device__ static const float PI_2 = 6.28318530717958620f;
+//__device__ static const float PI_1 =  3.14159265358979310f;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Modulate Fourier image of padded data by Fourier image of padded kernel
@@ -49,7 +49,7 @@ __global__ void __launch_bounds__(MAX_THREADS_CUDA)  modulateAndNormalizeSubsamp
 	long long int offset = (long long int)blockDim.x * (long long int)gridDim.x;
 	int k_0,k_1,k_2;
 	int aux;
-	float auxExp, auxSin,auxCos;
+	//	float auxExp, auxSin,auxCos;
     while(i < datasize)
 	{
 		//for each dimension we need to access k_i*r_i  i=0, 1, 2

From fb428a8b957f856651b301cdd2cc26e24a356b46 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Fri, 24 Apr 2015 10:57:39 +0200
Subject: [PATCH 13/24] large commit, added static library on windows to see if
 the tests work there, need to make the tests work with dll though

---
 CMakeLists.txt              |  5 +++++
 src/CMakeLists.txt          | 23 ++++++++++++++++-------
 tests/CMakeLists.txt        | 31 ++++++++++++++-----------------
 tests/test_gpu_convolve.cpp |  1 -
 4 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 64b81d1..aea004c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,11 @@ PROJECT(FourierConvolutionCUDALib CXX)
 if(NOT(${CMAKE_VERSION} VERSION_LESS "3.0.0"))
 cmake_policy(SET CMP0042 NEW)
 endif()
+
+if(${CMAKE_VERSION} VERSION_GREATER "3.1")
+cmake_policy(SET CMP0054 NEW)
+endif()
+
  
 # version number
 SET (FOURIERCONVOLUTIONCUDALIB_NAME "CUDA FOURIER CONVOLUTION LIBRARY")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c3d4e31..0a354bb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,24 +8,33 @@ CUDA_ADD_LIBRARY(${PROJECT_NAME} convolution3Dfft.cu standardCUDAfunctions.cu SH
 CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME})
 IF(WIN32)
 TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
+include (GenerateExportHeader)
+GENERATE_EXPORT_HEADER( ${PROJECT_NAME}
+             BASE_NAME ${PROJECT_NAME}
+             EXPORT_MACRO_NAME ${PROJECT_NAME}_EXPORT
+             EXPORT_FILE_NAME ${PROJECT_NAME}_Export.h
+             STATIC_DEFINE ${PROJECT_NAME}_BUILT_AS_STATIC
+)
+CUDA_ADD_LIBRARY(${PROJECT_NAME}_static convolution3Dfft.cu standardCUDAfunctions.cu)
+CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME}_static)
+TARGET_LINK_LIBRARIES(${PROJECT_NAME}_static ${CUDA_CUDA_LIBRARY})
 ELSE()
-TARGET_LINK_LIBRARIES(${PROJECT_NAME} cuda)
+
+TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
+
 ENDIF()
 
-# TODO: is this really needed
-IF(APPLE)
-MESSAGE("adding ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib")
-   TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcuda.dylib)
- ENDIF(APPLE)
 
 SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
 IF(WIN32)
-INSTALL(TARGETS ${PROJECT_NAME}
+SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
+INSTALL(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static
   EXPORT fourierconvolutioncudalib-targets 
   LIBRARY DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
   ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
   PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" #COMPONENT inc
   )
+  
 ELSE(WIN32)
 INSTALL(TARGETS ${PROJECT_NAME}
   EXPORT fourierconvolutioncudalib-targets 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 64ee942..6f7e25e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,28 +1,13 @@
 INCLUDE_DIRECTORIES(.)
 
 SET(Boost_USE_MULTITHREADED ON)
+SET(Boost_USE_STATIC_LIBS ON)
 FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem unit_test_framework REQUIRED)
 IF(Boost_FOUND)
 INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS})
 LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) 
 ENDIF()
 
-#FIXME: OS dependent
-IF(DEFINED ${BOOST_ROOT} AND ${BOOST_ROOT} MATCHES ".*/.*")
-  SET(BOOST_LIB_DIR ${BOOST_ROOT}/lib)
-ELSE()
-  SET(BOOST_LIB_DIR ${Boost_LIBRARY_DIRS})
-ENDIF()
-
-add_library( ext_boost_system  SHARED IMPORTED )
-set_target_properties( ext_boost_system PROPERTIES IMPORTED_LOCATION ${BOOST_LIB_DIR}/libboost_system.so )
-
-add_library( ext_boost_filesystem  SHARED IMPORTED )
-set_target_properties( ext_boost_filesystem PROPERTIES IMPORTED_LOCATION ${BOOST_LIB_DIR}/libboost_filesystem.so )
-
-add_library( ext_boost_unit_test_framework  SHARED IMPORTED )
-set_target_properties( ext_boost_unit_test_framework PROPERTIES IMPORTED_LOCATION ${BOOST_LIB_DIR}/libboost_unit_test_framework.so )
-
 
 IF(CUDA_FOUND)
 
@@ -30,8 +15,20 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-I${PROJECT_SOURCE_DIR}/src)
 
 
 CUDA_ADD_EXECUTABLE(test_gpu_convolve test_gpu_convolve.cpp image_stack_utils.cpp)
-target_link_libraries(test_gpu_convolve ext_boost_system ext_boost_filesystem ext_boost_unit_test_framework ${PROJECT_NAME})
+
+IF(WIN32)
+MESSAGE("[Win] compiler is ${CMAKE_CXX_COMPILER_ID}")
+MESSAGE("[Win] using Boost libraries ${Boost_LIBRARIES}")
+IF("${CMAKE_CXX_COMPILER_ID}" MATCHES MSVC)
+target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
+set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "/I${PROJECT_SOURCE_DIR}/src")
+ELSE()
+target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME})
+ENDIF()
+ELSE()
+target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME})
 set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-I${PROJECT_SOURCE_DIR}/src")
+ENDIF()
 
 ELSE(CUDA_FOUND)
 MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!")
diff --git a/tests/test_gpu_convolve.cpp b/tests/test_gpu_convolve.cpp
index 7fa3a12..3ff67e8 100644
--- a/tests/test_gpu_convolve.cpp
+++ b/tests/test_gpu_convolve.cpp
@@ -1,4 +1,3 @@
-#define BOOST_TEST_DYN_LINK
 #define BOOST_TEST_MODULE GPU_CONVOLUTION
 #include "boost/test/unit_test.hpp"
 #include "test_fixtures.hpp"

From 9afae3339586b3e850add6314bce96be64996474 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 5 May 2015 14:48:42 +0200
Subject: [PATCH 14/24] setup cmake build so that it works on Windows 7

---
 CMakeLists.txt       | 17 +++++++++++++++++
 src/CMakeLists.txt   | 41 +++++++++++++++--------------------------
 tests/CMakeLists.txt | 10 +++++-----
 3 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aea004c..5cd82ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,6 +30,12 @@ SET (FOURIERCONVOLUTIONCUDALIB_ID "${FOURIERCONVOLUTIONCUDALIB_VENDOR_ID}.${PROJ
 # trying to setup paths so this package can be picked up
 set(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries")
 set(INSTALL_INCLUDE_DIR include CACHE PATH "Installation directory for header files")
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING
+      "Choose the type of build, options are: Debug Release
+RelWithDebInfo MinSizeRel."
+      FORCE)
+endif(NOT CMAKE_BUILD_TYPE)
 
 # shared path is architecture independent for now, TODO extend this to lib/bin/include
 IF(UNIX)
@@ -102,6 +108,17 @@ IF(INCLUDE_CUDA)
     ELSE(CUDA_FOUND)
         MESSAGE(FATAL_ERROR ">> CUDA not found. Exiting ...")
     ENDIF(CUDA_FOUND)
+	
+	IF(WIN32)
+		IF(${CUDA_VERSION} VERSION_GREATER "7" OR ${CUDA_VERSION} VERSION_EQUAL "7")
+			IF(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
+				message(FATAL_ERROR ">> CUDA version 7 (found version: ${CUDA_VERSION}) or higher does not support 32bit builds on Windows (found CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})")
+			ELSE()
+				message("[Windows] 64bit build detected, ${CMAKE_SIZEOF_VOID_P}")
+			ENDIF()
+		
+		ENDIF()
+	ENDIF(WIN32)
 ENDIF(INCLUDE_CUDA)
  
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0a354bb..2337158 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,44 +5,33 @@ INCLUDE_DIRECTORIES(.)
 # build and link
 FIND_PACKAGE(CUDA QUIET)
 CUDA_ADD_LIBRARY(${PROJECT_NAME} convolution3Dfft.cu standardCUDAfunctions.cu SHARED)
+CUDA_ADD_LIBRARY(${PROJECT_NAME}_static convolution3Dfft.cu standardCUDAfunctions.cu)
 CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME})
+CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME}_static)
+
 IF(WIN32)
-TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
+IF(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
 include (GenerateExportHeader)
 GENERATE_EXPORT_HEADER( ${PROJECT_NAME}
-             BASE_NAME ${PROJECT_NAME}
-             EXPORT_MACRO_NAME ${PROJECT_NAME}_EXPORT
-             EXPORT_FILE_NAME ${PROJECT_NAME}_Export.h
-             STATIC_DEFINE ${PROJECT_NAME}_BUILT_AS_STATIC
+			BASE_NAME ${PROJECT_NAME}
+            EXPORT_MACRO_NAME ${PROJECT_NAME}_EXPORT
+            EXPORT_FILE_NAME ${PROJECT_NAME}_Export.h
+            STATIC_DEFINE ${PROJECT_NAME}_BUILT_AS_STATIC
 )
-CUDA_ADD_LIBRARY(${PROJECT_NAME}_static convolution3Dfft.cu standardCUDAfunctions.cu)
-CUDA_ADD_CUFFT_TO_TARGET(${PROJECT_NAME}_static)
-TARGET_LINK_LIBRARIES(${PROJECT_NAME}_static ${CUDA_CUDA_LIBRARY})
-ELSE()
-
-TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
-
+ENDIF()
 ENDIF()
 
+TARGET_LINK_LIBRARIES(${PROJECT_NAME}_static ${CUDA_CUDA_LIBRARY})
+TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
 
 SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
-IF(WIN32)
 SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
+
 INSTALL(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static
   EXPORT fourierconvolutioncudalib-targets 
-  LIBRARY DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
-  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
-  PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" #COMPONENT inc
+  LIBRARY DESTINATION lib
+  ARCHIVE DESTINATION lib
+  PUBLIC_HEADER DESTINATION include
   )
-  
-ELSE(WIN32)
-INSTALL(TARGETS ${PROJECT_NAME}
-  EXPORT fourierconvolutioncudalib-targets 
-  LIBRARY DESTINATION "${INSTALL_LIB_DIR}" #COMPONENT shlib
-  PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" #COMPONENT inc
-  )
-ENDIF(WIN32)
-
-
 
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6f7e25e..15227f8 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,7 +10,7 @@ ENDIF()
 
 
 IF(CUDA_FOUND)
-
+INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-I${PROJECT_SOURCE_DIR}/src)
 
 
@@ -21,13 +21,13 @@ MESSAGE("[Win] compiler is ${CMAKE_CXX_COMPILER_ID}")
 MESSAGE("[Win] using Boost libraries ${Boost_LIBRARIES}")
 IF("${CMAKE_CXX_COMPILER_ID}" MATCHES MSVC)
 target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
-set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "/I${PROJECT_SOURCE_DIR}/src")
+#set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "/I${PROJECT_SOURCE_DIR}/src")
 ELSE()
-target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME})
+target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
 ENDIF()
 ELSE()
-target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME})
-set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-I${PROJECT_SOURCE_DIR}/src")
+target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
+#set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-I${PROJECT_SOURCE_DIR}/src")
 ENDIF()
 
 ELSE(CUDA_FOUND)

From 22b542b9a2377f4cc7c338d055d16e676bf967c5 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 5 May 2015 14:49:32 +0200
Subject: [PATCH 15/24] added VS 64bit support and safe-guard against 32bit
 builds with cuda 7 or higher

---
 CMakeLists.txt |  2 +-
 README.md      | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5cd82ff..0e861c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,7 +112,7 @@ IF(INCLUDE_CUDA)
 	IF(WIN32)
 		IF(${CUDA_VERSION} VERSION_GREATER "7" OR ${CUDA_VERSION} VERSION_EQUAL "7")
 			IF(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
-				message(FATAL_ERROR ">> CUDA version 7 (found version: ${CUDA_VERSION}) or higher does not support 32bit builds on Windows (found CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})")
+				message(FATAL_ERROR ">> CUDA version 7 or higher does not support 32bit builds on Windows (found CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})")
 			ELSE()
 				message("[Windows] 64bit build detected, ${CMAKE_SIZEOF_VOID_P}")
 			ENDIF()
diff --git a/README.md b/README.md
index 4e97960..c7c2424 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,15 @@ ${COMMAND} ${FLAGS} -lcufft -lcuda ${OUTPUT_FLAG} ${OUTPUT_PREFIX} ${OUTPUT} ${I
 
 Now build the .so/.dll library and put it into the Fiji directory.
 
-This build does not support unit tests.
+
+```bash
+$ cd 
+# build including tests
+$ cmake.exe -G "Visual Studio 12 2013 Win64" -DCMAKE_INSTALL_PREFIX=C:/msys64/home/steinbac/ftmp -DBOOST_LIBRARYDIR=C:/boost_1_58_0/msvc-12-x86_64/lib64-msvc-12.0 -DBOOST_ROOT=C:/boost_1_58_0/msvc-12-x86_64 ..
+$ cmake --build . --target ALL_BUILD
+$ ctest.exe
+$ cmake --build . --target install
+```
 
 OSX/Linux Build Instructions
 ----------------------------

From 7d00b4e913173427c6ece289fdadd6c6480bd0b3 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 5 May 2015 14:51:35 +0200
Subject: [PATCH 16/24] added cmake autogenerated file, swapped preprocessor
 macros

---
 src/FourierConvolutionCUDALib_Export.h | 41 ++++++++++++++++++++++++++
 src/convolution3Dfft.h                 |  9 ++++--
 src/standardCUDAfunctions.h            | 10 ++++---
 3 files changed, 54 insertions(+), 6 deletions(-)
 create mode 100644 src/FourierConvolutionCUDALib_Export.h

diff --git a/src/FourierConvolutionCUDALib_Export.h b/src/FourierConvolutionCUDALib_Export.h
new file mode 100644
index 0000000..f8f1e38
--- /dev/null
+++ b/src/FourierConvolutionCUDALib_Export.h
@@ -0,0 +1,41 @@
+
+#ifndef FourierConvolutionCUDALib_EXPORT_H
+#define FourierConvolutionCUDALib_EXPORT_H
+
+#ifdef FourierConvolutionCUDALib_BUILT_AS_STATIC
+#  define FourierConvolutionCUDALib_EXPORT
+#  define FOURIERCONVOLUTIONCUDALIB_NO_EXPORT
+#else
+#  ifndef FourierConvolutionCUDALib_EXPORT
+#    ifdef FourierConvolutionCUDALib_EXPORTS
+        /* We are building this library */
+#      define FourierConvolutionCUDALib_EXPORT __declspec(dllexport)
+#    else
+        /* We are using this library */
+#      define FourierConvolutionCUDALib_EXPORT __declspec(dllimport)
+#    endif
+#  endif
+
+#  ifndef FOURIERCONVOLUTIONCUDALIB_NO_EXPORT
+#    define FOURIERCONVOLUTIONCUDALIB_NO_EXPORT 
+#  endif
+#endif
+
+#ifndef FOURIERCONVOLUTIONCUDALIB_DEPRECATED
+#  define FOURIERCONVOLUTIONCUDALIB_DEPRECATED __declspec(deprecated)
+#endif
+
+#ifndef FOURIERCONVOLUTIONCUDALIB_DEPRECATED_EXPORT
+#  define FOURIERCONVOLUTIONCUDALIB_DEPRECATED_EXPORT FourierConvolutionCUDALib_EXPORT FOURIERCONVOLUTIONCUDALIB_DEPRECATED
+#endif
+
+#ifndef FOURIERCONVOLUTIONCUDALIB_DEPRECATED_NO_EXPORT
+#  define FOURIERCONVOLUTIONCUDALIB_DEPRECATED_NO_EXPORT FOURIERCONVOLUTIONCUDALIB_NO_EXPORT FOURIERCONVOLUTIONCUDALIB_DEPRECATED
+#endif
+
+#define DEFINE_NO_DEPRECATED 0
+#if DEFINE_NO_DEPRECATED
+# define FOURIERCONVOLUTIONCUDALIB_NO_DEPRECATED
+#endif
+
+#endif
diff --git a/src/convolution3Dfft.h b/src/convolution3Dfft.h
index 95c161a..4815bd3 100644
--- a/src/convolution3Dfft.h
+++ b/src/convolution3Dfft.h
@@ -4,9 +4,14 @@
 #include "standardCUDAfunctions.h"
 
 #ifdef _WIN32
-#define FUNCTION_PREFIX extern "C" __declspec(dllexport)
+//auto-generated macro file for MSVC libraries on Win7
+	#include "convolution3Dfft_Export.h"
+	#define FUNCTION_PREFIX extern "C" FourierConvolutionCUDALib_EXPORT
+
 #else
-#define FUNCTION_PREFIX extern "C"
+
+	#define FUNCTION_PREFIX extern "C"
+	
 #endif
 
 //define constants
diff --git a/src/standardCUDAfunctions.h b/src/standardCUDAfunctions.h
index f9bde7f..1376a71 100644
--- a/src/standardCUDAfunctions.h
+++ b/src/standardCUDAfunctions.h
@@ -8,10 +8,12 @@
 #ifndef STANDARDCUDAFUNCTIONS_H_
 #define STANDARDCUDAFUNCTIONS_H_
 
-#ifdef _WIN32
-#define FUNCTION_PREFIX extern "C" __declspec(dllexport)
-#else
-#define FUNCTION_PREFIX extern "C"
+#ifdef _WIN32
+//auto-generated macro file for MSVC libraries on Win7
+	#include "convolution3Dfft_Export.h"
+	#define FUNCTION_PREFIX extern "C" FourierConvolutionCUDALib_EXPORT
+#else
+	#define FUNCTION_PREFIX extern "C"
 #endif
 //----------------------------------functions to decide whhich GPU to use-------------------------------
 

From 641d9987bbe1ba877a1897da9b1d8567779f7948 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 5 May 2015 15:05:55 +0200
Subject: [PATCH 17/24] * moved declarations of standardCUDAfunctions.h to
 convolution3Dfft.h\n* include convolution3Dfft.h in standardCUDAfunctions.cu
 to make the definitions available to the library\n* link step fails due to
 wrong export/import of some symbols

---
 src/convolution3Dfft.h       | 11 ++++++++---
 src/standardCUDAfunctions.cu |  3 ++-
 src/standardCUDAfunctions.h  | 27 ---------------------------
 tests/test_gpu_convolve.cpp  |  3 +++
 4 files changed, 13 insertions(+), 31 deletions(-)
 delete mode 100644 src/standardCUDAfunctions.h

diff --git a/src/convolution3Dfft.h b/src/convolution3Dfft.h
index 4815bd3..8be324e 100644
--- a/src/convolution3Dfft.h
+++ b/src/convolution3Dfft.h
@@ -1,11 +1,9 @@
 #ifndef __CONVOLUTION_3D_FFT_H__
 #define __CONVOLUTION_3D_FFT_H__
 
-#include "standardCUDAfunctions.h"
-
 #ifdef _WIN32
 //auto-generated macro file for MSVC libraries on Win7
-	#include "convolution3Dfft_Export.h"
+	#include "FourierConvolutionCUDALib_Export.h"
 	#define FUNCTION_PREFIX extern "C" FourierConvolutionCUDALib_EXPORT
 
 #else
@@ -57,4 +55,11 @@ WARNING: If imSize > 5*kernelSize the maximum relative error in the convolution
 */
 //FUNCTION_PREFIX void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
 
+FUNCTION_PREFIX int selectDeviceWithHighestComputeCapability();
+FUNCTION_PREFIX int getCUDAcomputeCapabilityMinorVersion(int devCUDA);
+FUNCTION_PREFIX int getCUDAcomputeCapabilityMajorVersion(int devCUDA);
+FUNCTION_PREFIX int getNumDevicesCUDA();
+FUNCTION_PREFIX void getNameDeviceCUDA(int devCUDA, char *name);
+FUNCTION_PREFIX long long int getMemDeviceCUDA(int devCUDA);
+
 #endif //__CONVOLUTION_3D_FFT_H__
diff --git a/src/standardCUDAfunctions.cu b/src/standardCUDAfunctions.cu
index 3d4b0f7..e5c6f1f 100644
--- a/src/standardCUDAfunctions.cu
+++ b/src/standardCUDAfunctions.cu
@@ -6,7 +6,8 @@
  */
 #include "book.h"
 #include "cuda.h"
-#include "standardCUDAfunctions.h"
+
+#include "convolution3Dfft.h"
 
 //==============================================
 int selectDeviceWithHighestComputeCapability() {
diff --git a/src/standardCUDAfunctions.h b/src/standardCUDAfunctions.h
deleted file mode 100644
index 1376a71..0000000
--- a/src/standardCUDAfunctions.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * standardCUDAfunctions.h
- *
- *  Created on: Jul 24, 2014
- *      Author: preibisch
- */
-
-#ifndef STANDARDCUDAFUNCTIONS_H_
-#define STANDARDCUDAFUNCTIONS_H_
-
-#ifdef _WIN32
-//auto-generated macro file for MSVC libraries on Win7
-	#include "convolution3Dfft_Export.h"
-	#define FUNCTION_PREFIX extern "C" FourierConvolutionCUDALib_EXPORT
-#else
-	#define FUNCTION_PREFIX extern "C"
-#endif
-//----------------------------------functions to decide whhich GPU to use-------------------------------
-
-FUNCTION_PREFIX int selectDeviceWithHighestComputeCapability();
-FUNCTION_PREFIX int getCUDAcomputeCapabilityMinorVersion(int devCUDA);
-FUNCTION_PREFIX int getCUDAcomputeCapabilityMajorVersion(int devCUDA);
-FUNCTION_PREFIX int getNumDevicesCUDA();
-FUNCTION_PREFIX void getNameDeviceCUDA(int devCUDA, char *name);
-FUNCTION_PREFIX long long int getMemDeviceCUDA(int devCUDA);
-
-#endif /* STANDARDCUDAFUNCTIONS_H_ */
diff --git a/tests/test_gpu_convolve.cpp b/tests/test_gpu_convolve.cpp
index 3ff67e8..802c939 100644
--- a/tests/test_gpu_convolve.cpp
+++ b/tests/test_gpu_convolve.cpp
@@ -3,8 +3,11 @@
 #include "test_fixtures.hpp"
 #include "padd_utils.h"
 #include <numeric>
+
 #include "convolution3Dfft.h"
 
+
+
 BOOST_FIXTURE_TEST_SUITE(legacy_convolution,
                          multiviewnative::default_3D_fixture)
 

From 21ad39c56541c566fcfb940c2f9057378da261e6 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 5 May 2015 17:10:50 +0200
Subject: [PATCH 18/24] link failure still prevailing, :(

---
 src/CMakeLists.txt          |  8 ++++++--
 tests/CMakeLists.txt        | 16 +---------------
 tests/test_gpu_convolve.cpp |  2 +-
 3 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2337158..8a725ec 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,14 +18,18 @@ GENERATE_EXPORT_HEADER( ${PROJECT_NAME}
             EXPORT_FILE_NAME ${PROJECT_NAME}_Export.h
             STATIC_DEFINE ${PROJECT_NAME}_BUILT_AS_STATIC
 )
+INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR}/src)
 ENDIF()
 ENDIF()
 
 TARGET_LINK_LIBRARIES(${PROJECT_NAME}_static ${CUDA_CUDA_LIBRARY})
 TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
 
-SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
-SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h") 
+SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h;${PROJECT_NAME}_Export.h") 
+SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h;${PROJECT_NAME}_Export.h") 
+
+set_target_properties(${PROJECT_NAME}_static PROPERTIES COMPILE_FLAGS -D${PROJECT_NAME}_BUILT_AS_STATIC)
+
 
 INSTALL(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static
   EXPORT fourierconvolutioncudalib-targets 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 15227f8..1c11319 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -8,27 +8,13 @@ INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS})
 LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) 
 ENDIF()
 
+FIND_PACKAGE(CUDA)
 
 IF(CUDA_FOUND)
 INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-I${PROJECT_SOURCE_DIR}/src)
-
 
 CUDA_ADD_EXECUTABLE(test_gpu_convolve test_gpu_convolve.cpp image_stack_utils.cpp)
-
-IF(WIN32)
-MESSAGE("[Win] compiler is ${CMAKE_CXX_COMPILER_ID}")
-MESSAGE("[Win] using Boost libraries ${Boost_LIBRARIES}")
-IF("${CMAKE_CXX_COMPILER_ID}" MATCHES MSVC)
-target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
-#set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "/I${PROJECT_SOURCE_DIR}/src")
-ELSE()
 target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
-ENDIF()
-ELSE()
-target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
-#set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-I${PROJECT_SOURCE_DIR}/src")
-ENDIF()
 
 ELSE(CUDA_FOUND)
 MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!")
diff --git a/tests/test_gpu_convolve.cpp b/tests/test_gpu_convolve.cpp
index 802c939..26fd485 100644
--- a/tests/test_gpu_convolve.cpp
+++ b/tests/test_gpu_convolve.cpp
@@ -15,7 +15,7 @@ BOOST_AUTO_TEST_CASE(trivial_convolve) {
 
   float* image = image_.data();
   std::vector<float> kernel(kernel_size_,0);
-
+  
   convolution3DfftCUDAInPlace(image, &image_dims_[0], 
 			      &kernel[0], &kernel_dims_[0],
                               selectDeviceWithHighestComputeCapability());

From 14749521250a2b75061cc30ba1d897f01e5c1a56 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Mon, 11 May 2015 16:17:02 +0200
Subject: [PATCH 19/24] tests compile and run under windows with VS

---
 CMakeLists.txt               |    2 +
 src/CMakeLists.txt           |    4 +-
 src/convolution3Dfft.cu      | 1089 +++++++++++++++++-----------------
 src/convolution3Dfft.h       |   11 +-
 src/standardCUDAfunctions.cu |   15 +-
 tests/CMakeLists.txt         |    2 +
 6 files changed, 571 insertions(+), 552 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e861c9..baa5005 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,6 +118,8 @@ IF(INCLUDE_CUDA)
 			ENDIF()
 		
 		ENDIF()
+		SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
+
 	ENDIF(WIN32)
 ENDIF(INCLUDE_CUDA)
  
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8a725ec..d920de2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,6 +18,7 @@ GENERATE_EXPORT_HEADER( ${PROJECT_NAME}
             EXPORT_FILE_NAME ${PROJECT_NAME}_Export.h
             STATIC_DEFINE ${PROJECT_NAME}_BUILT_AS_STATIC
 )
+set_target_properties(${PROJECT_NAME}_static PROPERTIES COMPILE_FLAGS "-D${PROJECT_NAME}_BUILT_AS_STATIC")
 INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR}/src)
 ENDIF()
 ENDIF()
@@ -28,9 +29,6 @@ TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY})
 SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h;${PROJECT_NAME}_Export.h") 
 SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/convolution3Dfft.h;${PROJECT_NAME}_Export.h") 
 
-set_target_properties(${PROJECT_NAME}_static PROPERTIES COMPILE_FLAGS -D${PROJECT_NAME}_BUILT_AS_STATIC)
-
-
 INSTALL(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static
   EXPORT fourierconvolutioncudalib-targets 
   LIBRARY DESTINATION lib
diff --git a/src/convolution3Dfft.cu b/src/convolution3Dfft.cu
index 39b0344..c759b84 100644
--- a/src/convolution3Dfft.cu
+++ b/src/convolution3Dfft.cu
@@ -1,541 +1,548 @@
-#include "convolution3Dfft.h"
-#include "book.h"
-#include "cuda.h"
-#include "cufft.h"
-#include <iostream>
-#include <cmath>
-#include <algorithm>
-
-//__device__ static const float PI_2 = 6.28318530717958620f;
-//__device__ static const float PI_1 =  3.14159265358979310f;
-
-////////////////////////////////////////////////////////////////////////////////
-// Modulate Fourier image of padded data by Fourier image of padded kernel
-// and normalize by FFT size
-////////////////////////////////////////////////////////////////////////////////
-//Adapted from CUDA SDK examples
-__device__ void mulAndScale(cufftComplex& a, const cufftComplex& b, const float& c)
-{
-    cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
-    a = t;
-};
-
-__global__ void __launch_bounds__(MAX_THREADS_CUDA)  modulateAndNormalize_kernel(cufftComplex *d_Dst, cufftComplex *d_Src, long long int dataSize,float c)
-{
-    long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x;
-	long long int offset = (long long int)blockDim.x * (long long int)gridDim.x;
-    while(i < dataSize)
-	{		
-
-		cufftComplex a = d_Src[i];
-		cufftComplex b = d_Dst[i];
-
-		mulAndScale(a, b, c);
-		d_Dst[i] = a;
-
-		i += offset;
-	}
-};
-
-//we use nearest neighbor interpolation to access FFT coefficients in the kernel
-__global__ void __launch_bounds__(MAX_THREADS_CUDA)  modulateAndNormalizeSubsampled_kernel(cufftComplex *d_Dst, cufftComplex *d_Src,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2,long long int datasize,float c)
-{
-
-	float r_0 = ((float)kernelDim_0) / ((float)imDim_0); //ratio between image size and kernel size to calculate access
-	float r_1 = ((float)kernelDim_1) / ((float)imDim_1);
-	float r_2 = ((float)kernelDim_2) / ((float)imDim_2);
-
-    long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x;
-	long long int offset = (long long int)blockDim.x * (long long int)gridDim.x;
-	int k_0,k_1,k_2;
-	int aux;
-	//	float auxExp, auxSin,auxCos;
-    while(i < datasize)
-	{
-		//for each dimension we need to access k_i*r_i  i=0, 1, 2
-		aux = 1 + imDim_2/2;
-		k_2 = i % aux;
-		aux = (i - k_2) / aux;
-		k_1 = aux % imDim_1;
-		k_0 = (aux - k_1) / imDim_1;
-
-		cufftComplex b = d_Dst[i];
-
-		//apply shift in fourier domain since we did not apply fftshift to kernel (so we could use the trick of assuming the kernel is padded with zeros and then just subsample FFT)
-		/* This is how we would do it in Matlab (linear phase change)
-		auxExp = k_0 * r_0;
-		auxExp += k_1 * r_1;
-		auxExp += k_2 * r_2;
-		auxExp *= PI_1;
-		auxSin = sin(auxExp);
-		auxCos = cos(auxExp);
-		auxExp = b.x * auxCos - b.y * auxSin;
-
-		b.y = b.x * auxSin + b.y * auxCos;
-		b.x = auxExp;
-		*/
-
-		//add the ratio to each dimension and apply nearest neighbor interpolation
-		//k_2 = min((int)(r_2*(float)k_2 + 0.5f),kernelDim_2-1);//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access
-		//k_1 = min((int)(r_1*(float)k_1 + 0.5f),kernelDim_1-1);
-		//k_0 = min((int)(r_0*(float)k_0 + 0.5f),kernelDim_0-1);
-		k_2 = ((int)(r_2*(float)k_2 + 0.5f)) % kernelDim_2;//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access
-		k_1 = ((int)(r_1*(float)k_1 + 0.5f)) % kernelDim_1;
-		k_0 = ((int)(r_0*(float)k_0 + 0.5f)) % kernelDim_0;
-		//calculate new coordinate relative to kernel size
-		aux = 1 + kernelDim_2/2;
-		cufftComplex a = d_Src[k_2 + aux *(k_1 + kernelDim_1 * k_0)];
-		
-		if( (k_0 + k_1 + k_2) % 2 == 1 )//after much debugging it seems the phase shift is 0 or Pi (nothing in between). In Matlab is a nice linear change as programmed above
-		{
-			a.x = -a.x;
-			a.y = -a.y;
-		}
-		mulAndScale(a, b, c);
-
-		//__syncthreads();//this actually slows down the code by a lot (0.1 sec for 512x512x512)
-		d_Dst[i] = a;
-
-		i += offset;
-	}
-};
-
-//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
-__global__ void __launch_bounds__(MAX_THREADS_CUDA) fftShiftKernel(imageType* kernelCUDA,imageType* kernelPaddedCUDA,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2)
-{
-	int kernelSize = kernelDim_0 * kernelDim_1 * kernelDim_2;
-
-	int tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-	if(tid<kernelSize)
-	{
-		//find coordinates
-		long long int x,y,z,aux;
-		z = tid % kernelDim_2;
-		aux = (tid - z)/kernelDim_2;
-		y = aux % kernelDim_1;
-		x = (aux - y)/kernelDim_1;
-
-		//center coordinates
-		x -= kernelDim_0/2;
-		y -= kernelDim_1/2;
-		z -= kernelDim_2/2;
-
-		//circular shift if necessary
-		if(x<0) x += imDim_0;
-		if(y<0) y += imDim_1;
-		if(z<0) z += imDim_2;
-
-		//calculate position in padded kernel
-		aux = z + imDim_2 * (y + imDim_1 * x);
-
-		//copy value
-		kernelPaddedCUDA[aux] = kernelCUDA[tid];//for the most part it should be a coalescent access in oth places
-	}
-}
-
-//=====================================================================
-//-------------to debug elements--------------------------------------
-void writeOutCUDAfft(char* filename,imageType* fftCUDA,int* fftCUDAdims)
-{
-	int fftSize = 1;
-	for(int ii=0;ii<dimsImage;ii++)
-	{
-		fftSize *= fftCUDAdims[ii];
-	}
-
-	//copy FFT from CUDA
-	imageType* fftHOST = new imageType[2*fftSize];//complex format
-	HANDLE_ERROR(cudaMemcpy(fftHOST,fftCUDA,2*sizeof(imageType)*fftSize,cudaMemcpyDeviceToHost));
-
-	//calculate module
-	/*
-	int count = 0;
-	for(int ii=0;ii<fftSize;ii++)
-	{
-		fftHOST[ii] = sqrt(fftHOST[count]*fftHOST[count] + fftHOST[count+1]*fftHOST[count+1]);
-		count += 2;
-	}
-	*/
-
-	FILE* fid = fopen(filename,"wb");
-	if(fid == NULL)
-	{
-		printf("ERROR: at writeOutCUDAfft opening file %s\n",filename);
-		exit(2);
-	}else{
-		printf("DEBUGGING: Writing FFT (real part first,imaginary second)  from CUDA of dimensions %d x %d x %d in file %s\n",fftCUDAdims[2],fftCUDAdims[1],fftCUDAdims[0],filename);
-	}
-	//fwrite(fftHOST,sizeof(imageType),fftSize,fid);
-	for(int ii=0;ii<2*fftSize;ii+=2)
-		fwrite(&(fftHOST[ii]),sizeof(imageType),1,fid);
-	for(int ii=1;ii<2*fftSize;ii+=2)
-		fwrite(&(fftHOST[ii]),sizeof(imageType),1,fid);
-
-
-	fclose(fid);
-	delete[] fftHOST;
-}
-
-
-//=====================================================================
-//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
-imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int devCUDA)
-{
-	imageType* convResult = NULL;
-	imageType* imCUDA = NULL;
-	imageType* kernelCUDA = NULL;
-
-
-	cufftHandle fftPlanFwd, fftPlanInv;
-
-	
-	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
-
-	long long int imSize = 1;
-	for(int ii=0;ii<dimsImage;ii++)
-	{
-		imSize *= (long long int) (imDim[ii]);
-	}
-
-	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
-
-	//allocate memory for output result
-	convResult = new imageType[imSize];
-
-	//allocat ememory in GPU
-	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
-	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), imSizeFFT*sizeof(imageType) ) );
-
-
-	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
-	//TODO: pad kernel to image size
-	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
-	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
-
-	//printf("Copying memory (kernel and image) to GPU\n");
-	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-	
-	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
-	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
-	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
-
-	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
-	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
-	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
-	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
-	//transforming image
-	cufftExecR2C(fftPlanFwd, kernelCUDA, (cufftComplex *)kernelCUDA);HANDLE_ERROR_KERNEL;
-	
-
-	//multiply image and kernel in fourier space (and normalize)
-	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
-	int numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex number
-	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
-	modulateAndNormalize_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT
-
-	//inverse FFT 
-	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
-
-	//copy result to host
-	HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
-
-	//release memory
-	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
-    ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
-	HANDLE_ERROR( cudaFree( imCUDA));
-	HANDLE_ERROR( cudaFree( kernelCUDA));
-
-	return convResult;
-}
-
-//=====================================================================
-//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
-//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor 
-imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA)
-{
-	imageType* convResult = NULL;
-	imageType* imCUDA = NULL;
-	imageType* kernelCUDA = NULL;
-	imageType* kernelPaddedCUDA = NULL;
-
-
-	cufftHandle fftPlanFwd, fftPlanInv;
-
-	
-	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
-
-	long long int imSize = 1;
-	long long int kernelSize = 1;
-	for(int ii=0;ii<dimsImage;ii++)
-	{
-		imSize *= (long long int) (imDim[ii]);
-		kernelSize *= (long long int) (kernelDim[ii]);
-	}
-
-	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
-
-	//allocate memory for output result
-	convResult = new imageType[imSize];
-
-	//allocat ememory in GPU
-	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
-	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), (kernelSize)*sizeof(imageType) ) );
-	HANDLE_ERROR( cudaMalloc( (void**)&(kernelPaddedCUDA), imSizeFFT*sizeof(imageType) ) );
-
-
-	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
-	//TODO: pad kernel to image size
-	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
-	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
-
-	//printf("Copying memory (kernel and image) to GPU\n");
-	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, kernelSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-
-	//apply ffshift to kernel and pad it with zeros so we can calculate convolution with FFT
-	HANDLE_ERROR( cudaMemset( kernelPaddedCUDA, 0, imSizeFFT*sizeof(imageType) ));
-	int numThreads=std::min((long long int)MAX_THREADS_CUDA,kernelSize);
-	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(kernelSize+(long long int)(numThreads-1))/((long long int)numThreads));
-	fftShiftKernel<<<numBlocks,numThreads>>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL;
-
-	
-	//make sure GPU finishes before we launch two different streams
-	HANDLE_ERROR(cudaDeviceSynchronize());	
-
-	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
-	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
-	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
-
-	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
-	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
-	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
-	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
-	//transforming image
-	cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL;
-	
-
-	//multiply image and kernel in fourier space (and normalize)
-	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
-	numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers
-	numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
-	modulateAndNormalize_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT
-
-	//inverse FFT 
-	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
-
-	//copy result to host
-	HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
-
-	//release memory
-	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
-    ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
-	HANDLE_ERROR( cudaFree( imCUDA));
-	HANDLE_ERROR( cudaFree( kernelCUDA));
-	HANDLE_ERROR( cudaFree( kernelPaddedCUDA));
-
-	return convResult;
-}
-
-//=====================================================================
-//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
-//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor 
-void convolution3DfftCUDAInPlace(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA)
-{
-	imageType* imCUDA = NULL;
-	imageType* kernelCUDA = NULL;
-	imageType* kernelPaddedCUDA = NULL;
-
-
-	cufftHandle fftPlanFwd, fftPlanInv;
-
-	
-	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
-
-	long long int imSize = 1;
-	long long int kernelSize = 1;
-	for(int ii=0;ii<dimsImage;ii++)
-	{
-		imSize *= (long long int) (imDim[ii]);
-		kernelSize *= (long long int) (kernelDim[ii]);
-	}
-
-	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
-
-	//allocat ememory in GPU
-	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
-	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), (kernelSize)*sizeof(imageType) ) );
-	HANDLE_ERROR( cudaMalloc( (void**)&(kernelPaddedCUDA), imSizeFFT*sizeof(imageType) ) );
-
-	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
-	//TODO: pad kernel to image size
-	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
-	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
-
-	//printf("Copying memory (kernel and image) to GPU\n");
-	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, kernelSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-
-
-	//apply ffshift to kernel and pad it with zeros so we can calculate convolution with FFT
-	HANDLE_ERROR( cudaMemset( kernelPaddedCUDA, 0, imSizeFFT*sizeof(imageType) ));
-	int numThreads=std::min((long long int)MAX_THREADS_CUDA,kernelSize);
-	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(kernelSize+(long long int)(numThreads-1))/((long long int)numThreads));
-	fftShiftKernel<<<numBlocks,numThreads>>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL;
-
-	
-	//make sure GPU finishes 
-	HANDLE_ERROR(cudaDeviceSynchronize());	
-
-	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
-	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
-	//cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;//we wait to conserve memory
-	//cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
-
-	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
-	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
-	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
-	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
-	//transforming image
-	cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL;
-
-	//int fftCUDAdims[dimsImage] ={imDim[0],imDim[1],1+imDim[2]/2};
-	//writeOutCUDAfft("E:/temp/fftCUDAgood.bin",kernelPaddedCUDA,fftCUDAdims);
-
-	//multiply image and kernel in fourier space (and normalize)
-	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
-	numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers
-	numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
-	modulateAndNormalize_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL;//last parameter is the size of the FFT
-
-
-	//we destroy memory first so we can perform larger block convoutions
-	( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
-	HANDLE_ERROR( cudaFree( kernelCUDA));
-	HANDLE_ERROR( cudaFree( kernelPaddedCUDA));
-
-	//inverse FFT 
-	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
-	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
-	
-
-	//copy result to host and overwrite image
-	HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
-
-	//release memory
-	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
-    //( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
-	HANDLE_ERROR( cudaFree( imCUDA));
-	//HANDLE_ERROR( cudaFree( kernelCUDA));
-	//HANDLE_ERROR( cudaFree( kernelPaddedCUDA));
-
-}
-
-
-//=====================================================================
-//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
-//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor 
-/*
-void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA)
-{
-	imageType* imCUDA = NULL;
-	imageType* kernelCUDA = NULL;
-	//imageType* kernelFFTCUDA = NULL; //I need a duplicate in order to perform FFTshift (I can not do it in place)
-
-	cufftHandle fftPlanFwd, fftPlanInv;
-
-	
-	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
-
-	long long int imSize = 1;
-	long long int kernelSize = 1;
-	for(int ii=0;ii<dimsImage;ii++)
-	{
-		imSize *= (long long int) (imDim[ii]);
-		kernelSize *= (long long int) (kernelDim[ii]);
-	}
-
-	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
-	long long int kernelSizeFFT = kernelSize + (long long int)(2*kernelDim[0]*kernelDim[1]);
-
-	
-
-	//allocat ememory in GPU
-	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
-	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), (kernelSizeFFT)*sizeof(imageType) ) );
-	//HANDLE_ERROR( cudaMalloc( (void**)&(kernelFFTCUDA), (kernelSizeFFT)*sizeof(imageType) ) );
-
-	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
-	//TODO: pad kernel to image size
-	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
-	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
-
-	//printf("Copying memory (kernel and image) to GPU\n");
-	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, kernelSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
-
-
-	//apply ffshift to kernel and pad it with zeros so we can calculate convolution with FFT
-	//HANDLE_ERROR( cudaMemset( kernelFFTCUDA, 0, kernelSizeFFT*sizeof(imageType) ));
-	//int numThreads=std::min((long long int)MAX_THREADS_CUDA,kernelSize);
-	//int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(kernelSize+(long long int)(numThreads-1))/((long long int)numThreads));
-	//fftShiftKernel<<<numBlocks,numThreads>>>(kernelCUDA,kernelFFTCUDA,kernelDim[0],kernelDim[1],kernelDim[2],kernelDim[0],kernelDim[1],kernelDim[2]);HANDLE_ERROR_KERNEL;
-
-	
-	//make sure GPU finishes 
-	HANDLE_ERROR(cudaDeviceSynchronize());	
-
-	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
-	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
-	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
-	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
-	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
-	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
-
-
-	//transforming kernel
-	cufftDestroy(fftPlanFwd); HANDLE_ERROR_KERNEL;
-	cufftPlan3d(&fftPlanFwd, kernelDim[0], kernelDim[1], kernelDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
-	cufftExecR2C(fftPlanFwd, kernelCUDA, (cufftComplex *)kernelCUDA);HANDLE_ERROR_KERNEL;
-
-
-	//int fftCUDAdims[dimsImage] ={kernelDim[0],kernelDim[1],1+kernelDim[2]/2};
-	//writeOutCUDAfft("E:/temp/fftCUDAbad.bin",kernelCUDA,fftCUDAdims);
-
-	//multiply image and kernel in fourier space (and normalize)
-	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
-	int numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers
-	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
-	//we multiply two FFT of different sizes (but one was just padded with zeros)
-	modulateAndNormalizeSubsampled_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2], imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL;
-
-
-	//we destroy memory first so we can perform larger block convoutions
-	cufftDestroy(fftPlanFwd) ;HANDLE_ERROR_KERNEL;
-	HANDLE_ERROR( cudaFree( kernelCUDA));
-	//HANDLE_ERROR( cudaFree( kernelFFTCUDA));
-
-	//inverse FFT 
-	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
-	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
-	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
-	
-
-	//copy result to host and overwrite image
-	HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
-
-	//release memory
-	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
-	HANDLE_ERROR( cudaFree( imCUDA));
-	
-}
-*/
+#include "convolution3Dfft.h"
+#include "book.h"
+#include "cuda.h"
+#include "cufft.h"
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+
+//__device__ static const float PI_2 = 6.28318530717958620f;
+//__device__ static const float PI_1 =  3.14159265358979310f;
+
+////////////////////////////////////////////////////////////////////////////////
+// Modulate Fourier image of padded data by Fourier image of padded kernel
+// and normalize by FFT size
+////////////////////////////////////////////////////////////////////////////////
+//Adapted from CUDA SDK examples
+__device__ void mulAndScale(cufftComplex& a, const cufftComplex& b, const float& c)
+{
+    cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
+    a = t;
+};
+
+__global__ void __launch_bounds__(MAX_THREADS_CUDA)  modulateAndNormalize_kernel(cufftComplex *d_Dst, cufftComplex *d_Src, long long int dataSize,float c)
+{
+    long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x;
+	long long int offset = (long long int)blockDim.x * (long long int)gridDim.x;
+    while(i < dataSize)
+	{		
+
+		cufftComplex a = d_Src[i];
+		cufftComplex b = d_Dst[i];
+
+		mulAndScale(a, b, c);
+		d_Dst[i] = a;
+
+		i += offset;
+	}
+};
+
+//we use nearest neighbor interpolation to access FFT coefficients in the kernel
+__global__ void __launch_bounds__(MAX_THREADS_CUDA)  modulateAndNormalizeSubsampled_kernel(cufftComplex *d_Dst, cufftComplex *d_Src,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2,long long int datasize,float c)
+{
+
+	float r_0 = ((float)kernelDim_0) / ((float)imDim_0); //ratio between image size and kernel size to calculate access
+	float r_1 = ((float)kernelDim_1) / ((float)imDim_1);
+	float r_2 = ((float)kernelDim_2) / ((float)imDim_2);
+
+    long long int i = (long long int)blockDim.x * (long long int)blockIdx.x + (long long int)threadIdx.x;
+	long long int offset = (long long int)blockDim.x * (long long int)gridDim.x;
+	int k_0,k_1,k_2;
+	int aux;
+	//	float auxExp, auxSin,auxCos;
+    while(i < datasize)
+	{
+		//for each dimension we need to access k_i*r_i  i=0, 1, 2
+		aux = 1 + imDim_2/2;
+		k_2 = i % aux;
+		aux = (i - k_2) / aux;
+		k_1 = aux % imDim_1;
+		k_0 = (aux - k_1) / imDim_1;
+
+		cufftComplex b = d_Dst[i];
+
+		//apply shift in fourier domain since we did not apply fftshift to kernel (so we could use the trick of assuming the kernel is padded with zeros and then just subsample FFT)
+		/* This is how we would do it in Matlab (linear phase change)
+		auxExp = k_0 * r_0;
+		auxExp += k_1 * r_1;
+		auxExp += k_2 * r_2;
+		auxExp *= PI_1;
+		auxSin = sin(auxExp);
+		auxCos = cos(auxExp);
+		auxExp = b.x * auxCos - b.y * auxSin;
+
+		b.y = b.x * auxSin + b.y * auxCos;
+		b.x = auxExp;
+		*/
+
+		//add the ratio to each dimension and apply nearest neighbor interpolation
+		//k_2 = min((int)(r_2*(float)k_2 + 0.5f),kernelDim_2-1);//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access
+		//k_1 = min((int)(r_1*(float)k_1 + 0.5f),kernelDim_1-1);
+		//k_0 = min((int)(r_0*(float)k_0 + 0.5f),kernelDim_0-1);
+		k_2 = ((int)(r_2*(float)k_2 + 0.5f)) % kernelDim_2;//the very end points need to be interpolated as "ceiling" instead of round or we can get oout of bounds access
+		k_1 = ((int)(r_1*(float)k_1 + 0.5f)) % kernelDim_1;
+		k_0 = ((int)(r_0*(float)k_0 + 0.5f)) % kernelDim_0;
+		//calculate new coordinate relative to kernel size
+		aux = 1 + kernelDim_2/2;
+		cufftComplex a = d_Src[k_2 + aux *(k_1 + kernelDim_1 * k_0)];
+		
+		if( (k_0 + k_1 + k_2) % 2 == 1 )//after much debugging it seems the phase shift is 0 or Pi (nothing in between). In Matlab is a nice linear change as programmed above
+		{
+			a.x = -a.x;
+			a.y = -a.y;
+		}
+		mulAndScale(a, b, c);
+
+		//__syncthreads();//this actually slows down the code by a lot (0.1 sec for 512x512x512)
+		d_Dst[i] = a;
+
+		i += offset;
+	}
+};
+
+//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
+__global__ void __launch_bounds__(MAX_THREADS_CUDA) fftShiftKernel(imageType* kernelCUDA,imageType* kernelPaddedCUDA,int kernelDim_0,int kernelDim_1,int kernelDim_2,int imDim_0,int imDim_1,int imDim_2)
+{
+	int kernelSize = kernelDim_0 * kernelDim_1 * kernelDim_2;
+
+	int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if(tid<kernelSize)
+	{
+		//find coordinates
+		long long int x,y,z,aux;
+		z = tid % kernelDim_2;
+		aux = (tid - z)/kernelDim_2;
+		y = aux % kernelDim_1;
+		x = (aux - y)/kernelDim_1;
+
+		//center coordinates
+		x -= kernelDim_0/2;
+		y -= kernelDim_1/2;
+		z -= kernelDim_2/2;
+
+		//circular shift if necessary
+		if(x<0) x += imDim_0;
+		if(y<0) y += imDim_1;
+		if(z<0) z += imDim_2;
+
+		//calculate position in padded kernel
+		aux = z + imDim_2 * (y + imDim_1 * x);
+
+		//copy value
+		kernelPaddedCUDA[aux] = kernelCUDA[tid];//for the most part it should be a coalescent access in oth places
+	}
+}
+
+//=====================================================================
+//-------------to debug elements--------------------------------------
+void writeOutCUDAfft(char* filename,imageType* fftCUDA,int* fftCUDAdims)
+{
+	int fftSize = 1;
+	for(int ii=0;ii<dimsImage;ii++)
+	{
+		fftSize *= fftCUDAdims[ii];
+	}
+
+	//copy FFT from CUDA
+	imageType* fftHOST = new imageType[2*fftSize];//complex format
+	HANDLE_ERROR(cudaMemcpy(fftHOST,fftCUDA,2*sizeof(imageType)*fftSize,cudaMemcpyDeviceToHost));
+
+	//calculate module
+	/*
+	int count = 0;
+	for(int ii=0;ii<fftSize;ii++)
+	{
+		fftHOST[ii] = sqrt(fftHOST[count]*fftHOST[count] + fftHOST[count+1]*fftHOST[count+1]);
+		count += 2;
+	}
+	*/
+
+	FILE* fid = fopen(filename,"wb");
+	if(fid == NULL)
+	{
+		printf("ERROR: at writeOutCUDAfft opening file %s\n",filename);
+		exit(2);
+	}else{
+		printf("DEBUGGING: Writing FFT (real part first,imaginary second)  from CUDA of dimensions %d x %d x %d in file %s\n",fftCUDAdims[2],fftCUDAdims[1],fftCUDAdims[0],filename);
+	}
+	//fwrite(fftHOST,sizeof(imageType),fftSize,fid);
+	for(int ii=0;ii<2*fftSize;ii+=2)
+		fwrite(&(fftHOST[ii]),sizeof(imageType),1,fid);
+	for(int ii=1;ii<2*fftSize;ii+=2)
+		fwrite(&(fftHOST[ii]),sizeof(imageType),1,fid);
+
+
+	fclose(fid);
+	delete[] fftHOST;
+}
+
+
+//=====================================================================
+//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
+imageType* convolution3DfftCUDA_test(imageType* im,
+													 int* imDim,
+													 imageType* kernel,
+													 int devCUDA)
+{
+	imageType* convResult = NULL;
+	imageType* imCUDA = NULL;
+	imageType* kernelCUDA = NULL;
+
+
+	cufftHandle fftPlanFwd, fftPlanInv;
+
+	
+	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
+
+	long long int imSize = 1;
+	for(int ii=0;ii<dimsImage;ii++)
+	{
+		imSize *= (long long int) (imDim[ii]);
+	}
+
+	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
+
+	//allocate memory for output result
+	convResult = new imageType[imSize];
+
+	//allocat ememory in GPU
+	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
+	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), imSizeFFT*sizeof(imageType) ) );
+
+
+	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
+	//TODO: pad kernel to image size
+	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
+	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
+
+	//printf("Copying memory (kernel and image) to GPU\n");
+	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+	
+	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
+	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
+	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
+
+	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
+	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
+	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
+	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
+	//transforming image
+	cufftExecR2C(fftPlanFwd, kernelCUDA, (cufftComplex *)kernelCUDA);HANDLE_ERROR_KERNEL;
+	
+
+	//multiply image and kernel in fourier space (and normalize)
+	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
+	int numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex number
+	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
+	modulateAndNormalize_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT
+
+	//inverse FFT 
+	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
+
+	//copy result to host
+	HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
+
+	//release memory
+	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
+    ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
+	HANDLE_ERROR( cudaFree( imCUDA));
+	HANDLE_ERROR( cudaFree( kernelCUDA));
+
+	return convResult;
+}
+
+//=====================================================================
+//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
+//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor 
+ imageType* convolution3DfftCUDA(imageType* im,
+													 int* imDim,
+													 imageType* kernel,
+													 int* kernelDim,
+													 int devCUDA)
+{
+	imageType* convResult = NULL;
+	imageType* imCUDA = NULL;
+	imageType* kernelCUDA = NULL;
+	imageType* kernelPaddedCUDA = NULL;
+
+
+	cufftHandle fftPlanFwd, fftPlanInv;
+
+	
+	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
+
+	long long int imSize = 1;
+	long long int kernelSize = 1;
+	for(int ii=0;ii<dimsImage;ii++)
+	{
+		imSize *= (long long int) (imDim[ii]);
+		kernelSize *= (long long int) (kernelDim[ii]);
+	}
+
+	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
+
+	//allocate memory for output result
+	convResult = new imageType[imSize];
+
+	//allocat ememory in GPU
+	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
+	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), (kernelSize)*sizeof(imageType) ) );
+	HANDLE_ERROR( cudaMalloc( (void**)&(kernelPaddedCUDA), imSizeFFT*sizeof(imageType) ) );
+
+
+	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
+	//TODO: pad kernel to image size
+	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
+	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
+
+	//printf("Copying memory (kernel and image) to GPU\n");
+	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, kernelSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+
+	//apply ffshift to kernel and pad it with zeros so we can calculate convolution with FFT
+	HANDLE_ERROR( cudaMemset( kernelPaddedCUDA, 0, imSizeFFT*sizeof(imageType) ));
+	int numThreads=std::min((long long int)MAX_THREADS_CUDA,kernelSize);
+	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(kernelSize+(long long int)(numThreads-1))/((long long int)numThreads));
+	fftShiftKernel<<<numBlocks,numThreads>>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL;
+
+	
+	//make sure GPU finishes before we launch two different streams
+	HANDLE_ERROR(cudaDeviceSynchronize());	
+
+	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
+	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
+	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
+
+	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
+	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
+	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
+	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
+	//transforming image
+	cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL;
+	
+
+	//multiply image and kernel in fourier space (and normalize)
+	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
+	numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers
+	numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
+	modulateAndNormalize_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));//last parameter is the size of the FFT
+
+	//inverse FFT 
+	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
+
+	//copy result to host
+	HANDLE_ERROR(cudaMemcpy(convResult,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
+
+	//release memory
+	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
+    ( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
+	HANDLE_ERROR( cudaFree( imCUDA));
+	HANDLE_ERROR( cudaFree( kernelCUDA));
+	HANDLE_ERROR( cudaFree( kernelPaddedCUDA));
+
+	return convResult;
+}
+
+//=====================================================================
+//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
+//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor 
+ void convolution3DfftCUDAInPlace(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA)
+{
+	imageType* imCUDA = NULL;
+	imageType* kernelCUDA = NULL;
+	imageType* kernelPaddedCUDA = NULL;
+
+
+	cufftHandle fftPlanFwd, fftPlanInv;
+
+	
+	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
+
+	long long int imSize = 1;
+	long long int kernelSize = 1;
+	for(int ii=0;ii<dimsImage;ii++)
+	{
+		imSize *= (long long int) (imDim[ii]);
+		kernelSize *= (long long int) (kernelDim[ii]);
+	}
+
+	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
+
+	//allocat ememory in GPU
+	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
+	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), (kernelSize)*sizeof(imageType) ) );
+	HANDLE_ERROR( cudaMalloc( (void**)&(kernelPaddedCUDA), imSizeFFT*sizeof(imageType) ) );
+
+	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
+	//TODO: pad kernel to image size
+	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
+	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
+
+	//printf("Copying memory (kernel and image) to GPU\n");
+	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, kernelSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+
+
+	//apply ffshift to kernel and pad it with zeros so we can calculate convolution with FFT
+	HANDLE_ERROR( cudaMemset( kernelPaddedCUDA, 0, imSizeFFT*sizeof(imageType) ));
+	int numThreads=std::min((long long int)MAX_THREADS_CUDA,kernelSize);
+	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(kernelSize+(long long int)(numThreads-1))/((long long int)numThreads));
+	fftShiftKernel<<<numBlocks,numThreads>>>(kernelCUDA,kernelPaddedCUDA,kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2]);HANDLE_ERROR_KERNEL;
+
+	
+	//make sure GPU finishes 
+	HANDLE_ERROR(cudaDeviceSynchronize());	
+
+	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
+	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
+	//cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;//we wait to conserve memory
+	//cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
+
+	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
+	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
+	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
+	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
+	//transforming image
+	cufftExecR2C(fftPlanFwd, kernelPaddedCUDA, (cufftComplex *)kernelPaddedCUDA);HANDLE_ERROR_KERNEL;
+
+	//int fftCUDAdims[dimsImage] ={imDim[0],imDim[1],1+imDim[2]/2};
+	//writeOutCUDAfft("E:/temp/fftCUDAgood.bin",kernelPaddedCUDA,fftCUDAdims);
+
+	//multiply image and kernel in fourier space (and normalize)
+	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
+	numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers
+	numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
+	modulateAndNormalize_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelPaddedCUDA, imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL;//last parameter is the size of the FFT
+
+
+	//we destroy memory first so we can perform larger block convoutions
+	( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
+	HANDLE_ERROR( cudaFree( kernelCUDA));
+	HANDLE_ERROR( cudaFree( kernelPaddedCUDA));
+
+	//inverse FFT 
+	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
+	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
+	
+
+	//copy result to host and overwrite image
+	HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
+
+	//release memory
+	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
+    //( cufftDestroy(fftPlanFwd) );HANDLE_ERROR_KERNEL;
+	HANDLE_ERROR( cudaFree( imCUDA));
+	//HANDLE_ERROR( cudaFree( kernelCUDA));
+	//HANDLE_ERROR( cudaFree( kernelPaddedCUDA));
+
+}
+
+
+//=====================================================================
+//WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
+//NOTE: to avoid transferring a large padded kernel, since memcpy is a limiting factor 
+/*
+void convolution3DfftCUDAInPlaceSaveMemory(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA)
+{
+	imageType* imCUDA = NULL;
+	imageType* kernelCUDA = NULL;
+	//imageType* kernelFFTCUDA = NULL; //I need a duplicate in order to perform FFTshift (I can not do it in place)
+
+	cufftHandle fftPlanFwd, fftPlanInv;
+
+	
+	HANDLE_ERROR( cudaSetDevice( devCUDA ) );
+
+	long long int imSize = 1;
+	long long int kernelSize = 1;
+	for(int ii=0;ii<dimsImage;ii++)
+	{
+		imSize *= (long long int) (imDim[ii]);
+		kernelSize *= (long long int) (kernelDim[ii]);
+	}
+
+	long long int imSizeFFT = imSize+(long long int)(2*imDim[0]*imDim[1]); //size of the R2C transform in cuFFTComplex
+	long long int kernelSizeFFT = kernelSize + (long long int)(2*kernelDim[0]*kernelDim[1]);
+
+	
+
+	//allocat ememory in GPU
+	HANDLE_ERROR( cudaMalloc( (void**)&(imCUDA), imSizeFFT*sizeof(imageType) ) );//a little bit larger to allow in-place FFT
+	HANDLE_ERROR( cudaMalloc( (void**)&(kernelCUDA), (kernelSizeFFT)*sizeof(imageType) ) );
+	//HANDLE_ERROR( cudaMalloc( (void**)&(kernelFFTCUDA), (kernelSizeFFT)*sizeof(imageType) ) );
+
+	//TODO: pad image to a power of 2 size in all dimensions (use whatever  boundary conditions you want to apply)
+	//TODO: pad kernel to image size
+	//TODO: pad kernel and image to xy(z/2 + 1) for in-place transform
+	//NOTE: in the example for 2D convolution using FFT in the Nvidia SDK they do the padding in the GPU, but in might be pushing the memory in the GPU for large images.
+
+	//printf("Copying memory (kernel and image) to GPU\n");
+	HANDLE_ERROR( cudaMemcpy( kernelCUDA, kernel, kernelSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+	HANDLE_ERROR( cudaMemcpy( imCUDA, im, imSize*sizeof(imageType) , cudaMemcpyHostToDevice ) );
+
+
+	//apply ffshift to kernel and pad it with zeros so we can calculate convolution with FFT
+	//HANDLE_ERROR( cudaMemset( kernelFFTCUDA, 0, kernelSizeFFT*sizeof(imageType) ));
+	//int numThreads=std::min((long long int)MAX_THREADS_CUDA,kernelSize);
+	//int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(kernelSize+(long long int)(numThreads-1))/((long long int)numThreads));
+	//fftShiftKernel<<<numBlocks,numThreads>>>(kernelCUDA,kernelFFTCUDA,kernelDim[0],kernelDim[1],kernelDim[2],kernelDim[0],kernelDim[1],kernelDim[2]);HANDLE_ERROR_KERNEL;
+
+	
+	//make sure GPU finishes 
+	HANDLE_ERROR(cudaDeviceSynchronize());	
+
+	//printf("Creating R2C & C2R FFT plans for size %i x %i x %i\n",imDim[0],imDim[1],imDim[2]);
+	cufftPlan3d(&fftPlanFwd, imDim[0], imDim[1], imDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
+	//transforming convolution kernel; TODO: if I do multiple convolutions with the same kernel I could reuse the results at teh expense of using out-of place memory (and then teh layout of the data is different!!!! so imCUDAfft should also be out of place)
+	//NOTE: from CUFFT manual: If idata and odata are the same, this method does an in-place transform.
+	//NOTE: from CUFFT manual: inplace output data xy(z/2 + 1) with fcomplex. Therefore, in order to perform an in-place FFT, the user has to pad the input array in the last dimension to Nn2 + 1 complex elements interleaved. Note that the real-to-complex transform is implicitly forward.
+	cufftExecR2C(fftPlanFwd, imCUDA, (cufftComplex *)imCUDA);HANDLE_ERROR_KERNEL;
+
+
+	//transforming kernel
+	cufftDestroy(fftPlanFwd); HANDLE_ERROR_KERNEL;
+	cufftPlan3d(&fftPlanFwd, kernelDim[0], kernelDim[1], kernelDim[2], CUFFT_R2C);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanFwd,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL; //for highest performance since we do not need FFTW compatibility
+	cufftExecR2C(fftPlanFwd, kernelCUDA, (cufftComplex *)kernelCUDA);HANDLE_ERROR_KERNEL;
+
+
+	//int fftCUDAdims[dimsImage] ={kernelDim[0],kernelDim[1],1+kernelDim[2]/2};
+	//writeOutCUDAfft("E:/temp/fftCUDAbad.bin",kernelCUDA,fftCUDAdims);
+
+	//multiply image and kernel in fourier space (and normalize)
+	//NOTE: from CUFFT manual: CUFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input scaled by the number of elements.
+	int numThreads=std::min((long long int)MAX_THREADS_CUDA,imSizeFFT/2);//we are using complex numbers
+	int numBlocks=std::min((long long int)MAX_BLOCKS_CUDA,(long long int)(imSizeFFT/2+(long long int)(numThreads-1))/((long long int)numThreads));
+	//we multiply two FFT of different sizes (but one was just padded with zeros)
+	modulateAndNormalizeSubsampled_kernel<<<numBlocks,numThreads>>>((cufftComplex *)imCUDA, (cufftComplex *)kernelCUDA, kernelDim[0],kernelDim[1],kernelDim[2],imDim[0],imDim[1],imDim[2], imSizeFFT/2,1.0f/(float)(imSize));HANDLE_ERROR_KERNEL;
+
+
+	//we destroy memory first so we can perform larger block convoutions
+	cufftDestroy(fftPlanFwd) ;HANDLE_ERROR_KERNEL;
+	HANDLE_ERROR( cudaFree( kernelCUDA));
+	//HANDLE_ERROR( cudaFree( kernelFFTCUDA));
+
+	//inverse FFT 
+	cufftPlan3d(&fftPlanInv, imDim[0], imDim[1], imDim[2], CUFFT_C2R);HANDLE_ERROR_KERNEL;
+	cufftSetCompatibilityMode(fftPlanInv,CUFFT_COMPATIBILITY_NATIVE);HANDLE_ERROR_KERNEL;
+	cufftExecC2R(fftPlanInv, (cufftComplex *)imCUDA, imCUDA);HANDLE_ERROR_KERNEL;
+	
+
+	//copy result to host and overwrite image
+	HANDLE_ERROR(cudaMemcpy(im,imCUDA,sizeof(imageType)*imSize,cudaMemcpyDeviceToHost));
+
+	//release memory
+	( cufftDestroy(fftPlanInv) );HANDLE_ERROR_KERNEL;
+	HANDLE_ERROR( cudaFree( imCUDA));
+	
+}
+*/
diff --git a/src/convolution3Dfft.h b/src/convolution3Dfft.h
index 8be324e..667431f 100644
--- a/src/convolution3Dfft.h
+++ b/src/convolution3Dfft.h
@@ -25,7 +25,10 @@ static const int dimsImage = 3;//so thing can be set at co0mpile time
 WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDim[2] * (y + imDim[1] * x)
 NOTE: we assume kernel size is the same as image (it has been padded appropriately)
 */
-imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int devCUDA);
+FUNCTION_PREFIX imageType* convolution3DfftCUDA_test(imageType* im,
+								int* imDim,
+								imageType* kernel,
+								int devCUDA);
 
 
 /*
@@ -34,7 +37,11 @@ WARNING: for cuFFT the fastest running index is z direction!!! so pos = z + imDi
 
 TODO: pad data with imSize+kernelSize-1 (kernelSize/2 on each side) to impose the boundary conditions that you want: mirror, zero, etc...). Look at conv2DFFT in the CUDA SDK examples
 */
-FUNCTION_PREFIX imageType* convolution3DfftCUDA(imageType* im,int* imDim,imageType* kernel,int* kernelDim,int devCUDA);
+FUNCTION_PREFIX imageType* convolution3DfftCUDA(imageType* im,
+												int* imDim,
+												imageType* kernel,
+												int* kernelDim,
+												int devCUDA);
 
 
 /*
diff --git a/src/standardCUDAfunctions.cu b/src/standardCUDAfunctions.cu
index e5c6f1f..307b449 100644
--- a/src/standardCUDAfunctions.cu
+++ b/src/standardCUDAfunctions.cu
@@ -10,7 +10,7 @@
 #include "convolution3Dfft.h"
 
 //==============================================
-int selectDeviceWithHighestComputeCapability() {
+ int selectDeviceWithHighestComputeCapability() {
 
   int numDevices = 0;
   HANDLE_ERROR(cudaGetDeviceCount(&numDevices));
@@ -32,14 +32,15 @@ int selectDeviceWithHighestComputeCapability() {
   return value;
 }
 
-int getCUDAcomputeCapabilityMajorVersion(int devCUDA)
+ int getCUDAcomputeCapabilityMajorVersion(int devCUDA)
 {
 	int major = 0, minor = 0;
 	cuDeviceComputeCapability(&major, &minor,devCUDA);
 
 	return major;
 }
-int getCUDAcomputeCapabilityMinorVersion(int devCUDA)
+
+ int getCUDAcomputeCapabilityMinorVersion(int devCUDA)
 {
 	int major = 0, minor = 0;
 	cuDeviceComputeCapability(&major, &minor,devCUDA);
@@ -47,20 +48,22 @@ int getCUDAcomputeCapabilityMinorVersion(int devCUDA)
 	return minor;
 }
 
-int getNumDevicesCUDA()
+ int getNumDevicesCUDA()
 {
 	int count = 0;
 	HANDLE_ERROR(cudaGetDeviceCount ( &count ));
 	return count;
 }
-void getNameDeviceCUDA(int devCUDA, char* name)
+
+ void getNameDeviceCUDA(int devCUDA, char* name)
 {
 	cudaDeviceProp prop;
 	HANDLE_ERROR( cudaGetDeviceProperties(&prop, devCUDA));
 
 	memcpy(name,prop.name,sizeof(char)*256);
 }
-long long int getMemDeviceCUDA(int devCUDA)
+
+ long long int getMemDeviceCUDA(int devCUDA)
 {
 	cudaDeviceProp prop;
 	HANDLE_ERROR( cudaGetDeviceProperties(&prop, devCUDA));
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1c11319..85bbf81 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -12,9 +12,11 @@ FIND_PACKAGE(CUDA)
 
 IF(CUDA_FOUND)
 INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
+LINK_DIRECTORIES(${PROJECT_BINARY_DIR}/src) 
 
 CUDA_ADD_EXECUTABLE(test_gpu_convolve test_gpu_convolve.cpp image_stack_utils.cpp)
 target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
+set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-D${PROJECT_NAME}_BUILT_AS_STATIC")
 
 ELSE(CUDA_FOUND)
 MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!")

From 7dcf38c0b573faa38b9f3229322649c79f5c6c73 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Mon, 11 May 2015 16:29:14 +0200
Subject: [PATCH 20/24] mingled with the release type, needs to be given on
 windows for visual studio

---
 CMakeLists.txt |  7 +------
 README.md      | 12 +++++++-----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index baa5005..bc6f9d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ if(NOT CMAKE_BUILD_TYPE)
 RelWithDebInfo MinSizeRel."
       FORCE)
 endif(NOT CMAKE_BUILD_TYPE)
+MESSAGE(">> Setting up ${CMAKE_BUILD_TYPE} build")
 
 # shared path is architecture independent for now, TODO extend this to lib/bin/include
 IF(UNIX)
@@ -64,12 +65,6 @@ foreach(p LIB INCLUDE SHARE)
   endif()
 endforeach()
 
-# set project build type
-IF(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
-   SET(CMAKE_BUILD_TYPE Release)
-ENDIF(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
-MESSAGE(">> Setting up ${CMAKE_BUILD_TYPE} build")
-
 #########################################################################################################
 ## CUDA related
 # project options
diff --git a/README.md b/README.md
index c7c2424..ff631a5 100644
--- a/README.md
+++ b/README.md
@@ -14,14 +14,16 @@ ${COMMAND} ${FLAGS} -lcufft -lcuda ${OUTPUT_FLAG} ${OUTPUT_PREFIX} ${OUTPUT} ${I
 
 Now build the .so/.dll library and put it into the Fiji directory.
 
+The cmake build system is also supported under Windows 7 64bit now!
 
 ```bash
-$ cd 
-# build including tests
+$ cd X:\path\to\repo
+$ mkdir build
+$ cd build
 $ cmake.exe -G "Visual Studio 12 2013 Win64" -DCMAKE_INSTALL_PREFIX=C:/msys64/home/steinbac/ftmp -DBOOST_LIBRARYDIR=C:/boost_1_58_0/msvc-12-x86_64/lib64-msvc-12.0 -DBOOST_ROOT=C:/boost_1_58_0/msvc-12-x86_64 ..
-$ cmake --build . --target ALL_BUILD
-$ ctest.exe
-$ cmake --build . --target install
+$ cmake.exe --build . --target ALL_BUILD --config Release
+$ ctest.exe -C Release #(optional) the above builds in Release mode
+$ cmake.exe --build . --target install --config Release
 ```
 
 OSX/Linux Build Instructions

From 1a9529bb8ce22a2a2adde47fc4fb4d1d9bdbb8a7 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbach@scionics.de>
Date: Mon, 11 May 2015 21:31:49 +0200
Subject: [PATCH 21/24] replaced relative path in install statement by INSTALL_
 variable, added missing slash

---
 CMakeLists.txt     | 2 +-
 src/CMakeLists.txt | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc6f9d6..f7c18b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,7 +154,7 @@ configure_file(fourierconvolutioncudalib-config-version.cmake.in
 
 # Install the fourierconvolutioncudalib-config.cmake and fourierconvolutioncudalib-config-version.cmake
 install(FILES
-  "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/fourierconvolutioncudalib-config.cmake"
+  "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/fourierconvolutioncudalib-config.cmake"
   "${PROJECT_BINARY_DIR}/fourierconvolutioncudalib-config-version.cmake"
   DESTINATION "${INSTALL_SHARE_DIR}" COMPONENT dev)
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d920de2..f4a275d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -31,9 +31,9 @@ SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES PUBLIC_HEADER "${PROJECT
 
 INSTALL(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static
   EXPORT fourierconvolutioncudalib-targets 
-  LIBRARY DESTINATION lib
-  ARCHIVE DESTINATION lib
-  PUBLIC_HEADER DESTINATION include
+  LIBRARY DESTINATION ${INSTALL_LIB_DIR}
+  ARCHIVE DESTINATION ${INSTALL_LIB_DIR}
+  PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDE_DIR}
   )
 
 

From 03adcef92a6301d51c1e7d1354276d9e284f5a1f Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbach@scionics.de>
Date: Mon, 11 May 2015 21:37:32 +0200
Subject: [PATCH 22/24] added code to use dynamically linked libraries if
 static boost utf libs are not found

---
 tests/CMakeLists.txt | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 85bbf81..108a901 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -15,8 +15,27 @@ INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
 LINK_DIRECTORIES(${PROJECT_BINARY_DIR}/src) 
 
 CUDA_ADD_EXECUTABLE(test_gpu_convolve test_gpu_convolve.cpp image_stack_utils.cpp)
-target_link_libraries(test_gpu_convolve ${BOOST_LIRBARIES} ${PROJECT_NAME}_static)
+
+IF(Boost_FOUND)
+target_link_libraries(test_gpu_convolve ${Boost_LIBRARIES} ${PROJECT_NAME}_static)
 set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-D${PROJECT_NAME}_BUILT_AS_STATIC")
+MESSAGE(">> static Boost UTF: ${Boost_LIBRARIES}")
+ELSE(Boost_FOUND)
+
+SET(Boost_USE_STATIC_LIBS OFF)
+SET(Boost_USE_MULTITHREADED ON)
+FIND_PACKAGE (Boost 1.42 QUIET COMPONENTS system filesystem unit_test_framework REQUIRED)
+IF(Boost_FOUND)
+MESSAGE(">> dynamic Boost UTF ${Boost_LIBRARIES}")
+INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS})
+LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) 
+
+target_link_libraries(test_gpu_convolve ${Boost_LIBRARIES} ${PROJECT_NAME})
+set_target_properties(test_gpu_convolve PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK")
+ELSE()
+MESSAGE(">> dynamic Boost UTF NOT FOUND")
+ENDIF()
+ENDIF(Boost_FOUND)
 
 ELSE(CUDA_FOUND)
 MESSAGE(WARNING "Skipping GPU based tests, CUDA not found\!")

From 074d409eddde956d7efcf0d4482b405d2d25907c Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Fri, 22 May 2015 09:16:49 +0200
Subject: [PATCH 23/24] made find BOOST quiet in root CMakeLists.txt, so people
 are not paniced by a warning

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7c18b2..5705b40 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -122,7 +122,7 @@ ENDIF(INCLUDE_CUDA)
 # add subdirectories
 ADD_SUBDIRECTORY(src)
 
-FIND_PACKAGE (Boost 1.42 COMPONENTS system filesystem unit_test_framework thread)
+FIND_PACKAGE (Boost 1.42 COMPONENTS system filesystem unit_test_framework thread QUIET)
 IF(Boost_FOUND)
 ADD_SUBDIRECTORY(tests)
 enable_testing()

From 10981f4855394ab81e76c06431f515ba99f60fa3 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Fri, 22 May 2015 09:23:07 +0200
Subject: [PATCH 24/24] made root CMakeLists.txt file coherent in output,
 dropped WARNING flag of boost check message

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5705b40..777fc05 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,7 +109,7 @@ IF(INCLUDE_CUDA)
 			IF(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
 				message(FATAL_ERROR ">> CUDA version 7 or higher does not support 32bit builds on Windows (found CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})")
 			ELSE()
-				message("[Windows] 64bit build detected, ${CMAKE_SIZEOF_VOID_P}")
+				message(">> [Windows] 64bit build detected, size of void pointer ${CMAKE_SIZEOF_VOID_P}")
 			ENDIF()
 		
 		ENDIF()
@@ -128,7 +128,7 @@ ADD_SUBDIRECTORY(tests)
 enable_testing()
 include("CTestLists.txt")
 ELSE()
-MESSAGE(WARNING ">> Boost libraries not found, unable to compile tests (skipping them)")
+MESSAGE(">> Boost libraries not found, skipping building setting up tests")
 ENDIF()
 
 # Export the package for use from the build-tree