diff --git a/src/edu/stanford/rsl/conrad/opencl/OpenCLBackProjector.java b/src/edu/stanford/rsl/conrad/opencl/OpenCLBackProjector.java
index 488a84f3..ff01bbbb 100644
--- a/src/edu/stanford/rsl/conrad/opencl/OpenCLBackProjector.java
+++ b/src/edu/stanford/rsl/conrad/opencl/OpenCLBackProjector.java
@@ -26,12 +26,23 @@
 import edu.stanford.rsl.conrad.data.numeric.Grid3D;
 import edu.stanford.rsl.conrad.io.ImagePlusDataSink;
 import edu.stanford.rsl.conrad.numerics.SimpleMatrix;
+import edu.stanford.rsl.conrad.numerics.SimpleVector;
 import edu.stanford.rsl.conrad.reconstruction.VOIBasedReconstructionFilter;
 import edu.stanford.rsl.conrad.utils.CONRAD;
 import edu.stanford.rsl.conrad.utils.Configuration;
 import edu.stanford.rsl.conrad.utils.ImageGridBuffer;
 import edu.stanford.rsl.conrad.utils.ImageUtil;
 
+/**
+ * Backprojection on GPU using OpenCL.
+ * 
+ * If Moving Least Squares deformation should be included in the reconstruction, the deformation parameters (points
+ * before and after) need to be set using the method setDeformParameters() before the init() method is called.
+ * 
+ * @author Martin Berger, Jennifer Maier
+ *
+ */
+
 public class OpenCLBackProjector extends VOIBasedReconstructionFilter implements Runnable, Citeable{
 
 	/**
@@ -77,6 +88,13 @@ public class OpenCLBackProjector extends VOIBasedReconstructionFilter implements
 	 */
 	protected float h_volume[];
 
+	/**
+	 * Deformed 3D coordinates
+	 */
+	protected SimpleVector[] p = null;
+	protected SimpleVector[][] qs = null;
+	private CLBuffer<FloatBuffer> pBuff;
+	private CLBuffer<FloatBuffer> qBuff;
 
 	/**
 	 * The global variable of the module which stores the
@@ -95,6 +113,8 @@ public class OpenCLBackProjector extends VOIBasedReconstructionFilter implements
 
 	private boolean initialized = false;
 
+	
+
 	public OpenCLBackProjector () {
 		super();
 	}
@@ -187,8 +207,11 @@ protected void init(){
 			}
 
 			// create the computing kernel
-			kernelFunction = program.createCLKernel("backprojectKernel");
-
+			if (this.p == null) {
+				kernelFunction = program.createCLKernel("backprojectKernel");
+			} else {
+				kernelFunction = program.createCLKernel("backprojectKernelDeformed");
+			}
 			// create the reconstruction volume;
 			// createFloatBuffer uses a byteBuffer internally --> h_volume.length cannot be > 2^31/4 = 2^31/2^2 = 2^29
 			// Thus, 2^29 would already cause a overflow (negative sign) of the integer in the byte buffer! Maximum length is (2^29-1)
@@ -210,7 +233,7 @@ protected void init(){
 
 			commandQueue.
 			putWriteBuffer(volumePointer, true).
-			finish();
+			finish();			
 
 			initialized = true;
 		}
@@ -326,7 +349,7 @@ protected synchronized void initProjectionData(Grid2D projection){
 				// Create the array that will contain the projection data. 
 				projectionArray = context.createFloatBuffer(projection.getWidth()*projection.getHeight(), Mem.READ_ONLY);
 			}
-			
+
 			// Copy the projection data to the array 
 			projectionArray.getBuffer().put(projection.getBuffer());
 			projectionArray.getBuffer().rewind();
@@ -334,7 +357,7 @@ protected synchronized void initProjectionData(Grid2D projection){
 			if(projectionTex != null && !projectionTex.isReleased()){			
 				projectionTex.release();
 			}
-			
+
 			// set the texture
 			CLImageFormat format = new CLImageFormat(ChannelOrder.INTENSITY, ChannelType.FLOAT);
 			projectionTex = context.createImage2d(projectionArray.getBuffer(), projection.getWidth(), projection.getHeight(), format, Mem.READ_ONLY);
@@ -399,6 +422,11 @@ protected synchronized void projectSingleProjection(int projectionNumber, int di
 		if (!largeVolumeMode) {
 			projections.remove(projectionNumber);
 		}
+
+		if (this.p != null) {
+			initDeformedCoordinates(projectionNumber);
+		}
+
 		// backproject for each slice
 		// OpenCL Grids are only two dimensional!
 		int reconDimensionZ = dimz;
@@ -407,28 +435,51 @@ protected synchronized void projectSingleProjection(int projectionNumber, int di
 		double voxelSpacingZ = getGeometry().getVoxelSpacingZ();
 
 		// write kernel parameters
-		kernelFunction.rewind();
-		kernelFunction
-		.putArg(volumePointer)
-		.putArg(getGeometry().getReconDimensionX())
-		.putArg(getGeometry().getReconDimensionY())
-		.putArg(reconDimensionZ)
-		.putArg((int) lineOffset)
-		.putArg((float) voxelSpacingX)
-		.putArg((float) voxelSpacingY)
-		.putArg((float) voxelSpacingZ)
-		.putArg((float) offsetX)
-		.putArg((float) offsetY)
-		.putArg((float) offsetZ)
-		.putArg(projectionTex)
-		.putArg(projectionMatrix)
-		.putArg(projectionMultiplier);
+		// deformed coordinates
+		if (this.pBuff == null) {
+			kernelFunction.rewind();
+			kernelFunction
+			.putArg(volumePointer)
+			.putArg(getGeometry().getReconDimensionX())
+			.putArg(getGeometry().getReconDimensionY())
+			.putArg(reconDimensionZ)
+			.putArg((int) lineOffset)
+			.putArg((float) voxelSpacingX)
+			.putArg((float) voxelSpacingY)
+			.putArg((float) voxelSpacingZ)
+			.putArg((float) offsetX)
+			.putArg((float) offsetY)
+			.putArg((float) offsetZ)
+			.putArg(projectionTex)
+			.putArg(projectionMatrix)
+			.putArg(projectionMultiplier);
+		} else {
+			kernelFunction.rewind();
+			kernelFunction
+			.putArg(volumePointer)
+			.putArg(getGeometry().getReconDimensionX())
+			.putArg(getGeometry().getReconDimensionY())
+			.putArg(reconDimensionZ)
+			.putArg((int) lineOffset)
+			.putArg((float) voxelSpacingX)
+			.putArg((float) voxelSpacingY)
+			.putArg((float) voxelSpacingZ)
+			.putArg((float) offsetX)
+			.putArg((float) offsetY)
+			.putArg((float) offsetZ)
+			.putArg(projectionTex)
+			.putArg(projectionMatrix)
+			.putArg(projectionMultiplier)
+			.putArg(pBuff)
+			.putArg(qBuff)
+			.putArg((int) p.length);
+		}
+
 
-		
 		int[] realLocalSize = new int[2];
 		realLocalSize[0] = Math.min(device.getMaxWorkGroupSize(),bpBlockSize[0]);
 		realLocalSize[1] = Math.max(1, Math.min(device.getMaxWorkGroupSize()/realLocalSize[0], bpBlockSize[1]));
-		
+
 		// rounded up to the nearest multiple of localWorkSize
 		int[] globalWorkSize = {getGeometry().getReconDimensionX(), getGeometry().getReconDimensionY()}; 
 		if ((globalWorkSize[0] % realLocalSize[0] ) != 0){
@@ -439,12 +490,47 @@ protected synchronized void projectSingleProjection(int projectionNumber, int di
 		}
 
 		// Call the OpenCL kernel, writing the results into the volume which is pointed at
-		commandQueue
-		.putWriteImage(projectionTex, true)
-		.put2DRangeKernel(kernelFunction, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1])
-		//.finish()
-		//.putReadBuffer(dOut, true)
-		.finish();
+		if (this.pBuff == null) {
+			commandQueue
+			.putWriteImage(projectionTex, true)
+			.put2DRangeKernel(kernelFunction, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1])
+			.finish();
+		} else {
+			commandQueue
+			.putWriteImage(projectionTex, true)
+			.putWriteBuffer(pBuff, true)
+			.putWriteBuffer(qBuff, true)
+			.put2DRangeKernel(kernelFunction, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1])
+			.finish();
+		}
+		
+
+	}
+
+	private void initDeformedCoordinates(int projectionNumber) {
+
+		SimpleVector[] q = this.qs[projectionNumber];
+		
+		float[] p_floats = new float[3*p.length];
+		float[] q_floats = new float[3*q.length];
+		for (int i = 0; i < p.length; i++) {
+			p_floats[3*i]   = (float)p[i].getElement(0);
+			p_floats[3*i+1] = (float)p[i].getElement(1);
+			p_floats[3*i+2] = (float)p[i].getElement(2);
+			q_floats[3*i]   = (float)q[i].getElement(0);
+			q_floats[3*i+1] = (float)q[i].getElement(1);
+			q_floats[3*i+2] = (float)q[i].getElement(2);
+		}
+
+		CLBuffer<FloatBuffer> pBuffer = context.createFloatBuffer(p.length*p[0].getLen(), Mem.READ_ONLY);
+		pBuffer.getBuffer().put(p_floats);
+		pBuffer.getBuffer().rewind();
+		this.pBuff = pBuffer;
+
+		CLBuffer<FloatBuffer> qBuffer = context.createFloatBuffer(q.length*q[0].getLen(), Mem.READ_ONLY);
+		qBuffer.getBuffer().put(q_floats);
+		qBuffer.getBuffer().rewind();
+		this.qBuff = qBuffer;
 	}
 
 	public void loadInputQueue(Grid3D input) throws IOException {
@@ -644,8 +730,22 @@ public String getToolName(){
 		return "OpenCL Backprojector";
 	}
 
+	/**
+	 * set parameters used for deformation
+	 * 
+	 * @param p  original positions of points used for deformation. Contains n 3D-vectors if there are n points used
+	 *           for deformation
+	 * @param qs deformed positions of points used for deformation. If there are p projections used for backprojection
+	 *           and n points used for deformation, the array contains p x n 3D-vectors.
+	 */
+	public void setDeformParameters(SimpleVector[] p, SimpleVector[][] qs) {
+		this.p = p;
+		this.qs = qs;
+	}
+
+
 }
 /*
- * Copyright (C) 2010-2014 Martin Berger
+ * Copyright (C) 2010-2021 Martin Berger, Jennifer Maier
  * CONRAD is developed as an Open Source project under the GNU General Public License (GPL).
  */
\ No newline at end of file
diff --git a/src/edu/stanford/rsl/conrad/opencl/OpenCLForwardProjector.java b/src/edu/stanford/rsl/conrad/opencl/OpenCLForwardProjector.java
index cc7010c5..8731757a 100644
--- a/src/edu/stanford/rsl/conrad/opencl/OpenCLForwardProjector.java
+++ b/src/edu/stanford/rsl/conrad/opencl/OpenCLForwardProjector.java
@@ -227,7 +227,7 @@ public void computeCanonicalProjectionMatrix(CLBuffer<FloatBuffer> detectorDirec
 			originShift = getOriginTransform();
 		
 		// New srcPoint in the Canonical coord sys
-		SimpleVector srcPtW = proj.computeCameraCenter().negated();//computeSrcPt(projectionMatrix, invARmatrixMat);
+		SimpleVector srcPtW = proj.computeCameraCenter().negated();//computeSrcPt(proj.computeP(), invARmatrixMat); //
 		srcPoint.getBuffer().put((float) -(-0.5 * (volumeSize[0] -1.0) + originShift.getElement(0)*invVoxelScale.getElement(0,0) + invVoxelScale.getElement(0,0) * srcPtW.getElement(0))); 
 		srcPoint.getBuffer().put((float) -(-0.5 * (volumeSize[1] -1.0) + originShift.getElement(1)*invVoxelScale.getElement(1,1) + invVoxelScale.getElement(1,1) * srcPtW.getElement(1))); 
 		srcPoint.getBuffer().put((float) -(-0.5 * (volumeSize[2] -1.0) + originShift.getElement(2)*invVoxelScale.getElement(2,2) + invVoxelScale.getElement(2,2) * srcPtW.getElement(2))); 
@@ -281,6 +281,12 @@ public static Jama.Matrix computeSrcPt(Jama.Matrix projectionMatrix, Jama.Matrix
 		return invertedProjMatrix.times(at);
 	}
 	
+	public static SimpleVector computeSrcPt(SimpleMatrix projectionMatrix, SimpleMatrix invertedProjMatrix) {
+		SimpleVector at = projectionMatrix.getSubCol(0, 3, 3);//.getMatrix(0, 2, 3, 3);
+		//at = at.times(-1.0);
+		return SimpleOperators.multiply(invertedProjMatrix, at);
+	}
+	
 	protected SimpleVector getOriginTransform(){
 		SimpleVector currOrigin = new SimpleVector(this.origin);
 		// compute centered origin as assumed by forward projector
diff --git a/src/edu/stanford/rsl/conrad/opencl/backprojectCL.cl b/src/edu/stanford/rsl/conrad/opencl/backprojectCL.cl
index 9da47d27..c9798f0d 100644
--- a/src/edu/stanford/rsl/conrad/opencl/backprojectCL.cl
+++ b/src/edu/stanford/rsl/conrad/opencl/backprojectCL.cl
@@ -96,8 +96,591 @@ __kernel void backprojectKernel(
     return;
 }
 
+/* --------------------------------------------------------------------------
+ *
+ *
+ *    Voxel-based BP algorithm implementation in OpenCL kernel programming
+ *    WITH DEFORMATION!
+ *
+ *
+ * -------------------------------------------------------------------------- */
+float4 mls_deformation(float x, float y, float z, global float* p, global float* q, int noPoints)
+{
+	
+	// q* and p*
+	float qStar0 = 0;
+	float qStar1 = 0;
+	float qStar2 = 0;
+	float pStar0 = 0;
+	float pStar1 = 0;
+	float pStar2 = 0;
+	float wSum = 0;
+	
+	for (unsigned int idx = 0; idx < noPoints; idx++) {
+		float w = 1/((p[idx*3] - x) * (p[idx*3] - x) + (p[idx*3+1] - y) * (p[idx*3+1] - y) + (p[idx*3+2] - z) * (p[idx*3+2] - z));
+		wSum += w;
+		qStar0 += q[idx*3] * w;
+		qStar1 += q[idx*3+1] * w;
+		qStar2 += q[idx*3+2] * w;
+		pStar0 += p[idx*3] * w;
+		pStar1 += p[idx*3+1] * w;
+		pStar2 += p[idx*3+2] * w;
+	}
+	qStar0 /= wSum;
+	qStar1 /= wSum;
+	qStar2 /= wSum;
+	pStar0 /= wSum;
+	pStar1 /= wSum;
+	pStar2 /= wSum;
+		
+	
+	// PQ'
+	float PQtrans0 = 0;
+	float PQtrans1 = 0;
+	float PQtrans2 = 0;
+	float PQtrans3 = 0;
+	float PQtrans4 = 0;
+	float PQtrans5 = 0;
+	float PQtrans6 = 0;
+	float PQtrans7 = 0;
+	float PQtrans8 = 0;
+	for (unsigned int idx = 0; idx < noPoints; idx++) {
+		float w = 1/((p[idx*3] - x) * (p[idx*3] - x) + (p[idx*3+1] - y) * (p[idx*3+1] - y) + (p[idx*3+2] - z) * (p[idx*3+2] - z));
+		
+		float p_min_pStar0 = p[idx*3] - pStar0;
+		float p_min_pStar1 = p[idx*3+1] - pStar1;
+		float p_min_pStar2 = p[idx*3+2] - pStar2;
+		
+		float q_min_qStar0 = q[idx*3] - qStar0;
+		float q_min_qStar1 = q[idx*3+1] - qStar1;
+		float q_min_qStar2 = q[idx*3+2] - qStar2;
+		
+		PQtrans0 += w*p_min_pStar0*q_min_qStar0;
+		PQtrans3 += w*p_min_pStar0*q_min_qStar1;
+		PQtrans6 += w*p_min_pStar0*q_min_qStar2;
+		
+		PQtrans1 += w*p_min_pStar1*q_min_qStar0;
+		PQtrans4 += w*p_min_pStar1*q_min_qStar1;
+		PQtrans7 += w*p_min_pStar1*q_min_qStar2;
+		
+		PQtrans2 += w*p_min_pStar2*q_min_qStar0;
+		PQtrans5 += w*p_min_pStar2*q_min_qStar1;
+		PQtrans8 += w*p_min_pStar2*q_min_qStar2;
+		
+	}
+
+	// svd
+	int n = 3;
+
+	double tol = 0.000000000000001f;		
+	int loopmax = 100 * n;
+	int loopcount = 0;
+
+	float U0 = 1;
+	float U1 = 0;
+	float U2 = 0;
+	float U3 = 0;
+	float U4 = 1;
+	float U5 = 0;
+	float U6 = 0;
+	float U7 = 0;
+	float U8 = 1;
+
+	float V0 = 1;
+	float V1 = 0;
+	float V2 = 0;
+	float V3 = 0;
+	float V4 = 1;
+	float V5 = 0;
+	float V6 = 0;
+	float V7 = 0;
+	float V8 = 1;
+
+	float S0 = PQtrans0;
+	float S1 = PQtrans3;
+	float S2 = PQtrans6;
+	float S3 = PQtrans1;
+	float S4 = PQtrans4;
+	float S5 = PQtrans7;
+	float S6 = PQtrans2;
+	float S7 = PQtrans5;
+	float S8 = PQtrans8;
+
+	float err = 10000;
+
+	while (err > tol && loopcount < loopmax) {
+
+		// qr //////////////////////////
+		int m = 3;
+
+		float R0 = S0;
+		float R1 = S3;
+		float R2 = S6;
+		float R3 = S1;
+		float R4 = S4;
+		float R5 = S7;
+		float R6 = S2;
+		float R7 = S5;
+		float R8 = S8;
+
+		float Q0 = 1;
+		float Q1 = 0;
+		float Q2 = 0;
+		float Q3 = 0;
+		float Q4 = 1;
+		float Q5 = 0;
+		float Q6 = 0;
+		float Q7 = 0;
+		float Q8 = 1;
+
+		// k = 0;
+		float x0 = R0;
+		float x1 = R1;
+		float x2 = R2;
+
+		float norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+		float g = -sign(x0) * norm_x;
+		x0 = x0 - g;
+
+		// Orthogonal transformation matrix that eliminates one element
+		// below the diagonal of the matrix it is post-multiplying:
+		norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+
+		if (norm_x != 0) {
+
+			x0 = x0/norm_x;
+			x1 = x1/norm_x;
+			x2 = x2/norm_x;
+
+			float u1 = 2 * (R0*x0 + R1*x1 + R2*x2);
+			float u2 = 2 * (R3*x0 + R4*x1 + R5*x2);
+			float u3 = 2 * (R6*x0 + R7*x1 + R8*x2);
+
+			R0 = R0 - x0*u1;
+			R3 = R3 - x0*u2;
+			R6 = R6 - x0*u3;
+			R1 = R1 - x1*u1;
+			R4 = R4 - x1*u2;
+			R7 = R7 - x1*u3;
+			R2 = R2 - x2*u1;
+			R5 = R5 - x2*u2;
+			R8 = R8 - x2*u3;
+
+			float Q0tmp = Q0 - 2*(Q0*x0*x0 + Q3*x1*x0 + Q6*x2*x0);
+			float Q3tmp = Q3 - 2*(Q0*x0*x1 + Q3*x1*x1 + Q6*x2*x1);
+			float Q6tmp = Q6 - 2*(Q0*x0*x2 + Q3*x1*x2 + Q6*x2*x2);
+			float Q1tmp = Q1 - 2*(Q1*x0*x0 + Q4*x1*x0 + Q7*x2*x0);
+			float Q4tmp = Q4 - 2*(Q1*x0*x1 + Q4*x1*x1 + Q7*x2*x1);
+			float Q7tmp = Q7 - 2*(Q1*x0*x2 + Q4*x1*x2 + Q7*x2*x2);
+			float Q2tmp = Q2 - 2*(Q2*x0*x0 + Q5*x1*x0 + Q8*x2*x0);
+			float Q5tmp = Q5 - 2*(Q2*x0*x1 + Q5*x1*x1 + Q8*x2*x1);
+			float Q8tmp = Q8 - 2*(Q2*x0*x2 + Q5*x1*x2 + Q8*x2*x2);
+
+			Q0 = Q0tmp;
+			Q1 = Q1tmp;
+			Q2 = Q2tmp;
+			Q3 = Q3tmp;
+			Q4 = Q4tmp;
+			Q5 = Q5tmp;
+			Q6 = Q6tmp;
+			Q7 = Q7tmp;
+			Q8 = Q8tmp;
+		}
+
+		// k = 1;
+		x0 = 0;
+		x1 = R4;
+		x2 = R5;
+
+		norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+		g = -sign(x1) * norm_x;
+		x1 = x1 - g;
+
+		// Orthogonal transformation matrix that eliminates one element
+		// below the diagonal of the matrix it is post-multiplying:
+		norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+
+		if (norm_x != 0) {
+
+			x0 = x0/norm_x;
+			x1 = x1/norm_x;
+			x2 = x2/norm_x;
+
+			float u1 = 2 * (R0*x0 + R1*x1 + R2*x2);
+			float u2 = 2 * (R3*x0 + R4*x1 + R5*x2);
+			float u3 = 2 * (R6*x0 + R7*x1 + R8*x2);
+
+			R0 = R0 - x0*u1;
+			R3 = R3 - x0*u2;
+			R6 = R6 - x0*u3;
+			R1 = R1 - x1*u1;
+			R4 = R4 - x1*u2;
+			R7 = R7 - x1*u3;
+			R2 = R2 - x2*u1;
+			R5 = R5 - x2*u2;
+			R8 = R8 - x2*u3;
+
+			float Q0tmp = Q0 - 2*(Q0*x0*x0 + Q3*x1*x0 + Q6*x2*x0);
+			float Q3tmp = Q3 - 2*(Q0*x0*x1 + Q3*x1*x1 + Q6*x2*x1);
+			float Q6tmp = Q6 - 2*(Q0*x0*x2 + Q3*x1*x2 + Q6*x2*x2);
+			float Q1tmp = Q1 - 2*(Q1*x0*x0 + Q4*x1*x0 + Q7*x2*x0);
+			float Q4tmp = Q4 - 2*(Q1*x0*x1 + Q4*x1*x1 + Q7*x2*x1);
+			float Q7tmp = Q7 - 2*(Q1*x0*x2 + Q4*x1*x2 + Q7*x2*x2);
+			float Q2tmp = Q2 - 2*(Q2*x0*x0 + Q5*x1*x0 + Q8*x2*x0);
+			float Q5tmp = Q5 - 2*(Q2*x0*x1 + Q5*x1*x1 + Q8*x2*x1);
+			float Q8tmp = Q8 - 2*(Q2*x0*x2 + Q5*x1*x2 + Q8*x2*x2);
+
+			Q0 = Q0tmp;
+			Q1 = Q1tmp;
+			Q2 = Q2tmp;
+			Q3 = Q3tmp;
+			Q4 = Q4tmp;
+			Q5 = Q5tmp;
+			Q6 = Q6tmp;
+			Q7 = Q7tmp;
+			Q8 = Q8tmp;
+		}
+
+		//////////////////////////////////////
+
+		S0 = R0;
+		S1 = R1;
+		S2 = R2;
+		S3 = R3;
+		S4 = R4;
+		S5 = R5;
+		S6 = R6;
+		S7 = R7;
+		S8 = R8;
+
+		float U0tmp = U0 * Q0 + U3 * Q1 + U6 * Q2;
+		float U1tmp = U1 * Q0 + U4 * Q1 + U7 * Q2;
+		float U2tmp = U2 * Q0 + U5 * Q1 + U8 * Q2;
+		float U3tmp = U0 * Q3 + U3 * Q4 + U6 * Q5;
+		float U4tmp = U1 * Q3 + U4 * Q4 + U7 * Q5;
+		float U5tmp = U2 * Q3 + U5 * Q4 + U8 * Q5;
+		float U6tmp = U0 * Q6 + U3 * Q7 + U6 * Q8;
+		float U7tmp = U1 * Q6 + U4 * Q7 + U7 * Q8;
+		float U8tmp = U2 * Q6 + U5 * Q7 + U8 * Q8;
+		U0 = U0tmp;
+		U1 = U1tmp;
+		U2 = U2tmp;
+		U3 = U3tmp;
+		U4 = U4tmp;
+		U5 = U5tmp;
+		U6 = U6tmp;
+		U7 = U7tmp;
+		U8 = U8tmp;
+
+		// qr //////////////////////////
+		R0 = S0;
+		R1 = S3;
+		R2 = S6;
+		R3 = S1;
+		R4 = S4;
+		R5 = S7;
+		R6 = S2;
+		R7 = S5;
+		R8 = S8;
+
+		Q0 = 1;
+		Q1 = 0;
+		Q2 = 0;
+		Q3 = 0;
+		Q4 = 1;
+		Q5 = 0;
+		Q6 = 0;
+		Q7 = 0;
+		Q8 = 1;
+
+		// k = 0;
+		x0 = R0;
+		x1 = R1;
+		x2 = R2;
+
+		norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+		g = -sign(x0) * norm_x;
+		x0 = x0 - g;
+
+		// Orthogonal transformation matrix that eliminates one element
+		// below the diagonal of the matrix it is post-multiplying:
+		norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+
+		if (norm_x != 0) {
+
+			x0 = x0/norm_x;
+			x1 = x1/norm_x;
+			x2 = x2/norm_x;
+
+			float u1 = 2 * (R0*x0 + R1*x1 + R2*x2);
+			float u2 = 2 * (R3*x0 + R4*x1 + R5*x2);
+			float u3 = 2 * (R6*x0 + R7*x1 + R8*x2);
+
+			R0 = R0 - x0*u1;
+			R3 = R3 - x0*u2;
+			R6 = R6 - x0*u3;
+			R1 = R1 - x1*u1;
+			R4 = R4 - x1*u2;
+			R7 = R7 - x1*u3;
+			R2 = R2 - x2*u1;
+			R5 = R5 - x2*u2;
+			R8 = R8 - x2*u3;
+
+			float Q0tmp = Q0 - 2*(Q0*x0*x0 + Q3*x1*x0 + Q6*x2*x0);
+			float Q3tmp = Q3 - 2*(Q0*x0*x1 + Q3*x1*x1 + Q6*x2*x1);
+			float Q6tmp = Q6 - 2*(Q0*x0*x2 + Q3*x1*x2 + Q6*x2*x2);
+			float Q1tmp = Q1 - 2*(Q1*x0*x0 + Q4*x1*x0 + Q7*x2*x0);
+			float Q4tmp = Q4 - 2*(Q1*x0*x1 + Q4*x1*x1 + Q7*x2*x1);
+			float Q7tmp = Q7 - 2*(Q1*x0*x2 + Q4*x1*x2 + Q7*x2*x2);
+			float Q2tmp = Q2 - 2*(Q2*x0*x0 + Q5*x1*x0 + Q8*x2*x0);
+			float Q5tmp = Q5 - 2*(Q2*x0*x1 + Q5*x1*x1 + Q8*x2*x1);
+			float Q8tmp = Q8 - 2*(Q2*x0*x2 + Q5*x1*x2 + Q8*x2*x2);
+
+			Q0 = Q0tmp;
+			Q1 = Q1tmp;
+			Q2 = Q2tmp;
+			Q3 = Q3tmp;
+			Q4 = Q4tmp;
+			Q5 = Q5tmp;
+			Q6 = Q6tmp;
+			Q7 = Q7tmp;
+			Q8 = Q8tmp;
+		}
+
+		// k = 1;
+		x0 = 0;
+		x1 = R4;
+		x2 = R5;
+
+		norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+		g = -sign(x1) * norm_x;
+		x1 = x1 - g;
+
+		// Orthogonal transformation matrix that eliminates one element
+		// below the diagonal of the matrix it is post-multiplying:
+		norm_x = sqrt(x0*x0 + x1*x1 + x2*x2);
+
+		if (norm_x != 0) {
+
+			x0 = x0/norm_x;
+			x1 = x1/norm_x;
+			x2 = x2/norm_x;
+
+			float u1 = 2 * (R0*x0 + R1*x1 + R2*x2);
+			float u2 = 2 * (R3*x0 + R4*x1 + R5*x2);
+			float u3 = 2 * (R6*x0 + R7*x1 + R8*x2);
+
+			R0 = R0 - x0*u1;
+			R3 = R3 - x0*u2;
+			R6 = R6 - x0*u3;
+			R1 = R1 - x1*u1;
+			R4 = R4 - x1*u2;
+			R7 = R7 - x1*u3;
+			R2 = R2 - x2*u1;
+			R5 = R5 - x2*u2;
+			R8 = R8 - x2*u3;
+
+			float Q0tmp = Q0 - 2*(Q0*x0*x0 + Q3*x1*x0 + Q6*x2*x0);
+			float Q3tmp = Q3 - 2*(Q0*x0*x1 + Q3*x1*x1 + Q6*x2*x1);
+			float Q6tmp = Q6 - 2*(Q0*x0*x2 + Q3*x1*x2 + Q6*x2*x2);
+			float Q1tmp = Q1 - 2*(Q1*x0*x0 + Q4*x1*x0 + Q7*x2*x0);
+			float Q4tmp = Q4 - 2*(Q1*x0*x1 + Q4*x1*x1 + Q7*x2*x1);
+			float Q7tmp = Q7 - 2*(Q1*x0*x2 + Q4*x1*x2 + Q7*x2*x2);
+			float Q2tmp = Q2 - 2*(Q2*x0*x0 + Q5*x1*x0 + Q8*x2*x0);
+			float Q5tmp = Q5 - 2*(Q2*x0*x1 + Q5*x1*x1 + Q8*x2*x1);
+			float Q8tmp = Q8 - 2*(Q2*x0*x2 + Q5*x1*x2 + Q8*x2*x2);
+
+			Q0 = Q0tmp;
+			Q1 = Q1tmp;
+			Q2 = Q2tmp;
+			Q3 = Q3tmp;
+			Q4 = Q4tmp;
+			Q5 = Q5tmp;
+			Q6 = Q6tmp;
+			Q7 = Q7tmp;
+			Q8 = Q8tmp;
+		}
+
+		//////////////////////////////////////
+
+		S0 = R0;
+		S1 = R1;
+		S2 = R2;
+		S3 = R3;
+		S4 = R4;
+		S5 = R5;
+		S6 = R6;
+		S7 = R7;
+		S8 = R8;
+
+		float V0tmp = V0 * Q0 + V3 * Q1 + V6 * Q2;
+		float V1tmp = V1 * Q0 + V4 * Q1 + V7 * Q2;
+		float V2tmp = V2 * Q0 + V5 * Q1 + V8 * Q2;
+		float V3tmp = V0 * Q3 + V3 * Q4 + V6 * Q5;
+		float V4tmp = V1 * Q3 + V4 * Q4 + V7 * Q5;
+		float V5tmp = V2 * Q3 + V5 * Q4 + V8 * Q5;
+		float V6tmp = V0 * Q6 + V3 * Q7 + V6 * Q8;
+		float V7tmp = V1 * Q6 + V4 * Q7 + V7 * Q8;
+		float V8tmp = V2 * Q6 + V5 * Q7 + V8 * Q8;
+		V0 = V0tmp;
+		V1 = V1tmp;
+		V2 = V2tmp;
+		V3 = V3tmp;
+		V4 = V4tmp;
+		V5 = V5tmp;
+		V6 = V6tmp;
+		V7 = V7tmp;
+		V8 = V8tmp;
+
+		// exit when we get "close"
+		float E = sqrt(S3*S3 + S6*S6 + S7*S7);
+		float F = sqrt(S0*S0 + S4*S4 + S8*S8);
+		if (F == 0) {
+			F = 1;
+		}
+		err = E/F;
+		loopcount = loopcount + 1;
+	}
+
+	// fix the signs in S
+	float SS0 = S0;
+	float SS1 = S4;
+	float SS2 = S8;
+
+	if (SS0 < -0.000000001) {
+		U0 = -U0;
+		U1 = -U1;
+		U2 = -U2;
+	}
+
+	if (SS1 < -0.000000001) {
+		U3 = -U3;
+		U4 = -U4;
+		U5 = -U5;
+	}
+
+	if (SS2 < -0.000000001) {
+		U6 = -U6;
+		U7 = -U7;
+		U8 = -U8;
+	}
+
+	// transform input coords
+	float x_min_pStar = x - pStar0;
+	float y_min_pStar = y - pStar1;
+	float z_min_pStar = z - pStar2;
+
+	float VUtrans0 = V0*U0 + V3*U3 + V6*U6;
+	float VUtrans1 = V1*U0 + V4*U3 + V7*U6;
+	float VUtrans2 = V2*U0 + V5*U3 + V8*U6;
+	float VUtrans3 = V0*U1 + V3*U4 + V6*U7;
+	float VUtrans4 = V1*U1 + V4*U4 + V7*U7;
+	float VUtrans5 = V2*U1 + V5*U4 + V8*U7;
+	float VUtrans6 = V0*U2 + V3*U5 + V6*U8;
+	float VUtrans7 = V1*U2 + V4*U5 + V7*U8;
+	float VUtrans8 = V2*U2 + V5*U5 + V8*U8;
+
+	if (VUtrans0*VUtrans4*VUtrans8 + VUtrans3*VUtrans7*VUtrans2 + VUtrans6*VUtrans1*VUtrans5 - VUtrans2*VUtrans4*VUtrans6 - VUtrans5*VUtrans7*VUtrans0 - VUtrans8*VUtrans1*VUtrans3 <= 0)
+	{
+		V6 = -V6;
+		V7 = -V7;
+		V8 = -V8;
+		VUtrans0 = V0*U0 + V3*U3 + V6*U6;
+		VUtrans1 = V1*U0 + V4*U3 + V7*U6;
+		VUtrans2 = V2*U0 + V5*U3 + V8*U6;
+		VUtrans3 = V0*U1 + V3*U4 + V6*U7;
+		VUtrans4 = V1*U1 + V4*U4 + V7*U7;
+		VUtrans5 = V2*U1 + V5*U4 + V8*U7;
+		VUtrans6 = V0*U2 + V3*U5 + V6*U8;
+		VUtrans7 = V1*U2 + V4*U5 + V7*U8;
+		VUtrans8 = V2*U2 + V5*U5 + V8*U8;
+	}
+
+	float x_transformed = VUtrans0*x_min_pStar + VUtrans3*y_min_pStar + VUtrans6*z_min_pStar + qStar0;
+	float y_transformed = VUtrans1*x_min_pStar + VUtrans4*y_min_pStar + VUtrans7*z_min_pStar + qStar1;
+	float z_transformed = VUtrans2*x_min_pStar + VUtrans5*y_min_pStar + VUtrans8*z_min_pStar + qStar2;
+	
+	float4 v_transformed = (float4)(x_transformed, y_transformed, z_transformed, 0.0f);
+	return v_transformed;
+
+}
+
+
+__kernel void backprojectKernelDeformed(
+		__global TvoxelValue* pVolume,
+		int sizeX,
+		int sizeY,
+		int recoSizeZ,
+		int offset,
+		float voxelSpacingX,
+		float voxelSpacingY,
+		float voxelSpacingZ,
+		float offsetX,
+		float offsetY, 
+		float offsetZ,
+		__read_only image2d_t gTex2D,
+		__constant Tcoord_dev* gProjMatrix,
+		float projMultiplier,
+		global float* p,
+		global float* q,
+		int noPoints)
+{
+	
+	int gidx = get_group_id(0);
+	int gidy = get_group_id(1);
+	int lidx = get_local_id(0);
+	int lidy = get_local_id(1);
+	
+	int locSizex = get_local_size(0);
+	int locSizey = get_local_size(1);
+	
+	unsigned int x = gidx*locSizex+lidx;
+    unsigned int y = gidy*locSizey+lidy;
+	
+	if (x >= sizeX || y >= sizeY)
+		return;
+		
+	unsigned int zStride = sizeX*sizeY;
+	unsigned int yStride = sizeX;
+
+	for (unsigned int z = 0; z < recoSizeZ; z++){
+
+		float4 coords_new = mls_deformation((x * voxelSpacingX) - offsetX, (y * voxelSpacingY) - offsetY, (z * voxelSpacingZ) - offsetZ, p, q, noPoints);
+		float xcoord = coords_new.x;
+		float ycoord = coords_new.y;
+		float zcoord = coords_new.z;
+		
+		// precompute everything except z from the matrix multiplication
+		float precomputeR = gProjMatrix[3]  + ycoord * gProjMatrix[1] + xcoord * gProjMatrix[0];
+		float precomputeS = gProjMatrix[7]  + ycoord * gProjMatrix[5] + xcoord * gProjMatrix[4];
+	    float precomputeT = gProjMatrix[11] + ycoord * gProjMatrix[9] + xcoord * gProjMatrix[8];
+
+		// compute homogeneous coordinates
+		float r = zcoord * gProjMatrix[2]  + precomputeR;
+		float s = zcoord * gProjMatrix[6]  + precomputeS;
+		float t = zcoord * gProjMatrix[10] + precomputeT;
+		
+		// compute projection coordinates 
+		float denom = 1.0f / t;
+		float fu = r * denom;
+		float fv = s * denom;
+		
+		float proj_val = read_imagef(gTex2D, sampler, (float2)(fu + 0.5f + offset, fv + 0.5f)).x;
+		
+		// compute volume index for x,y,z
+		unsigned long idx = z*zStride + y*yStride + x;
+    
+		// Distance weighting by 1/t^2
+		pVolume[idx] += proj_val * denom * denom * projMultiplier;
+		
+	}
+	
+    return;
+}
+
+
 
 /*
- * Copyright (C) 2010-2014 Martin Berger
+ * Copyright (C) 2010-2021 Martin Berger, Jennifer Maier
  * CONRAD is developed as an Open Source project under the GNU General Public License (GPL).
 */
\ No newline at end of file