Commit 3b843ee

small change in parallelism to be compatible with older Julia versions, small type stability fix in cg
1 parent 5dc5a2d commit 3b843ee

10 files changed: +203 −91

examples/ConstraintSetupExamples.jl

Lines changed: 24 additions & 24 deletions
@@ -27,16 +27,16 @@ else
 #error("download escalator video from http://cvxr.com/tfocs/demos/rpca/escalator_data.mat")
 end

-mtrue = read(file, "X")
-n1 = convert(Integer,read(file, "m"))
-n2 = convert(Integer,read(file, "n"))
-m_mat = convert(Array{TF,2},mtrue)
-m_tensor = convert(Array{TF,3},reshape(mtrue,n1,n2,Integer(200)))
+mtrue = read(file, "X");
+n1 = convert(Integer,read(file, "m"));
+n2 = convert(Integer,read(file, "n"));
+m_mat = convert(Array{TF,2},mtrue);
+m_tensor = convert(Array{TF,3},reshape(mtrue,n1,n2,Integer(200)));

 #computational grid for the video
-comp_grid = compgrid((1f0,1f0,1f0),(size(m_tensor,1),size(m_tensor,2), size(m_tensor,3)))
+comp_grid = compgrid((1f0,1f0,1f0),(size(m_tensor,1),size(m_tensor,2), size(m_tensor,3)));

-comp_grid_time_slice = compgrid((1f0,1f0),(size(m_tensor,1),size(m_tensor,2)))
+comp_grid_time_slice = compgrid((1f0,1f0),(size(m_tensor,1),size(m_tensor,2)));


 ######################################################################
@@ -52,7 +52,7 @@ options = PARSDMM_options()
 #l1 (total variation) constraints (in one direction)

 #find a reasonable value for the l1-ball
-(TD_OP, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL)
+(TD_OP, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL);
 TV_z = norm(TD_OP*vec(m_tensor),1)

 m_min = 0.0
@@ -61,15 +61,15 @@ set_type = "l1"
 TD_OP = "D_z"
 app_mode = ("tensor","")
 custom_TD_OP = ([],false)
-push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP))
+push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP));

-(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL)
-(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options)
+(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL);
+(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options);

 @time (x,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
 @time (x,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options);

-m_proj = reshape(x,comp_grid.n)
+m_proj = reshape(x,comp_grid.n);

 figure();
 for i=1:comp_grid.n[3]
@@ -83,23 +83,23 @@ end
 ######## (anisotropic) total-variation on the time-derivative ########
 ######## using JOLI operators ########

-options = PARSDMM_options()
+options = PARSDMM_options();

 #TV operator per time-slice
-(TV, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid_time_slice,"TV",options.FL)
+(TV, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid_time_slice,"TV",options.FL);
 #time derivative over the time-slices
-(D, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL)
+(D, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL);

-CustomOP_explicit_sparse = kron(TV, SparseMatrixCSC{TF}(LinearAlgebra.I, comp_grid.n[3]-1,comp_grid.n[3]-1))* D
+CustomOP_explicit_sparse = kron(TV, SparseMatrixCSC{TF}(LinearAlgebra.I, comp_grid.n[3]-1,comp_grid.n[3]-1))* D;

-D = joMatrix(D)
-TV = joMatrix(TV)
-CustomOP_JOLI = joKron(TV, joEye(comp_grid.n[3]-1,DDT=Float32,RDT=Float32))* D
+D = joMatrix(D);
+TV = joMatrix(TV);
+CustomOP_JOLI = joKron(TV, joEye(comp_grid.n[3]-1,DDT=Float32,RDT=Float32))* D;

 ## Solve using JOLI ##

 #initialize constraints
-constraint = Vector{SetIntersectionProjection.set_definitions}()
+constraint = Vector{SetIntersectionProjection.set_definitions}();

 m_min = 0.0
 m_max = 0.1*norm(CustomOP_JOLI*vec(m_tensor),1)
@@ -122,25 +122,25 @@ CustomOP_JOLI = joKron(TV, joEye(comp_grid.n[3]-1,DDT=Float32,RDT=Float32))* D
 @time (x_joli,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options);

 ## solve using explicit sparse array ##
-constraint = Vector{SetIntersectionProjection.set_definitions}()
+constraint = Vector{SetIntersectionProjection.set_definitions}();

 m_min = 0.0
 m_max = 0.025*norm(CustomOP_explicit_sparse*vec(m_tensor),1)
 set_type = "l1"
 TD_OP = "identity"
 app_mode = ("matrix","")
 custom_TD_OP = (CustomOP_explicit_sparse,false)
-push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP))
+push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP));

 options=PARSDMM_options()
-(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL)
+(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL);

 #set properties of custom operator
 set_Prop.AtA_diag[1] = false
 set_Prop.dense[1] = false
 set_Prop.banded[1] = true

-(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options)
+(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options);

 @time (x_sp,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
 @time (x_sp,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
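Note: CustomOP_explicit_sparse and CustomOP_JOLI above encode the same linear map; the JOLI version composes it lazily instead of materializing the Kronecker product. A minimal sketch of that equivalence on toy operators (the toy TV and D below are illustrative stand-ins; only the JOLI calls already used in this diff are assumed):

using SparseArrays, LinearAlgebra, JOLI

TF = Float32
TV = sparse(TF[1 -1 0; 0 1 -1])        # toy 2x3 difference operator (stand-in)
D  = SparseMatrixCSC{TF}(I, 6, 6)      # toy 6x6 "time derivative" (stand-in)

# Explicit: materialize kron(TV, I) as one sparse matrix, then multiply.
explicit = kron(TV, SparseMatrixCSC{TF}(I, 2, 2)) * D

# Lazy: JOLI composes the operators; nothing is materialized.
lazy = joKron(joMatrix(TV), joEye(2, DDT=TF, RDT=TF)) * joMatrix(D)

v = rand(TF, 6)
explicit * v ≈ lazy * v                # same action on any vector

The lazy form trades memory for per-application cost, which is why the example sets set_Prop.banded[1] = true by hand for the explicit variant but not the JOLI one.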

examples/projection_intersection_2D.jl

Lines changed: 21 additions & 18 deletions
@@ -114,7 +114,7 @@ options.parallel = true

 @time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
 @time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
-#@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
+@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);

 #plot
 figure();
@@ -125,27 +125,30 @@ savefig("projected_model_ParallelPARSDMM.png",bbox_inches="tight")
 #print timings in terminal
 log_PARSDMM.timing

-# #use multilevel-serial (2-levels)
-# options.parallel = false
+#use multilevel-serial (2-levels)
+options.parallel = false

-# #2 levels, the gird point spacing at level 2 is 3X that of the original (level 1) grid
-# n_levels = 2
-# coarsening_factor = 3
+#2 levels, the grid point spacing at level 2 is 3X that of the original (level 1) grid
+n_levels = 2
+coarsening_factor = 3

-# #set up all required quantities for each level
-# #(m_levels,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options)
-# (TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options)
+#set up all required quantities for each level
+#(m_levels,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options)
+(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options)

-# println("")
-# println("PARSDMM multilevel-serial (bounds and bounds on D_z):")
-# @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);
-# @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);
-# @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);
+println("")
+println("PARSDMM multilevel-serial (bounds and bounds on D_z):")
+@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);
+@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);
+@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);

-# figure();
-# subplot(2,1,1);imshow(permutedims(reshape(m,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("model to project")
-# subplot(2,1,2);imshow(permutedims(reshape(x,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("Projection (bounds and bounds on D_z)")
-# savefig("projected_model_MultilevelSerialPARSDMM.png",bbox_inches="tight")
+figure();
+subplot(2,1,1);imshow(permutedims(reshape(m,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("model to project")
+subplot(2,1,2);imshow(permutedims(reshape(x,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("Projection (bounds and bounds on D_z)")
+savefig("projected_model_MultilevelSerialPARSDMM.png",bbox_inches="tight")
+
+#print timings in terminal, note that for the multi-level version, the timing is only for the final level on the original grid
+log_PARSDMM.timing

 # #now use multi-level with parallel PARSDMM
 # options.parallel=true

examples/projection_intersection_3D.jl

Lines changed: 7 additions & 7 deletions
@@ -102,13 +102,13 @@ subplot(3, 3, 8);plot(log_PARSDMM.gamma) ;title("gamma")
 subplot(3, 3, 9);semilogy(log_PARSDMM.evol_x) ;title("x evolution")

 #plot
-m_plot = reshape(m,comp_grid.n)
+m_plot = reshape(m,comp_grid.n);
 figure();
 subplot(3,1,1);imshow(m_plot[:,:,Int64(round(comp_grid.n[3]/2))],cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, ymax, 0]); title("model to project x-y slice")
 subplot(3,1,2);imshow(permutedims(m_plot[:,Int64(round(comp_grid.n[2]/2)),:],[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("model to project x-z slice")
 subplot(3,1,3);imshow(permutedims(m_plot[Int64(round(comp_grid.n[1]/2)),:,:],[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, ymax, zmax, 0]); title("model to project y-z slice")

-x_plot = reshape(x,comp_grid.n)
+x_plot = reshape(x,comp_grid.n);
 figure();
 subplot(3,1,1);imshow(x_plot[:,:,Int64(round(comp_grid.n[3]/2))],cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, ymax, 0]); title("projected model x-y slice")
 subplot(3,1,2);imshow(permutedims(x_plot[:,Int64(round(comp_grid.n[2]/2)),:],[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("projected model x-z slice")
@@ -121,13 +121,13 @@ n_levels = 2
 coarsening_factor = 3

 #set up all required quantities for each level
-(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options)
+(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options);

 println("")
 println("PARSDMM multilevel-serial (bounds and bounds on D_z):")
-@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options)
-@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options)
-@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options)
+@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);
+@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);
+@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);

 #parallel single level
 println("")
@@ -147,7 +147,7 @@ n_levels = 2
 coarsening_factor = 3

 #set up all required quantities for each level
-(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options)
+(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options);

 println("PARSDMM multilevel-parallel (bounds and bounds on D_z):")
 @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options);

src/CDS_MVp.jl

Lines changed: 5 additions & 5 deletions
@@ -1,4 +1,4 @@
-export CDS_MVp, CDS_MVp2, CDS_MVp3, CDS_MVp4
+export CDS_MVp

 """
 compute single-thread matrix vector product with vector x, output is vector y: y=A*x
@@ -19,9 +19,9 @@ function CDS_MVp(
 r0 = max(1, 1-d)
 r1 = min(N, N-d)
 c0 = max(1, 1+d)
-for r = r0 : r1
-c = r - r0 + c0 #original
-@inbounds y[r] = y[r] + R[r,i] * x[c]#original
+  for r = r0 : r1
+    c = r - r0 + c0 #original
+    @inbounds y[r] = y[r] + R[r,i] * x[c]#original
 end
 end
 return y
@@ -51,7 +51,7 @@ end
 # return y
 # end

-# function CDS_MVp(
+# function CDS_MVp2(
 # N ::Integer,
 # ndiags ::Integer,
 # R ::Array{TF,2},
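For context, the kernel above multiplies a banded matrix stored in compressed diagonal storage (CDS/DIA): each column of R holds one diagonal, and an offset vector gives each diagonal's distance from the main diagonal. A self-contained sketch of the same loop structure on a toy tridiagonal matrix (the names cds_mvp, R, and offsets are illustrative, not the package's API):

# Minimal CDS (diagonal-storage) matrix-vector product, mirroring CDS_MVp's loop.
# R[r,i] stores A[r, r+offsets[i]]; out-of-band entries of R are never read.
function cds_mvp(N::Int, R::Matrix{Float64}, offsets::Vector{Int}, x::Vector{Float64})
    y = zeros(Float64, N)
    for i = 1:length(offsets)
        d  = offsets[i]
        r0 = max(1, 1 - d)   # first row where this diagonal has an entry
        r1 = min(N, N - d)   # last row where this diagonal has an entry
        c0 = max(1, 1 + d)   # column paired with row r0
        for r = r0:r1
            c = r - r0 + c0
            @inbounds y[r] += R[r, i] * x[c]
        end
    end
    return y
end

# Example: the 1-D Laplacian-like tridiagonal A = tridiag(-1, 2, -1), N = 5.
N = 5
R = repeat([-1.0 2.0 -1.0], N)   # one row of stored diagonals per matrix row
offsets = [-1, 0, 1]
y = cds_mvp(N, R, offsets, ones(N))   # == [1, 0, 0, 0, 1], same as A*ones(5)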

src/PARSDMM.jl

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ for i=1:maxit #main loop
 # x-minimization
 @timeit to "argmin x" begin
 copy!(x_old,x);
-(x,iter,relres,x_solve_tol_ref) = argmin_x(Q,rhs,x,x_solve_tol_ref,i,log_PARSDMM,Q_offsets,Ax_out)
+(x,iter,relres,x_solve_tol_ref) = argmin_x(Q,rhs,x,x_solve_tol_ref,i,log_PARSDMM,Q_offsets,Ax_out,comp_grid)
 log_PARSDMM.cg_it[i] = iter
 log_PARSDMM.cg_relres[i] = relres
 end #end timer for argmin x

src/PARSDMM_initialize.jl

Lines changed: 7 additions & 4 deletions
@@ -86,10 +86,13 @@ function PARSDMM_initialize(
 m = [m ; zeros(TF,length(m)) ]
 end
 if parallel
-#f_compute_rel_feas = (feasibility_initial,TD_OP,P_sub) -> compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub)
-#[@spawnat pid f_compute_rel_feas for pid in 2:nworkers()]
-#feasibility_initial = pmap(f_compute_rel_feas, feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing)
-feasibility_initial = pmap((feasibility_initial,TD_OP,P_sub) -> compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub) , feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing)
+feasibility_initial = distribute(feasibility_initial)
+[@sync @spawnat pid m for pid in P_sub.pids]
+[@sync @spawnat pid compute_relative_feasibility(m,feasibility_initial[:L],TD_OP[:L],P_sub[:L]) for pid in P_sub.pids]
+feasibility_initial = convert(Vector{TF},feasibility_initial)
+
+#using pmap
+#feasibility_initial = pmap((feasibility_initial,TD_OP,P_sub) -> compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub) , feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing)
 else
 for ii = 1:length(P_sub)
 feasibility_initial[ii] = norm(P_sub[ii](TD_OP[ii]*m) .- TD_OP[ii]*m) ./ (norm(TD_OP[ii]*m)+(100*eps(TF)));
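This is the "change in parallelism" from the commit message: the pmap call with keyword arguments is replaced by the DistributedArrays.jl idiom of distributing a vector, letting each worker update its local part ([:L] / localpart) in place, and converting back. A standalone sketch of that idiom, with a toy work! standing in for compute_relative_feasibility (addprocs and work! are illustrative only):

using Distributed
addprocs(2)                               # toy setup; the package manages its own workers
@everywhere using DistributedArrays

@everywhere work!(v) = (v .= v .^ 2; v)   # stand-in for compute_relative_feasibility

a  = Float64.(1:8)
da = distribute(a)                        # scatter the vector over the workers

@sync for pid in procs(da)                # each worker mutates only its local chunk
    @spawnat pid work!(localpart(da))
end

a = convert(Vector{Float64}, da)          # gather the result back on the master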

src/SetIntersectionProjection.jl

Lines changed: 5 additions & 8 deletions
@@ -15,6 +15,9 @@ using JOLI
 using JOLI.FFTW, JOLI.Wavelets
 using SortingAlgorithms
 using TimerOutputs
+# using Flux
+# using NNlib
+# using CUDA

 export log_type_PARSDMM, set_properties, PARSDMM_options, set_definitions

@@ -39,6 +42,7 @@ include("CDS_MVp.jl")
 include("CDS_MVp_MT.jl")
 include("CDS_MVp_MT_subfunc.jl")
 include("CDS_scaled_add!.jl")
+include("CDS2stencil.jl")

 #scripts for parallelism
 include("update_y_l_parallel.jl")
@@ -53,7 +57,7 @@ include("setup_multi_level_PARSDMM.jl")
 include("constraint2coarse.jl")
 include("interpolate_y_l.jl")

-#scripts for setting up constraints, projetors, linear operators
+#scripts for setting up constraints, projectors, linear operators
 include("default_PARSDMM_options.jl")
 include("convert_options!.jl")
 include("get_discrete_Grad.jl");
@@ -97,13 +101,6 @@ mutable struct log_type_PARSDMM
 cg_it :: Vector{Integer}
 cg_relres :: Vector{Real}
 timing :: TimerOutput
-# T_cg :: Real
-# T_stop :: Real
-# T_ini :: Real
-# T_rhs :: Real
-# T_adjust_rho_gamma:: Real
-# T_y_l_upd :: Real
-# T_Q_upd :: Real
 end

 @with_kw mutable struct PARSDMM_options
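The commented-out Flux/NNlib/CUDA dependencies and the new include("CDS2stencil.jl") point at the experimental GPU path visible in src/argmin_x.jl below: when every stored diagonal of a CDS matrix holds a single constant (a constant-coefficient operator away from boundaries), Q*x is equivalent to a stencil/convolution. CDS2stencil itself is not shown in this commit; the toy 1-D illustration of the underlying equivalence below uses only illustrative names:

# A constant-coefficient tridiagonal operator applied two ways:
# (1) as a matrix-vector product, (2) as a 3-tap stencil sweep.
using LinearAlgebra

n = 8
A = Tridiagonal(fill(-1.0, n-1), fill(2.0, n), fill(-1.0, n-1))
x = rand(n)

# Stencil taps taken from the diagonals' constant values, offsets -1, 0, +1.
w = [-1.0, 2.0, -1.0]

y_stencil = zeros(n)
for r = 1:n, (k, d) in enumerate(-1:1)
    c = r + d
    if 1 <= c <= n                 # zero boundary: skip taps that fall off the grid
        y_stencil[r] += w[k] * x[c]
    end
end

@assert y_stencil ≈ A * x          # interior and boundary rows agree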

src/argmin_x.jl

Lines changed: 36 additions & 2 deletions
@@ -11,7 +11,8 @@ function argmin_x(
 i ::Integer,
 log_PARSDMM,
 Q_offsets=[],
-Ax_out=zeros(TF,length(x)) ::Vector{TF}
+Ax_out=zeros(TF,length(x)) ::Vector{TF},
+comp_grid=[]
 ) where {TF<:Real}

 #Initialize
@@ -20,8 +21,14 @@
 iter = 0

 if typeof(Q)==Array{TF,2} #set up multi-threaded matrix-vector product in compressed diagonal storage format
-Af1(in) = (fill!(Ax_out,TF(0)); CDS_MVp_MT(size(Q,1),size(Q,2),Q,Q_offsets,in,Ax_out); return Ax_out)
+
+# w = CDS2stencil(Q, Q_offsets, comp_grid.n)
+# w = w |> gpu
+# cdims = DenseConvDims(reshape(x, comp_grid.n ...,1,1), w, padding=Int(size(w,1)-1)/2)

+Af1(in) = Ax_CDS_MT(in, Ax_out, Q, Q_offsets)
+#Af1(in) = Ax_stencil_cuda(in, Ax_out, comp_grid, w, cdims)
+
 #determine what relative residual CG needs to reach
 if i<3 #i is the PARSDMM iteration counter
 x_solve_tol_ref = TF(max(0.1*norm(Af1(x)-rhs)/norm(rhs),10*eps(TF))) #10% of current relative residual
@@ -30,6 +37,7 @@
 end

 (x,flag,relres,iter) = cg(Af1,rhs,tol=x_solve_tol_ref,maxIter=1000,x=x,out=0);
+#(x,flag,relres,iter) = cg(Q,Q_offsets,rhs, x_solve_tol_ref, 1000, x, 0)

 elseif typeof(Q)==SparseMatrixCSC{TF,Int64} #CG with native Julia sparse CSC format MPVs
@@ -60,3 +68,29 @@

 return x,iter,relres,x_solve_tol_ref
 end
+
+function Ax_CDS_MT(in::Vector{TF}, Ax_out::Vector{TF}, Q::Array{TF,2}, Q_offsets::Vector{Int}) where TF <: Real
+
+fill!(Ax_out,TF(0))
+CDS_MVp_MT(size(Q,1),size(Q,2),Q,Q_offsets,in,Ax_out);
+
+return Ax_out
+end
+
+# function Ax_stencil_cuda(in::Vector{TF}, Ax_out::Vector{TF}, comp_grid, w::CUDA.CuArray{TF, 5}, cdims) where TF <: Real
+# in = in |> gpu
+# Ax_out = Ax_out |>gpu
+# fill!(Ax_out,TF(0))
+# Ax_out = reshape(Ax_out,comp_grid.n ...,1,1)
+# in = reshape(in,comp_grid.n ...,1,1)
+# conv!(Ax_out, in, w, cdims)
+
+# # #for now, boundary conditions are an issue, manually correct it
+# # kernel_size = size(w,1)
+
+# Ax_out = Ax_out |> cpu
+
+# return vec(Ax_out)
+
+# end
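The new Ax_CDS_MT helper is the "small type stability fix in cg" from the commit message: the multi-statement closure it replaces captures Ax_out, Q, and Q_offsets, and closure captures can end up boxed (Julia issue #15276), so every matrix-vector product inside cg may hit dynamic dispatch. Routing the call through a function whose arguments carry concrete types restores inference. A toy sketch of the pattern (illustrative names, not the package's code):

using LinearAlgebra

# Function barrier: all state arrives as concretely-typed arguments,
# so the body infers and compiles to tight code.
function apply_op!(out::Vector{T}, A::Matrix{T}, v::Vector{T}) where {T<:Real}
    fill!(out, zero(T))          # the kernel accumulates, so zero the buffer first
    mul!(out, A, v, true, true)  # out += A*v (5-arg mul!, like CDS_MVp_MT)
    return out
end

A   = rand(4, 4)
buf = zeros(4)

# Thin wrapper handed to the iterative solver; it only forwards its captures,
# mirroring Af1(in) = Ax_CDS_MT(in, Ax_out, Q, Q_offsets) in the diff.
op(v) = apply_op!(buf, A, v)

op(ones(4)) ≈ A * ones(4)    # true

# @code_warntype apply_op!(buf, A, ones(4))   # confirms concrete inference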
