diff --git a/models/flow/d3q27_cumulant/Dynamics.R b/models/flow/d3q27_cumulant/Dynamics.R
index fcb1f0b53..74478d0d2 100644
--- a/models/flow/d3q27_cumulant/Dynamics.R
+++ b/models/flow/d3q27_cumulant/Dynamics.R
@@ -43,6 +43,10 @@ AddNodeType(name="NVelocity", group="BOUNDARY")
 AddNodeType(name="SVelocity", group="BOUNDARY")
 AddNodeType(name="NPressure", group="BOUNDARY")
 AddNodeType(name="SPressure", group="BOUNDARY")
+AddNodeType(name="FVelocity", group="BOUNDARY")
+AddNodeType(name="BVelocity", group="BOUNDARY")
+AddNodeType(name="FPressure", group="BOUNDARY")
+AddNodeType(name="BPressure", group="BOUNDARY")
 AddNodeType(name="NSymmetry", group="ADDITIONALS")
 AddNodeType(name="SSymmetry", group="ADDITIONALS")
 AddNodeType(name="Body", group="BODY")
diff --git a/models/flow/d3q27_cumulant/Dynamics.c.Rt b/models/flow/d3q27_cumulant/Dynamics.c.Rt
index e75b957b2..127b7d838 100644
--- a/models/flow/d3q27_cumulant/Dynamics.c.Rt
+++ b/models/flow/d3q27_cumulant/Dynamics.c.Rt
@@ -24,9 +24,11 @@ CudaDeviceFunction real_t getRho(){
 	return <?R C(sum(f)) ?>;
 }
 
-CudaDeviceFunction real_t getP(){
-	return ((<?R C(sum(f)) ?>)-1.0)/3.0;
+CudaDeviceFunction real_t getP() {
+    return ((<?R C(sum(f)) ?>) - 1.0) / 3.0;
 }
+
+
 CudaDeviceFunction vector_t getU_(){
         real_t d = getRho();
         vector_t u;
@@ -98,6 +100,7 @@ CudaDeviceFunction real_t getKinE(){
 }
 
 #endif
+
 CudaDeviceFunction float2 Color() {
         float2 ret;
         vector_t u = getU();
@@ -109,6 +112,7 @@ CudaDeviceFunction float2 Color() {
         }
         return ret;
 }
+
 CudaDeviceFunction void BounceBack()
 {
     vector_t p1,p2;
@@ -136,6 +140,16 @@ CudaDeviceFunction void WVelocity()
 <?R ZouHe(EQ, 1, 1, "velocity") ?>
 }
 
+CudaDeviceFunction void FVelocity()
+{
+<?R ZouHe(EQ, 3, -1, "velocity") ?>
+}
+
+CudaDeviceFunction void BVelocity()
+{
+<?R ZouHe(EQ, 3, 1, "velocity") ?>
+}
+
 CudaDeviceFunction void SVelocity()
 {
 <?R ZouHe(EQ, 2, 1, "velocity") ?>
@@ -151,6 +165,16 @@ CudaDeviceFunction void WPressure()
 <?R ZouHe(EQ, 1, 1, "pressure") ?>
 }
 
+CudaDeviceFunction void FPressure()
+{
+<?R ZouHe(EQ, 3, -1, "pressure") ?>
+}
+
+CudaDeviceFunction void BPressure()
+{
+<?R ZouHe(EQ, 3, 1, "pressure") ?>
+}
+
 CudaDeviceFunction void SPressure()
 {
 <?R ZouHe(EQ, 2, 1, "pressure") ?>
@@ -242,26 +266,38 @@ CudaDeviceFunction void RunBoundaries() {
 		WVelocityTurbulent();
 		break;
 	case NODE_EPressure:
-        EPressure();
-        break;
+		EPressure();
+		break;
 	case NODE_WPressure:
 		WPressure();
 		break;
+	case NODE_FPressure:
+		FPressure();
+		break;
+	case NODE_BPressure:
+		BPressure();
+		break;
 	case NODE_SPressure:
-        SPressure();
-        break;
+		SPressure();
+		break;
 	case NODE_NPressure:
-        NPressure();
-        break;
+		NPressure();
+		break;
 	case NODE_WVelocity:
 		WVelocity();
 		break;
-	 case NODE_NVelocity:
-        NVelocity();
-        break;
-	 case NODE_SVelocity:
-        SVelocity();
-        break;
+	case NODE_FVelocity:
+		FVelocity();
+		break;
+	case NODE_BVelocity:
+		BVelocity();
+		break;
+	case NODE_NVelocity:
+		NVelocity();
+		break;
+	case NODE_SVelocity:
+		SVelocity();
+		break;
 	case NODE_EVelocity:
 		EVelocity();
 		break;
@@ -518,3 +554,4 @@ varUYUZ = varUYUZ + getU_().y*getU_().z;
 #endif
 }
 
+
diff --git a/src/ArbLattice.cpp b/src/ArbLattice.cpp
index 23129f962..7abf849ca 100644
--- a/src/ArbLattice.cpp
+++ b/src/ArbLattice.cpp
@@ -23,6 +23,7 @@ ArbLattice::ArbLattice(size_t num_snaps_, const UnitEnv& units_, const std::map<
 
 void ArbLattice::initialize(size_t num_snaps_, const std::map<std::string, int>& setting_zones, pugi::xml_node arb_node) {
     const int rank = mpitools::MPI_Rank(comm);
+    sample = std::make_unique<Sampler>(model.get(), units, rank);
     sizes.snaps = num_snaps_;
 #ifdef ADJOINT
     sizes.snaps += 2;  // Adjoint snaps are appended to the total snap allocation
@@ -549,7 +550,28 @@ std::vector<real_t> ArbLattice::getCoord(const Model::Coord& d, real_t scale) {
     return ret;
 }
 
-#include <iostream>
+int ArbLattice::getId(const double &x, const double &y, const double &z) {
+    int id = 0;
+    double epsilon = std::fabs(connect.coord(0, 0)-x) + std::fabs(connect.coord(1, 0)-y) + std::fabs(connect.coord(2, 0)-z);
+    for (int i = 1; i < getLocalSize(); i++) {
+        double tmp = std::fabs(connect.coord(0, i)-x) + std::fabs(connect.coord(1, i)-y) + std::fabs(connect.coord(2, i)-z);
+        if (tmp < epsilon) {
+            epsilon = tmp;
+            id = i;
+        }
+    }
+    return id;
+}
+
+void ArbLattice::getSample(int quant, lbRegion r, real_t scale, real_t *buf) {
+    setSnapIn(Snap);
+#ifdef ADJOINT
+    setAdjSnapIn(aSnap);
+#endif
+    int lid = getId(r.x, r.y, r.z);
+    launcher.SampleQuantity(quant, lid, buf, scale, data);
+}
+
 
 void ArbLattice::initCommManager() {
     if (mpitools::MPI_Size(comm) == 1) return;
@@ -843,4 +865,20 @@ void ArbLattice::resetAverage(){
             CudaMemset(&getSnapPtr(Snap)[f.id*sizes.snaps_pitch], 0, sizes.snaps_pitch*sizeof(real_t));
         }
     }
-}
\ No newline at end of file
+}
+
+void ArbLattice::updateAllSamples(){
+    const int rank = mpitools::MPI_Rank(comm);
+    if (sample->size != 0) {
+	    for (size_t j = 0; j < sample->spoints.size(); j++) {
+		    if (rank == sample->spoints[j].rank) {
+		        for(const Model::Quantity& q : model->quantities) {
+                    if (sample->quant->in(q.name.c_str())){
+                        double v = sample->units->alt(q.unit.c_str());
+                        getSample(q.id, sample->spoints[j].location, 1/v, &sample->gpu_buffer[sample->location[q.name.c_str()]+(data.iter - sample->startIter)*sample->size + sample->totalIter*j*sample->size]); 
+                    }
+                }
+	        } 
+        }
+    }
+}
diff --git a/src/ArbLattice.hpp b/src/ArbLattice.hpp
index 4883ff4b6..9665f81f8 100644
--- a/src/ArbLattice.hpp
+++ b/src/ArbLattice.hpp
@@ -13,6 +13,7 @@
 #include "Region.h"
 #include "pugixml.hpp"
 #include "utils.h"
+#include "Sampler.h"
 
 /// Note on node numbering: We have 2 indexing schemes - a global and a local one.
 /// Globally, nodes are numbered such that consecutive ranks own monotonically increasing intervals, (e.g. rank 0 owns nodes [0, n_0), rank 1 owns nodes [n_0, n_1), etc.) This is required for ParMETIS, but it is also a convenient scheme in general.
@@ -71,6 +72,7 @@ class ArbLattice : public LatticeBase {
    public:
     static constexpr size_t Q = Model_m::Q;    /// Stencil size
     static constexpr size_t NF = Model_m::NF;  /// Number of fields
+    std::unique_ptr<Sampler> sample;           /// initializing sample with zero size
 
     ArbLattice(size_t num_snaps_, const UnitEnv& units_, const std::map<std::string, int>& setting_zones, pugi::xml_node arb_node, MPI_Comm comm_);
     ArbLattice(const ArbLattice&) = delete;
@@ -90,6 +92,8 @@ class ArbLattice : public LatticeBase {
     virtual std::vector<real_t> getField(const Model::Field& f);
     virtual std::vector<real_t> getFieldAdj(const Model::Field& f);
     virtual std::vector<real_t> getCoord(const Model::Coord& q, real_t scale = 1);
+    virtual int getId(const double &x,const  double &y, const double &z);
+    virtual void getSample(int quant, lbRegion r, real_t scale, real_t *buf);
 
     virtual void setFlags(const std::vector<big_flag_t>& x);
     virtual void setField(const Model::Field& f, const std::vector<real_t>& x);
@@ -101,6 +105,7 @@ class ArbLattice : public LatticeBase {
     const std::vector<unsigned>& getLocalPermutation() const { return local_permutation; }
 
     void resetAverage();
+    void updateAllSamples();
 
    protected:
     ArbLatticeLauncher launcher;  /// Launcher responsible for running CUDA kernels on the lattice
diff --git a/src/ArbLatticeLauncher.h.Rt b/src/ArbLatticeLauncher.h.Rt
index 31c604a97..4e1fd12da 100644
--- a/src/ArbLatticeLauncher.h.Rt
+++ b/src/ArbLatticeLauncher.h.Rt
@@ -21,6 +21,7 @@ struct ArbLatticeLauncher {
     void unpack(CudaStream_t stream) const;
 
     void getQuantity(int quant, real_t* host_tab, real_t scale, const LatticeData& data) const;
+    void SampleQuantity(int quant, int lid, real_t *host_tab, real_t scale, const LatticeData& data);
 
 private:
 <?R for (q in rows(Quantities)) { ifdef(q$adjoint);
@@ -31,8 +32,12 @@ if (q$adjoint) {
 }
 ?>
     void getQuantity<?%s q$name ?>(<?%s q$type ?>* tab, real_t scale, const LatticeData& data) const;
+    void getSample<?%s q$name ?>(int lid, <?%s q$type ?>* tab, real_t scale, const LatticeData& data) const;
 <?R }
 ifdef() ?>
 };
 
 #endif  // ARBLATTICELAUNCHER_H
+
+
+
diff --git a/src/ArbLatticeLauncher.hpp.Rt b/src/ArbLatticeLauncher.hpp.Rt
index ecddb46d8..b5c4b0a7d 100644
--- a/src/ArbLatticeLauncher.hpp.Rt
+++ b/src/ArbLatticeLauncher.hpp.Rt
@@ -87,6 +87,23 @@ ifdef();
 	}
 }
 
+void ArbLatticeLauncher::SampleQuantity(int quant, int lid, real_t* host_tab, real_t scale, const LatticeData &data) {
+    switch(quant) {	<?R
+for (q in rows(Quantities)) { ifdef(q$adjoint);
+?>
+        case <?%s q$Index ?>: {
+            const auto gpu_tab = cudaMakeUnique<<?%s q$type ?>>(1);
+            getSample<?%s q$name ?>(lid, gpu_tab.get(), scale, data);
+            CudaMemcpy(host_tab, gpu_tab.get(), sizeof(<?%s q$type ?>), CudaMemcpyDeviceToHost);
+            break;
+        } <?R
+}
+ifdef();
+?>
+	}
+}
+
+
 <?R for (q in rows(Quantities)) { ifdef(q$adjoint);
 if (q$adjoint) {
   node = "Node_Adj"
@@ -128,10 +145,47 @@ if (q$type == "vector_t") {
     }
 };
 
+struct GetSampleArbExecutor<?%s q$name?> : public LinearExecutor {
+    ArbLatticeContainer container;
+    LatticeData data;
+    int lid;
+    <?%s q$type ?>* buf;
+    real_t scale;
+
+    CudaDeviceFunction void Execute() const {
+        using LA = ArbLatticeAccess; <?R
+        if (q$adjoint) { ?>
+            using N = Node< LA, Adjoint, NoGlobals, Get >; <?R
+        } else { ?>
+            using N = Node< LA, Primal, NoGlobals, Get >; <?R
+        }?>
+        LA acc(lid, container);
+        N now(acc, data);
+        acc.pop(now); <?R
+        if (q$adjoint) { ?>
+            acc.pop_adj(now); <?R
+        } ?>
+        auto val = now.get<?%s q$name ?>(); <?R
+        if (q$type == "vector_t") {
+            for (coef in c("x","y","z")) { ?>
+                val.<?%s coef ?> *= scale; <?R
+            }
+        } else { ?>
+            val *= scale; <?R
+        } ?>
+        buf[0] = val;    
+    }
+};
+
 void ArbLatticeLauncher::getQuantity<?%s q$name ?>(<?%s q$type ?>* tab, real_t scale, const LatticeData& data) const {
     const GetQuantityArbExecutor<?%s q$name?> executor{{container.num_border_nodes + container.num_interior_nodes}, container, data, tab, scale};
     LaunchExecutor(executor);
 }
+
+void ArbLatticeLauncher::getSample<?%s q$name ?>(int lid, <?%s q$type ?>* tab, real_t scale, const LatticeData& data) const {
+    const GetSampleArbExecutor<?%s q$name?> executor{{1}, container, data, lid, tab, scale};
+    LaunchExecutor(executor);
+}
 <?R }
 ifdef() ?>
 
diff --git a/src/Handlers/cbFailcheck.cpp b/src/Handlers/cbFailcheck.cpp
index b118cae40..812b4cf4f 100644
--- a/src/Handlers/cbFailcheck.cpp
+++ b/src/Handlers/cbFailcheck.cpp
@@ -1,4 +1,5 @@
 #include "cbFailcheck.h"
+#include <algorithm>
 
 std::string cbFailcheck::xmlname = "Failcheck";
 
diff --git a/src/Handlers/cbSample.cpp b/src/Handlers/cbSample.cpp
index 295488d85..2587c3230 100644
--- a/src/Handlers/cbSample.cpp
+++ b/src/Handlers/cbSample.cpp
@@ -3,20 +3,20 @@ std::string cbSample::xmlname = "Sample";
 #include "../HandlerFactory.h"
 
 int cbSample::Init () {
-		std::string nm="Sampler";
-		Callback::Init();
-		if (everyIter == 0) {
-                	error("Iteration value in sampler should not be zero");
-                	return -1;
-                }
-		pugi::xml_attribute attr=node.attribute("what");
-		if (attr) {
-			s.add_from_string(attr.value(),',');
-		}
-		else {
-			s.add_from_string("all",',');
-		}
-                const auto lattice = solver->getCartLattice();
+	std::string nm="Sampler";
+	Callback::Init();
+	if (everyIter == 0) {
+				error("Iteration value in sampler should not be zero");
+				return -1;
+			}
+	pugi::xml_attribute attr=node.attribute("what");
+	if (attr) {
+		s.add_from_string(attr.value(),',');
+	}
+	else {
+		s.add_from_string("all",',');
+	}
+	const auto init_cartesian = [&](const Lattice<CartLattice>* lattice){
 		for (pugi::xml_node par = node.first_child(); par; par = par.next_sibling()) {
 			if (strcmp(par.name(),"Point") == 0) {
 				lbRegion loc;
@@ -44,23 +44,72 @@ int cbSample::Init () {
 		lattice->sample->mpi_rank = solver->mpi_rank;
 		lattice->sample->Allocate(&s,startIter,everyIter);
 		lattice->sample->initCSV(filename.c_str());
-		return 0;
-		}
+		return EXIT_SUCCESS;
+	};
+	const auto init_arbitrary = [&](const Lattice<ArbLattice>* lattice) {
+      for (pugi::xml_node par = node.first_child(); par; par = par.next_sibling()) {
+        if (strcmp(par.name(),"Point") == 0) {
+          lbRegion loc;
+          attr = par.attribute("dx");
+          if (attr) {
+            loc.x = solver->units.si(attr.value());
+            loc.dx = solver->units.alt(attr.value());
+          }
+          attr = par.attribute("dy");
+          if (attr) {
+            loc.y = solver->units.si(attr.value());
+            loc.dy = solver->units.alt(attr.value());
+          }
+          attr = par.attribute("dz");
+          if (attr) {
+            loc.z = solver->units.si(attr.value());
+            loc.dz = solver->units.alt(attr.value());
+          }
+		  if (loc.nx == 1) lattice->sample->addPoint(loc, solver->mpi_rank);
+        } else {
+          error("Uknown element in Sampler\n");
+          return -1;
+        }
+      }
+      filename = solver->outIterFile(nm, ".csv");
+      lattice->sample->units = &solver->units;
+      lattice->sample->mpi_rank = solver->mpi_rank;
+      lattice->sample->Allocate(&s,startIter,everyIter);
+      lattice->sample->initCSV(filename.c_str());
+      return EXIT_SUCCESS;
+    };
+	return std::visit(OverloadSet{init_cartesian, init_arbitrary}, solver->getLatticeVariant());
+}
 
 
 int cbSample::DoIt () {
-		Callback::DoIt();
-                const auto lattice = solver->getCartLattice();
-		lattice->sample->writeHistory(solver->iter);
-		lattice->sample->startIter = solver->iter;
-		return 0;
-		}
+	const auto do_cartesian = [&](Lattice<CartLattice>* lattice) {
+      lattice->sample->writeHistory(solver->iter);
+      lattice->sample->startIter = solver->iter;
+      return EXIT_SUCCESS;
+    };
+
+    const auto do_arbitrary = [&](Lattice<ArbLattice>* lattice) {
+      lattice->sample->writeHistory(solver->iter);
+      lattice->sample->startIter = solver->iter;
+      return EXIT_SUCCESS;
+    };
+    return std::visit(OverloadSet{do_cartesian, do_arbitrary}, solver->getLatticeVariant());
+}
 
 
 int cbSample::Finish () {
-	   solver->getCartLattice()->sample->Finish();
-	   return Callback::Finish();
-	 }	 
+	const auto end_cartesian = [&](const Lattice<CartLattice>* lattice) {
+        lattice->sample->Finish();
+        return Callback::Finish();
+    };
+
+    const auto end_arbitrary = [&](const Lattice<ArbLattice>* lattice) {
+        lattice->sample->Finish();
+        return Callback::Finish();
+    };
+    return std::visit(OverloadSet{end_cartesian, end_arbitrary}, solver->getLatticeVariant());
+}	 
 
 
 // Register the handler (basing on xmlname) in the Handler Factory
diff --git a/src/Lattice.hpp.Rt b/src/Lattice.hpp.Rt
index de3ac6bf8..70cc47324 100644
--- a/src/Lattice.hpp.Rt
+++ b/src/Lattice.hpp.Rt
@@ -109,8 +109,7 @@ struct Lattice : public LatticeType {
         CudaDeviceSynchronize();
         LatticeType::Snap = tab_out;
         LatticeType::MarkIteration();
-        if constexpr(std::is_same_v<LatticeType, CartLattice>) /// TODO
-            LatticeType::updateAllSamples();
+        LatticeType::updateAllSamples();
         DEBUG_PROF_POP();
     }
 
diff --git a/src/Region.h b/src/Region.h
index 9d2f197ca..83484b800 100644
--- a/src/Region.h
+++ b/src/Region.h
@@ -8,6 +8,7 @@
 struct lbRegion {
   int dx = 0, dy = 0, dz = 0;
   int nx = 1, ny = 1, nz = 1;
+  float x = 0.0f, y = 0.0f, z = 0.0f;
   lbRegion() = default;
   lbRegion(int w, int h) : nx(w), ny(h) {}
   lbRegion(int x, int y, int w, int h) : dx(x), dy(y), nx(w), ny(h) {}
diff --git a/src/vtkLattice.cpp b/src/vtkLattice.cpp
index befdccf5c..721020a74 100644
--- a/src/vtkLattice.cpp
+++ b/src/vtkLattice.cpp
@@ -4,6 +4,7 @@
 
 #include <cassert>
 #include <cstdio>
+#include <algorithm>
 
 #include "Global.h"
 #include "cross.h"
diff --git a/src/vtuOutput.cpp b/src/vtuOutput.cpp
index 5d06cc4a1..01b7de16b 100644
--- a/src/vtuOutput.cpp
+++ b/src/vtuOutput.cpp
@@ -1,6 +1,7 @@
 #include "vtuOutput.h"
 
 #include "mpitools.hpp"
+#include <stdexcept>
 
 VtkFileOut::VtkFileOut(std::string name_, size_t num_cells_, size_t num_points_, const double* coords, const unsigned* verts, MPI_Comm comm_, bool has_scalars, bool has_vectors) : name(std::move(name_)), comm(comm_), num_cells(num_cells_), num_points(num_points_) {
     init();