From ad53bcf19d1f478065a48a4e9516ae4813337480 Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Mon, 20 Dec 2021 16:51:47 +0100 Subject: [PATCH 01/15] Desktop 3nd --- pasture-tools/Cargo.toml | 13 ++- .../src/acceleration_structures/gpu_octree.rs | 100 ++++++++++++++++ .../src/acceleration_structures/mod.rs | 2 + pasture-tools/src/lib.rs | 3 + pasture-tools/src/main.rs | 110 ++++++++++++++++++ 5 files changed, 226 insertions(+), 2 deletions(-) create mode 100644 pasture-tools/src/acceleration_structures/gpu_octree.rs create mode 100644 pasture-tools/src/acceleration_structures/mod.rs create mode 100644 pasture-tools/src/lib.rs create mode 100644 pasture-tools/src/main.rs diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index 6b1128b..e0a9fb5 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -12,7 +12,7 @@ categories = ["data-structures", "command-line-utilities"] readme = "README.md" [dependencies] -pasture-core = {version = "=0.2.0", path = "../pasture-core" } +pasture-core = {version = "=0.2.0", path = "../pasture-core", features=["gpu"]} pasture-io = {version = "=0.2.0", path = "../pasture-io" } pasture-algorithms = {version = "=0.2.0", path = "../pasture-algorithms" } pasture-derive = {version = "=0.2.0", path = "../pasture-derive" } @@ -23,6 +23,12 @@ pretty_env_logger = "0.4.0" plotters = "^0.3.0" rand = {version = "0.8.3", features = ["small_rng"] } +#gpu related +wgpu = { version = "0.11.0", features = ["spirv"], optional = true } +shaderc = { version = "0.7.2", optional = true } +futures = { version = "0.3", optional = true } +bytemuck = { version = "1.5.1", optional = true } + [[bin]] name = "reorder_laz_chunks" @@ -30,4 +36,7 @@ name = "reorder_laz_chunks" name = "plotting" [[bin]] -name = "info" \ No newline at end of file +name = "info" + +[features] +gpu = ["pasture-core/gpu", "wgpu", "shaderc", "futures", "bytemuck"] diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs 
b/pasture-tools/src/acceleration_structures/gpu_octree.rs new file mode 100644 index 0000000..71573d0 --- /dev/null +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -0,0 +1,100 @@ +use pasture_core::{ + containers::{ + InterleavedPointBufferMut, InterleavedVecPointStorage, PointBuffer, PointBufferExt, + }, + gpu, + layout::{attributes, PointLayout}, + nalgebra::Vector3, +}; +use pasture_derive::PointType; +use wgpu; + +#[repr(C)] +#[derive(PointType, Debug)] +struct MyPointType { + #[pasture(BUILTIN_POSITION_3D)] + pub position: Vector3, + #[pasture(BUILTIN_COLOR_RGB)] + pub icolor: Vector3, + #[pasture(attribute = "MyColorF32")] + pub fcolor: Vector3, + #[pasture(attribute = "MyVec3U8")] + pub byte_vec: Vector3, + #[pasture(BUILTIN_CLASSIFICATION)] + pub classification: u8, + #[pasture(BUILTIN_INTENSITY)] + pub intensity: u16, + #[pasture(BUILTIN_SCAN_ANGLE)] + pub scan_angle: i16, + #[pasture(BUILTIN_SCAN_DIRECTION_FLAG)] + pub scan_dir_flag: bool, + #[pasture(attribute = "MyInt32")] + pub my_int: i32, + #[pasture(BUILTIN_WAVEFORM_PACKET_SIZE)] + pub packet_size: u32, + #[pasture(BUILTIN_RETURN_POINT_WAVEFORM_LOCATION)] + pub ret_point_loc: f32, + #[pasture(BUILTIN_GPS_TIME)] + pub gps_time: f64, +} + +struct Boundary { + nw_front: f32, + nw_back: f32, + sw_front: f32, + sw_back: f32, + ne_front: f32, + ne_back: f32, + se_front: f32, + se_back: f32, +} + +trait OctreeNode {} + +struct OctreeRegion { + boundary: Boundary, + points: [Vector3; 8], +} + +struct OctreeLeaf { + point: Vector3, +} + +pub struct GpuOctree<'a> { + device: gpu::Device<'a>, + buffer: Option, + buffer_size: Option, + buffer_binding: Option, + point_buffer: &'a dyn PointBuffer, +} + +impl GpuOctree<'_> { + pub async fn new<'a>(point_buffer: &'a dyn PointBuffer) -> GpuOctree<'a> { + let device = gpu::Device::new(gpu::DeviceOptions { + device_power: gpu::DevicePower::High, + device_backend: gpu::DeviceBackend::Vulkan, + use_adapter_features: true, + use_adapter_limits: true, + 
}) + .await; + + GpuOctree { + device: device.unwrap(), + buffer: None, + buffer_size: None, + buffer_binding: None, + point_buffer: point_buffer, + } + } + pub fn construct(&mut self, layout: PointLayout) { + let point_count = self.point_buffer.len(); + let points_in_byte: usize = point_count * 8 * 3; + let mut raw_points = vec![0 as u8; points_in_byte]; + self.point_buffer.get_raw_attribute_range( + 0..point_count, + &attributes::POSITION_3D, + raw_points.as_mut_slice(), + ); + + } +} diff --git a/pasture-tools/src/acceleration_structures/mod.rs b/pasture-tools/src/acceleration_structures/mod.rs new file mode 100644 index 0000000..a30ce70 --- /dev/null +++ b/pasture-tools/src/acceleration_structures/mod.rs @@ -0,0 +1,2 @@ +mod gpu_octree; +pub use self::gpu_octree::*; diff --git a/pasture-tools/src/lib.rs b/pasture-tools/src/lib.rs new file mode 100644 index 0000000..1214913 --- /dev/null +++ b/pasture-tools/src/lib.rs @@ -0,0 +1,3 @@ +extern crate self as pasture_tools; + +pub mod acceleration_structures; diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs new file mode 100644 index 0000000..2e42c0d --- /dev/null +++ b/pasture-tools/src/main.rs @@ -0,0 +1,110 @@ +#[cfg(feature = "gpu")] +mod ex { + + use pasture_core::containers::{InterleavedVecPointStorage, PointBuffer, PointBufferExt}; + use pasture_core::gpu; + use pasture_core::gpu::GpuPointBufferInterleaved; + use pasture_core::layout::PointType; + use pasture_core::layout::{attributes, PointAttributeDataType, PointAttributeDefinition}; + use pasture_core::nalgebra::Vector3; + use pasture_derive::PointType; + + #[repr(C)] + #[derive(PointType, Debug)] + struct MyPointType { + #[pasture(BUILTIN_POSITION_3D)] + pub position: Vector3, + #[pasture(BUILTIN_COLOR_RGB)] + pub icolor: Vector3, + #[pasture(attribute = "MyColorF32")] + pub fcolor: Vector3, + #[pasture(attribute = "MyVec3U8")] + pub byte_vec: Vector3, + #[pasture(BUILTIN_CLASSIFICATION)] + pub classification: u8, + 
#[pasture(BUILTIN_INTENSITY)] + pub intensity: u16, + #[pasture(BUILTIN_SCAN_ANGLE)] + pub scan_angle: i16, + #[pasture(BUILTIN_SCAN_DIRECTION_FLAG)] + pub scan_dir_flag: bool, + #[pasture(attribute = "MyInt32")] + pub my_int: i32, + #[pasture(BUILTIN_WAVEFORM_PACKET_SIZE)] + pub packet_size: u32, + #[pasture(BUILTIN_RETURN_POINT_WAVEFORM_LOCATION)] + pub ret_point_loc: f32, + #[pasture(BUILTIN_GPS_TIME)] + pub gps_time: f64, + } + + pub fn main() { + futures::executor::block_on(run()); + } + + async fn run() { + // == Init point buffer ====================================================================== + + let points = vec![ + MyPointType { + position: Vector3::new(1.0, 0.0, 0.0), + icolor: Vector3::new(255, 0, 0), + fcolor: Vector3::new(1.0, 1.0, 1.0), + byte_vec: Vector3::new(1, 0, 0), + classification: 1, + intensity: 1, + scan_angle: -1, + scan_dir_flag: true, + my_int: -100000, + packet_size: 1, + ret_point_loc: 1.0, + gps_time: 1.0, + }, + MyPointType { + position: Vector3::new(0.0, 1.0, 0.0), + icolor: Vector3::new(0, 255, 0), + fcolor: Vector3::new(0.0, 1.0, 0.0), + byte_vec: Vector3::new(0, 1, 0), + classification: 2, + intensity: 2, + scan_angle: -2, + scan_dir_flag: false, + my_int: -200000, + packet_size: 2, + ret_point_loc: 2.0, + gps_time: 2.0, + }, + MyPointType { + position: Vector3::new(0.0, 0.0, 1.0), + icolor: Vector3::new(0, 0, 255), + fcolor: Vector3::new(0.0, 0.0, 1.0), + byte_vec: Vector3::new(0, 0, 1), + classification: 3, + intensity: 3, + scan_angle: -3, + scan_dir_flag: true, + my_int: -300000, + packet_size: 3, + ret_point_loc: 3.0, + gps_time: 3.0, + }, + ]; + let a: f64 = 1.0; + println!("{:?}", a.to_be_bytes()); + let layout = MyPointType::layout(); + let mut point_buffer = InterleavedVecPointStorage::new(layout); + point_buffer.push_points(points.as_slice()); + + let mut octree = + pasture_tools::acceleration_structures::GpuOctree::new(&point_buffer).await; + octree.construct(MyPointType::layout()); + } +} + +#[cfg(feature = 
"gpu")] +fn main() { + ex::main(); +} + +#[cfg(not(feature = "gpu"))] +fn main() {} From eece1aeb1eb5279d7fcffcdb3b5d65a0b476716c Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Mon, 10 Jan 2022 13:16:28 +0100 Subject: [PATCH 02/15] Desktop 4th --- .../src/acceleration_structures/gpu_octree.rs | 190 ++++++++++++++---- .../acceleration_structures/shaders/comp.spv | Bin 0 -> 2076 bytes .../shaders/find_max_values.comp | 35 ++++ pasture-tools/src/main.rs | 27 ++- 4 files changed, 211 insertions(+), 41 deletions(-) create mode 100644 pasture-tools/src/acceleration_structures/shaders/comp.spv create mode 100644 pasture-tools/src/acceleration_structures/shaders/find_max_values.comp diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 71573d0..06f1dd9 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -1,12 +1,15 @@ +use pasture_core::containers::attr1::AttributeIteratorByValue; use pasture_core::{ containers::{ InterleavedPointBufferMut, InterleavedVecPointStorage, PointBuffer, PointBufferExt, }, gpu, - layout::{attributes, PointLayout}, + layout::{attributes, PointAttributeDataType, PointAttributeDefinition, PointLayout}, nalgebra::Vector3, }; use pasture_derive::PointType; +use std::convert::TryInto; + use wgpu; #[repr(C)] @@ -38,63 +41,178 @@ struct MyPointType { pub gps_time: f64, } -struct Boundary { - nw_front: f32, - nw_back: f32, - sw_front: f32, - sw_back: f32, - ne_front: f32, - ne_back: f32, - se_front: f32, - se_back: f32, -} - -trait OctreeNode {} - struct OctreeRegion { - boundary: Boundary, - points: [Vector3; 8], + bounds: [Vector3; 8], + partition: Vec, + points_per_partition: Vec, + start: usize, + end: usize, + children: Option<[Box; 8]>, } struct OctreeLeaf { point: Vector3, } +enum OctreeNode { + OctreeRegion, + OctreeLeaf, +} + pub struct GpuOctree<'a> { - device: gpu::Device<'a>, - 
buffer: Option, - buffer_size: Option, - buffer_binding: Option, + device: &'a mut gpu::Device<'a>, + buffer_bind_group: Option, + buffer_bind_group_layout: Option, point_buffer: &'a dyn PointBuffer, + root_node: Option, } -impl GpuOctree<'_> { - pub async fn new<'a>(point_buffer: &'a dyn PointBuffer) -> GpuOctree<'a> { - let device = gpu::Device::new(gpu::DeviceOptions { - device_power: gpu::DevicePower::High, - device_backend: gpu::DeviceBackend::Vulkan, - use_adapter_features: true, - use_adapter_limits: true, - }) - .await; +impl OctreeRegion {} +impl<'a> GpuOctree<'a> { + pub fn new(point_buffer: &'a dyn PointBuffer, device: &'a mut gpu::Device<'a>) -> Self { GpuOctree { - device: device.unwrap(), - buffer: None, - buffer_size: None, - buffer_binding: None, + device: device, + buffer_bind_group: None, + buffer_bind_group_layout: None, point_buffer: point_buffer, + root_node: None, } } - pub fn construct(&mut self, layout: PointLayout) { + pub async fn construct(&'a mut self, layout: PointLayout) { let point_count = self.point_buffer.len(); - let points_in_byte: usize = point_count * 8 * 3; - let mut raw_points = vec![0 as u8; points_in_byte]; + let mut points: Vec> = Vec::new(); + let point_iterator: AttributeIteratorByValue, dyn PointBuffer> = + self.point_buffer.iter_attribute(&attributes::POSITION_3D); + let mut raw_points = vec![0u8; 24 * point_count]; self.point_buffer.get_raw_attribute_range( 0..point_count, &attributes::POSITION_3D, raw_points.as_mut_slice(), ); + let mut blah: Vec = raw_points + .chunks_exact(8) + .map(|b| f64::from_ne_bytes(b.try_into().unwrap())) + .collect(); + for point in point_iterator { + points.push(point); + } + println!("{:?}", blah); + + let max = self.find_max(&raw_points, point_count).await; + println!("{}", max); + // let mut root_node = OctreeRegion { + // bounds: [Vector3::new(f64::MIN, f64::MAX, f64::MIN), Vector3::new(f64::MAX, f64::MAX, f64::MIN),] + // partition: vec![point_count], + // points_per_partition: 
vec![point_count], + // start: 0, + // end: point_count - 1, + // children: None, + // }; + //let mut current_nodes = vec![&root_node]; + let tree_depth = 1; + let new_nodes_created = false; + let num_leaves = 0; + loop { + if !new_nodes_created { + break; + } + } + } + async fn find_max(&'a mut self, points: &[u8], count: usize) -> u32 { + let mut result: u32 = 0; + let mut max_value_buffer = wgpu::util::DeviceExt::create_buffer_init( + &self.device.wgpu_device, + &wgpu::util::BufferInitDescriptor { + label: Some("max_value_buffer"), + contents: &result.to_ne_bytes(), + usage: wgpu::BufferUsages::STORAGE + | wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST, + }, + ); + + let buffer = Some(wgpu::util::DeviceExt::create_buffer_init( + &self.device.wgpu_device, + &wgpu::util::BufferInitDescriptor { + label: Some("gpu_point_buffer"), + contents: points, + usage: wgpu::BufferUsages::STORAGE, + }, + )); + + self.buffer_bind_group_layout = Some(self.device.wgpu_device.create_bind_group_layout( + &wgpu::BindGroupLayoutDescriptor { + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + label: Some("compute_bind_group_layout"), + }, + )); + + self.buffer_bind_group = Some(self.device.wgpu_device.create_bind_group( + &wgpu::BindGroupDescriptor { + label: Some("storage_bind_group"), + layout: self.buffer_bind_group_layout.as_ref().unwrap(), + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: 
buffer.as_ref().unwrap().as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: max_value_buffer.as_entire_binding(), + }, + ], + }, + )); + self.device.set_bind_group( + 0, + self.buffer_bind_group_layout.as_ref().unwrap(), + self.buffer_bind_group.as_ref().unwrap(), + ); + + self.device + .set_compute_shader_glsl(include_str!("shaders/find_max_values.comp")); + self.device.compute(count as u32, 1, 1); + + let max_value_buffer_slice = max_value_buffer.slice(..); + let mapped_future = max_value_buffer_slice.map_async(wgpu::MapMode::Read); + self.device.wgpu_device.poll(wgpu::Maintain::Wait); + + if let Ok(()) = mapped_future.await { + let mapped_max_value_buffer = max_value_buffer.slice(..).get_mapped_range().to_vec(); + println!("{:?}", mapped_max_value_buffer); + println!("----------"); + println!("{:?}", points); + result = mapped_max_value_buffer + .chunks_exact(4) + .map(|b| u32::from_ne_bytes(b.try_into().unwrap())) + .last() + .unwrap(); + } + result } } diff --git a/pasture-tools/src/acceleration_structures/shaders/comp.spv b/pasture-tools/src/acceleration_structures/shaders/comp.spv new file mode 100644 index 0000000000000000000000000000000000000000..406c88ed51514b6d81bb2158dc2b2a7de6f3cdd3 GIT binary patch literal 2076 zcmZ9M=}uHZ5QSS{h7nLeS=>MtcLY%ZQ3Tm!Kv7X2fDlql^k~*ds`d(2dhWb++Z&EshXR}IWzhrq|Z8Tn=&)sNqiD7c8|PoM?Lcs)z~|m8z8o}33FM03NL5A1C&#?#NPR!;0^LT4L%eV{g+q5V3 zjD0ue+NkWu$6aoM)s4DAuyw&d0IMCm?<&tP5SI|}PJS!T?4e>A?>=+xHTHX%lYdpk z){pUa%&Y5iUcbor+jOtr%-@!NhO9F`jrm>t7L(Md(~Oy)Lcx0D;Ld@~&7k7EE?~~s zIm(5OGxUASH=$~L33EpFCl{KR!Ok@_d%&S7m)GnimJiK7aA?YXg--NZ^eE^e>zHfu0Z@JJM z0{eDi7I(m*DVNtACYBG)yWr52i&@+Qd!D?r7{R-oh1`A2oU;%|FQed?g*_QNi>TE{ z?O3%o`W`2azU6%XF^h*_`RMx*IQo_g&BtJWC!skB4o$hd<`l7fXg&dlrd-To8ti%U z&SDntau(D08O)rs5JxX_U}y1zcVEQMWA^6v@O{ch?Ne~f%^G9p7P|WHoM->q0v3DA zVE2BhzT-u(InS`)XLy(W$}M5$>{lE*FTuyZsY3QYVujfM8=aVM+t?cWiv9Y(Ol=(d E2Uoj+KmY&$ literal 0 HcmV?d00001 diff --git 
a/pasture-tools/src/acceleration_structures/shaders/find_max_values.comp b/pasture-tools/src/acceleration_structures/shaders/find_max_values.comp new file mode 100644 index 0000000..aeb7c8e --- /dev/null +++ b/pasture-tools/src/acceleration_structures/shaders/find_max_values.comp @@ -0,0 +1,35 @@ +#version 450 + +layout(std430, set=0, binding=0) buffer PointCloud { + dvec3 pointBuffer[]; +}; + +layout(std430, set=0, binding=1) buffer ResultBuffer { + uint resultBuffer; +}; + +shared uint largest_x; +shared uint largest_y; +shared uint largest_z; + +void main() { + uint idx = gl_LocalInvocationID.x; + + if(abs(pointBuffer[idx].x) > abs(pointBuffer[largest_x].x)) { + // atomicExchange(largest_x, idx); + largest_x = idx; + } + if(abs(pointBuffer[idx].y) > abs(pointBuffer[largest_y].y)) { + //atomicExchange(largest_y, idx); + largest_y = idx; + } + if(abs(pointBuffer[idx].z) > abs(pointBuffer[largest_z].z)) { + atomicExchange(largest_z, idx); + largest_z = idx; + } + + barrier(); + if(idx == 0){ + resultBuffer = largest_x; + } +} diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs index 2e42c0d..d2b36d8 100644 --- a/pasture-tools/src/main.rs +++ b/pasture-tools/src/main.rs @@ -89,15 +89,30 @@ mod ex { gps_time: 3.0, }, ]; - let a: f64 = 1.0; - println!("{:?}", a.to_be_bytes()); + let layout = MyPointType::layout(); let mut point_buffer = InterleavedVecPointStorage::new(layout); point_buffer.push_points(points.as_slice()); + let device = gpu::Device::new(gpu::DeviceOptions { + device_power: gpu::DevicePower::High, + device_backend: gpu::DeviceBackend::Vulkan, + use_adapter_features: true, + use_adapter_limits: true, + }) + .await; + + let mut device = match device { + Ok(d) => d, + Err(_) => { + println!("Failed to request device. 
Aborting."); + return; + } + }; + let mut octree = - pasture_tools::acceleration_structures::GpuOctree::new(&point_buffer).await; - octree.construct(MyPointType::layout()); + pasture_tools::acceleration_structures::GpuOctree::new(&point_buffer, &mut device); + octree.construct(MyPointType::layout()).await; } } @@ -107,4 +122,6 @@ fn main() { } #[cfg(not(feature = "gpu"))] -fn main() {} +fn main() { + println!("Whoops"); +} From ec1e6dee1fd27f955f59651b9acf9ce074bdb5b3 Mon Sep 17 00:00:00 2001 From: jneus Date: Tue, 18 Jan 2022 16:08:10 +0100 Subject: [PATCH 03/15] Laptop: 1st --- .../src/acceleration_structures/gpu_octree.rs | 532 ++++++++++++++---- .../shaders/generate_nodes.comp | 195 +++++++ pasture-tools/src/main.rs | 29 +- 3 files changed, 630 insertions(+), 126 deletions(-) create mode 100644 pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 06f1dd9..d548ab2 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -1,4 +1,6 @@ use pasture_core::containers::attr1::AttributeIteratorByValue; +use pasture_core::math::AABB; +use pasture_core::nalgebra::Point3; use pasture_core::{ containers::{ InterleavedPointBufferMut, InterleavedVecPointStorage, PointBuffer, PointBufferExt, @@ -9,7 +11,7 @@ use pasture_core::{ }; use pasture_derive::PointType; use std::convert::TryInto; - +use std::mem; use wgpu; #[repr(C)] @@ -41,45 +43,121 @@ struct MyPointType { pub gps_time: f64, } -struct OctreeRegion { - bounds: [Vector3; 8], - partition: Vec, - points_per_partition: Vec, - start: usize, - end: usize, +struct OctreeNode { + bounds: AABB, children: Option<[Box; 8]>, -} - -struct OctreeLeaf { - point: Vector3, -} - -enum OctreeNode { - OctreeRegion, - OctreeLeaf, + node_partitioning: [u32; 8], + points_per_partition: [u32; 8], + point_start: u32, + 
point_end: u32, } pub struct GpuOctree<'a> { - device: &'a mut gpu::Device<'a>, - buffer_bind_group: Option, - buffer_bind_group_layout: Option, point_buffer: &'a dyn PointBuffer, - root_node: Option, + point_partitioning: Vec, + root_node: Option, + bounds: AABB, + points_per_node: u32, } -impl OctreeRegion {} +impl OctreeNode { + fn is_leaf(&self) -> bool { + return self.children.is_none(); + } + fn into_raw(&self) -> Vec { + let mut raw_node: Vec = Vec::new(); + for coord in self.bounds.min().iter() { + raw_node.append(&mut coord.to_ne_bytes().to_vec()); + } + for coord in self.bounds.max().iter() { + raw_node.append(&mut coord.to_ne_bytes().to_vec()); + } + raw_node.append( + &mut self + .node_partitioning + .map(|x| x.to_ne_bytes()) + .to_vec() + .into_iter() + .flatten() + .collect(), + ); + raw_node.append( + &mut self + .points_per_partition + .map(|x| x.to_ne_bytes()) + .to_vec() + .into_iter() + .flatten() + .collect(), + ); + raw_node.append(&mut self.point_start.to_ne_bytes().to_vec()); + raw_node.append(&mut self.point_end.to_ne_bytes().to_vec()); + + raw_node + } + fn from_raw(mut data: Vec) -> Self { + let raw_bounds: Vec = data.drain(..24).collect(); + let bounds_iter = raw_bounds.chunks_exact(8); + let bounds_min: Point3 = Point3 { + coords: Vector3::from_vec( + bounds_iter + .take(3) + .map(|b| f64::from_ne_bytes(b.try_into().unwrap())) + .collect(), + ), + }; + let raw_bounds: Vec = data.drain(..24).collect(); + let bounds_iter = raw_bounds.chunks_exact(8); + let bounds_max: Point3 = Point3 { + coords: Vector3::from_vec( + bounds_iter + .take(3) + .map(|b| f64::from_ne_bytes(b.try_into().unwrap())) + .collect(), + ), + }; + let mut rest_data: Vec = data + .chunks_exact(4) + .map(|b| u32::from_ne_bytes(b.try_into().unwrap())) + .collect(); + let mut rest_iter = rest_data.iter_mut(); + let mut node_partitioning = [0u32; 8]; + for i in 0..8 { + node_partitioning[i] = *rest_iter.next().unwrap(); + } + let mut points_per_partition = [0u32; 8]; + for i 
in 0..8 { + points_per_partition[i] = *rest_iter.next().unwrap(); + } + let points_start = *rest_iter.next().unwrap(); + let points_end = *rest_iter.next().unwrap(); + + OctreeNode { + bounds: AABB::from_min_max(bounds_min, bounds_max), + children: None, + node_partitioning, + points_per_partition, + point_start: points_start, + point_end: points_end, + } + } +} impl<'a> GpuOctree<'a> { - pub fn new(point_buffer: &'a dyn PointBuffer, device: &'a mut gpu::Device<'a>) -> Self { + pub fn new( + point_buffer: &'a dyn PointBuffer, + max_bounds: AABB, + points_per_node: u32, + ) -> Self { GpuOctree { - device: device, - buffer_bind_group: None, - buffer_bind_group_layout: None, - point_buffer: point_buffer, + point_buffer, + point_partitioning: (0..point_buffer.len() as u32).collect(), root_node: None, + bounds: max_bounds, + points_per_node, } } - pub async fn construct(&'a mut self, layout: PointLayout) { + pub async fn construct(&mut self, layout: PointLayout) { let point_count = self.point_buffer.len(); let mut points: Vec> = Vec::new(); let point_iterator: AttributeIteratorByValue, dyn PointBuffer> = @@ -90,129 +168,339 @@ impl<'a> GpuOctree<'a> { &attributes::POSITION_3D, raw_points.as_mut_slice(), ); - let mut blah: Vec = raw_points - .chunks_exact(8) - .map(|b| f64::from_ne_bytes(b.try_into().unwrap())) - .collect(); for point in point_iterator { points.push(point); } - println!("{:?}", blah); - - let max = self.find_max(&raw_points, point_count).await; - println!("{}", max); - // let mut root_node = OctreeRegion { - // bounds: [Vector3::new(f64::MIN, f64::MAX, f64::MIN), Vector3::new(f64::MAX, f64::MAX, f64::MIN),] - // partition: vec![point_count], - // points_per_partition: vec![point_count], - // start: 0, - // end: point_count - 1, - // children: None, - // }; - //let mut current_nodes = vec![&root_node]; - let tree_depth = 1; - let new_nodes_created = false; - let num_leaves = 0; - loop { - if !new_nodes_created { - break; + + let device = 
gpu::Device::new(gpu::DeviceOptions { + device_power: gpu::DevicePower::High, + device_backend: gpu::DeviceBackend::Vulkan, + use_adapter_features: true, + use_adapter_limits: true, + }) + .await; + + let mut device = match device { + Ok(d) => d, + Err(_) => { + println!("Failed to request device. Aborting."); + return; } - } - } + }; - async fn find_max(&'a mut self, points: &[u8], count: usize) -> u32 { - let mut result: u32 = 0; - let mut max_value_buffer = wgpu::util::DeviceExt::create_buffer_init( - &self.device.wgpu_device, + let gpu_point_buffer = wgpu::util::DeviceExt::create_buffer_init( + &device.wgpu_device, &wgpu::util::BufferInitDescriptor { - label: Some("max_value_buffer"), - contents: &result.to_ne_bytes(), - usage: wgpu::BufferUsages::STORAGE - | wgpu::BufferUsages::MAP_READ + label: Some("PointBuffer"), + contents: raw_points.as_slice(), + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST, + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, }, ); - - let buffer = Some(wgpu::util::DeviceExt::create_buffer_init( - &self.device.wgpu_device, + let raw_indeces: Vec = (0u32..(point_count - 1) as u32) + .flat_map(|x| x.to_le_bytes().to_vec()) + .collect(); + let point_index_buffer = wgpu::util::DeviceExt::create_buffer_init( + &device.wgpu_device, &wgpu::util::BufferInitDescriptor { - label: Some("gpu_point_buffer"), - contents: points, - usage: wgpu::BufferUsages::STORAGE, + label: Some("IndexBuffer"), + contents: raw_indeces.as_slice(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, }, - )); - - self.buffer_bind_group_layout = Some(self.device.wgpu_device.create_bind_group_layout( - &wgpu::BindGroupLayoutDescriptor { + ); + let mut root_node = OctreeNode { + bounds: self.bounds, + children: None, + node_partitioning: [1; 8], + 
points_per_partition: [2; 8], + point_start: 0, + point_end: point_count as u32 - 1, + }; + let xdiff = &root_node.bounds.max().x - &root_node.bounds.min().x; + let ydiff = &root_node.bounds.max().y - &root_node.bounds.min().y; + let zdiff = &root_node.bounds.max().z - &root_node.bounds.min().z; + println!("xdiff {}", xdiff); + println!("ydiff {}", ydiff); + println!("zdiff {}", zdiff); + let xpartition = &root_node.bounds.min().x + 0.5 * xdiff; + let ypartition = &root_node.bounds.min().y + 0.5 * ydiff; + let zpartition = &root_node.bounds.min().z + 0.5 * zdiff; + println!("x_partition {}", xpartition); + println!("y_partition {}", ypartition); + println!("z_partition {}", zpartition); + let points_bind_group_layout = + device + .wgpu_device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + label: Some("PointBufferBindGroupLayout"), + }); + let points_bind_group = device + .wgpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("PointBufferBindGroup"), + layout: &points_bind_group_layout, entries: &[ - wgpu::BindGroupLayoutEntry { + wgpu::BindGroupEntry { binding: 0, - visibility: wgpu::ShaderStages::COMPUTE, - ty: wgpu::BindingType::Buffer { - ty: wgpu::BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: None, - }, - count: None, + resource: gpu_point_buffer.as_entire_binding(), }, - wgpu::BindGroupLayoutEntry { + wgpu::BindGroupEntry { binding: 1, 
- visibility: wgpu::ShaderStages::COMPUTE, - ty: wgpu::BindingType::Buffer { - ty: wgpu::BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: None, - }, - count: None, + resource: point_index_buffer.as_entire_binding(), }, ], - label: Some("compute_bind_group_layout"), - }, - )); + }); + device.set_bind_group(1, &points_bind_group_layout, &points_bind_group); - self.buffer_bind_group = Some(self.device.wgpu_device.create_bind_group( - &wgpu::BindGroupDescriptor { - label: Some("storage_bind_group"), - layout: self.buffer_bind_group_layout.as_ref().unwrap(), + let tree_depth = 1; + let num_leaves: u32 = 0; + let num_nodes: u32 = 1; + let nodes_counter = wgpu::util::DeviceExt::create_buffer_init( + &device.wgpu_device, + &wgpu::util::BufferInitDescriptor { + label: Some("NodeCounterBuffer"), + contents: &num_nodes.to_le_bytes(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }, + ); + let leaves_counter = wgpu::util::DeviceExt::create_buffer_init( + &device.wgpu_device, + &wgpu::util::BufferInitDescriptor { + label: Some("LeafCounterBuffer"), + contents: &num_leaves.to_le_bytes(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }, + ); + let counter_bind_group_layout = + device + .wgpu_device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: 
wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + label: Some("CounterBindGroupLayout"), + }); + let counter_bind_group = device + .wgpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("CounterBindGroup"), + layout: &counter_bind_group_layout, entries: &[ wgpu::BindGroupEntry { binding: 0, - resource: buffer.as_ref().unwrap().as_entire_binding(), + resource: nodes_counter.as_entire_binding(), }, wgpu::BindGroupEntry { binding: 1, - resource: max_value_buffer.as_entire_binding(), + resource: leaves_counter.as_entire_binding(), }, ], - }, - )); - self.device.set_bind_group( - 0, - self.buffer_bind_group_layout.as_ref().unwrap(), - self.buffer_bind_group.as_ref().unwrap(), - ); + }); + device.set_bind_group(0, &counter_bind_group_layout, &counter_bind_group); + let mut current_nodes = vec![&root_node]; + loop { + let num_new_nodes = 8u32.pow(tree_depth) - num_leaves; + let new_nodes_buffer = device.wgpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NewNodesBuffer"), + size: (mem::size_of::() as u64 + - mem::size_of::<[Box; 8]>() as u64) + * num_new_nodes as u64, + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, + }); + let mut parent_nodes_raw: Vec = Vec::new(); + for node in ¤t_nodes { + parent_nodes_raw.append(&mut node.into_raw()); + } + + let parent_nodes_buffer = wgpu::util::DeviceExt::create_buffer_init( + &device.wgpu_device, + &wgpu::util::BufferInitDescriptor { + label: Some("ParentNodesBuffer"), + contents: parent_nodes_raw.as_slice(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }, + ); + let nodes_bind_group_layout = + device + .wgpu_device + 
.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("NodesBindGroupLayout"), + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }); + let nodes_bind_group = + device + .wgpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("NodesBindGroup"), + layout: &nodes_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 1, + resource: parent_nodes_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 0, + resource: new_nodes_buffer.as_entire_binding(), + }, + ], + }); + device.set_bind_group(2, &nodes_bind_group_layout, &nodes_bind_group); + + device.set_compute_shader_glsl(include_str!("shaders/generate_nodes.comp")); + device.compute(current_nodes.len() as u32, 1, 1); + + let mapped_future = point_index_buffer.slice(..).map_async(wgpu::MapMode::Read); + device.wgpu_device.poll(wgpu::Maintain::Wait); + + if let Ok(()) = mapped_future.await { + let mapped_index_buffer = point_index_buffer.slice(..).get_mapped_range().to_vec(); + let indices: Vec = mapped_index_buffer + .chunks_exact(4) + .map(|b| u32::from_ne_bytes(b.try_into().unwrap())) + .collect(); + + self.point_partitioning = indices.clone(); + + let mapped_counter = nodes_counter.slice(..).map_async(wgpu::MapMode::Read); + device.wgpu_device.poll(wgpu::Maintain::Wait); + if let Ok(()) = mapped_counter.await { + let mapped_counter = nodes_counter.slice(..).get_mapped_range().to_vec(); + let x: Vec = mapped_counter + .chunks_exact(4) + .map(|b| 
u32::from_ne_bytes(b.try_into().unwrap())) + .collect(); + // println!( + // "Point at middle: {:?}", + // points[indices[x[0] as usize] as usize] + // ); + // println!( + // "Point after middle: {:?}", + // points[indices[x[0] as usize + 1] as usize] + // ); + let mut index = 0; + for i in indices.iter().take(200) { + let p = points[*i as usize]; + println!("index: {}, partition_index: {}, point: {:?}", index, i, p); + index += 1; + } + } + } + let mapped_future = parent_nodes_buffer.slice(..).map_async(wgpu::MapMode::Read); + device.wgpu_device.poll(wgpu::Maintain::Wait); + + if let Ok(()) = mapped_future.await { + let mapped_node_buffer = parent_nodes_buffer.slice(..).get_mapped_range().to_vec(); + let nodes = OctreeNode::from_raw(mapped_node_buffer); + println!("{:?}", nodes.node_partitioning); + println!("{:?}", nodes.points_per_partition); + let indices = self.point_partitioning.as_slice(); + for i in 0..nodes.node_partitioning[0] as usize { + let point = points[indices[i] as usize]; + if point.x > xpartition || point.y > ypartition || point.z > zpartition { + println!("ERROR at point {} {:?}", indices[i], point); + } + } + for i in self.point_partitioning[0] as usize..nodes.node_partitioning[1] as usize { + let point = points[indices[i] as usize]; + if point.x > xpartition || point.y > ypartition || point.z < zpartition { + println!("ERROR at point {} {:?}", indices[i], point); + } + } + for i in self.point_partitioning[1] as usize..nodes.node_partitioning[2] as usize { + let point = points[indices[i] as usize]; + if point.x < xpartition || point.y > ypartition || point.z > zpartition { + println!("ERROR at point {} {:?}", indices[i], point); + } + } + } - self.device - .set_compute_shader_glsl(include_str!("shaders/find_max_values.comp")); - self.device.compute(count as u32, 1, 1); - - let max_value_buffer_slice = max_value_buffer.slice(..); - let mapped_future = max_value_buffer_slice.map_async(wgpu::MapMode::Read); - 
self.device.wgpu_device.poll(wgpu::Maintain::Wait); - - if let Ok(()) = mapped_future.await { - let mapped_max_value_buffer = max_value_buffer.slice(..).get_mapped_range().to_vec(); - println!("{:?}", mapped_max_value_buffer); - println!("----------"); - println!("{:?}", points); - result = mapped_max_value_buffer - .chunks_exact(4) - .map(|b| u32::from_ne_bytes(b.try_into().unwrap())) - .last() - .unwrap(); + break; } - result } } diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp new file mode 100644 index 0000000..7efd59d --- /dev/null +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -0,0 +1,195 @@ +#version 450 + +struct Node { + double[3] bounds_min; + double[3] bounds_max; + uint node_partitioning[8]; + uint points_per_partition[8]; + uint points_start; + uint points_end; +}; + +layout(std430, binding=0) buffer NodeCounter{ + uint num_nodes; + }; +layout(std430, binding=1) buffer LeafCounter{ + uint num_leaves; +}; +layout(std430, set=1, binding=0) buffer PointBuffer { + double[3] points[]; +}; +layout(std430, set=1, binding=1) buffer Partitioning { + uint indeces[]; +}; +layout(std430, set=2, binding=0) buffer ChildNodes{ + Node children[]; +}; +layout(std430, set=2, binding=1) buffer ParentNodes{ + Node parents[]; +}; + +uint[3] partitioning_order(double x, double y, double z){ + uint[] order = uint[3](0,1,2); + double[] axes = double[3](x,y,z); + for(uint i = 0; i < 2; ++i){ + if(axes[i] < axes[i+1]){ + uint tmp_order = order[i]; + double tmp_axis = axes[i]; + order[i] = order[i+1]; + axes[i] = axes[i+1]; + order[i+1] = tmp_order; + axes[i+1] = tmp_axis; + } + } + if(axes[0] < axes[1]){ + uint tmp = order[0]; + order[0] = order[1]; + order[1] = tmp; + } + return order; +} + +void swap( uint a, uint b){ + uint tmp = indeces[a]; + indeces[a] = indeces[b]; + indeces[b] = tmp; +} + +void partitioning(uint axis, double threshold, 
uint iteration, inout uint[7] partition_borders, Node parent){ + uint start = parent.points_start; + uint end = parent.points_end; + uint iteration_end = 1; + if(iteration > 0) { + end = partition_borders[iteration - 1]; + iteration_end = iteration == 1 ? 2 : 4; + } + for(uint k = 0; k < iteration_end; ++k){ + while(start < end){ + if(points[indeces[start]][axis] <= threshold){ + ++start; + if(points[indeces[end]][axis] > threshold){ + --end; + } + } else { + if(points[indeces[end]][axis] <= threshold){ + swap(start, end); + ++start; + --end; + } else { + --end; + } + } + } + //num_nodes = start; + start = end - 1; + if(iteration_end - k <= 2){ + end = parent.points_end; + } else { + end = partition_borders[k * 2]; + } + partition_borders[iteration_end - 1 + k] = start; + num_nodes = start; + if(iteration == 1) { + start = partition_borders[0]; + } + else { + if(k == 0) start = partition_borders[1]; + else if(k == 1) start = partition_borders[0]; + else if(k == 2) start = partition_borders[2]; + } + } +} + +bool[3] partitioned_to_right(uint[3] partition_order, uint index){ + bool[3] on_right_side = bool[3](false, false, false); + if(index % 2 != 0){ + on_right_side[partition_order[2]] = true; + } + else if( index % 4 != 0){ + on_right_side[partition_order[1]] = true; + } + if(index >= 4){ + on_right_side[partition_order[0]] = true; + } + return on_right_side; +} + +double[2][3] get_childs_bounds(uint[3] partition_order, double[3] partition_axes, uint child_index, Node parent){ + double[3] bounds_min; + double[3] bounds_max; + if(child_index == 0) { + bounds_min = parent.bounds_min; + bounds_max = partition_axes; + } + else { + bool[3] on_right_side = partitioned_to_right(partition_order, child_index); + for(uint k = 0; k < 3; ++k){ + bounds_min[k] = on_right_side[k] ? partition_axes[k] : parent.bounds_min[k]; + bounds_max[k] = on_right_side[k] ? 
parent.bounds_max[k] : partition_axes[k]; + + } + } + return double[2][3](bounds_min, bounds_max); +} + +void main() { + uint idx = gl_GlobalInvocationID.x; + + Node parent = parents[idx]; + + double x_diff = parent.bounds_max[0] - parent.bounds_min[0]; + double y_diff = parent.bounds_max[1] - parent.bounds_min[1]; + double z_diff = parent.bounds_max[2] - parent.bounds_min[2]; + double x_partition = parent.bounds_min[0] + 0.5 * abs(x_diff); + double y_partition = parent.bounds_min[1] + 0.5 * abs(y_diff); + double z_partition = parent.bounds_min[2] + 0.5 * abs(z_diff); + uint[3] partition_order = partitioning_order(abs(x_diff), abs(y_diff), abs(z_diff)); + uint[7] partition_borders; + for(uint i = 0; i < 3; ++i){ + uint partition_axis = partition_order[i]; + switch(partition_axis){ + case 0: + partitioning(partition_axis, x_partition, i, partition_borders, parent); + break; + case 1: + partitioning(partition_axis, y_partition, i, partition_borders, parent); + break; + case 2: + partitioning(partition_axis, z_partition, i, partition_borders, parent); + } + } + parents[idx].node_partitioning[0] = partition_borders[3]; + parents[idx].node_partitioning[1] = partition_borders[1]; + parents[idx].node_partitioning[2] = partition_borders[4]; + parents[idx].node_partitioning[3] = partition_borders[0]; + parents[idx].node_partitioning[4] = partition_borders[5]; + parents[idx].node_partitioning[5] = partition_borders[2]; + parents[idx].node_partitioning[6] = partition_borders[6]; + parents[idx].node_partitioning[7] = parents[idx].points_end; + + for(uint i = 0; i < 8; ++i){ + if(i == 0) { + parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i]; + } else { + parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i] - parents[idx].node_partitioning[i-1]; + } + } + for(uint i = 0; i < 8; ++i){ + if(i == 0) { + children[idx + i].points_start = 0; + } else { + children[idx + i].points_start = parents[idx].node_partitioning[i - 1]; + } + 
children[idx + i].points_end = parents[idx].node_partitioning[i]; + double[2][3] child_bounds = get_childs_bounds( + partition_order, + double[3](x_partition, y_partition, z_partition), + i, + parent + ); + children[idx + i].bounds_min = child_bounds[0]; + children[idx + i].bounds_max = child_bounds[1]; + } + + +} diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs index d2b36d8..75df6ac 100644 --- a/pasture-tools/src/main.rs +++ b/pasture-tools/src/main.rs @@ -6,9 +6,15 @@ mod ex { use pasture_core::gpu::GpuPointBufferInterleaved; use pasture_core::layout::PointType; use pasture_core::layout::{attributes, PointAttributeDataType, PointAttributeDefinition}; + use pasture_core::meta::Metadata; use pasture_core::nalgebra::Vector3; use pasture_derive::PointType; + use pasture_io::base::PointReader; + use pasture_io::las::las_bounds_to_pasture_bounds; + use pasture_io::las::LASReader; + use pasture_io::las::LasPointFormat0; + use anyhow::Result; #[repr(C)] #[derive(PointType, Debug)] struct MyPointType { @@ -42,7 +48,7 @@ mod ex { futures::executor::block_on(run()); } - async fn run() { + async fn run() -> Result<()> { // == Init point buffer ====================================================================== let points = vec![ @@ -94,6 +100,21 @@ mod ex { let mut point_buffer = InterleavedVecPointStorage::new(layout); point_buffer.push_points(points.as_slice()); + let mut reader = LASReader::from_path( + //"/home/jnoice/dev/pasture/pasture-io/examples/in/10_points_format_1.las", + "/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", + //"/home/jnoice/Downloads/interesting.las", + )?; + let count = reader.remaining_points(); + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + reader.read_into(&mut buffer, count)?; + + for point in buffer.iter_point::().take(5) { + println!("{:?}", point); + } + let bounds = reader.get_metadata().bounds().unwrap(); + let device = gpu::Device::new(gpu::DeviceOptions { 
device_power: gpu::DevicePower::High, device_backend: gpu::DeviceBackend::Vulkan, @@ -106,13 +127,13 @@ mod ex { Ok(d) => d, Err(_) => { println!("Failed to request device. Aborting."); - return; + return Ok(()); } }; - let mut octree = - pasture_tools::acceleration_structures::GpuOctree::new(&point_buffer, &mut device); + let mut octree = pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 2); octree.construct(MyPointType::layout()).await; + Ok(()) } } From fb68a13e269120839d0d5266b004c07319555f0d Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Wed, 26 Jan 2022 17:57:44 +0100 Subject: [PATCH 04/15] check on laptop for gpu timeout --- pasture-tools/Cargo.toml | 6 +- .../src/acceleration_structures/gpu_octree.rs | 619 ++++++++++-------- .../shaders/generate_nodes.comp | 181 +++-- pasture-tools/src/lib.rs | 2 +- pasture-tools/src/main.rs | 100 ++- 5 files changed, 560 insertions(+), 348 deletions(-) diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index e0a9fb5..28ba0dd 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -29,6 +29,9 @@ shaderc = { version = "0.7.2", optional = true } futures = { version = "0.3", optional = true } bytemuck = { version = "1.5.1", optional = true } +[features] +gpu = ["wgpu", "shaderc", "futures", "bytemuck"] + [[bin]] name = "reorder_laz_chunks" @@ -37,6 +40,3 @@ name = "plotting" [[bin]] name = "info" - -[features] -gpu = ["pasture-core/gpu", "wgpu", "shaderc", "futures", "bytemuck"] diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index d548ab2..5f4176b 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -2,17 +2,15 @@ use pasture_core::containers::attr1::AttributeIteratorByValue; use pasture_core::math::AABB; use pasture_core::nalgebra::Point3; use pasture_core::{ - containers::{ - InterleavedPointBufferMut, 
InterleavedVecPointStorage, PointBuffer, PointBufferExt, - }, - gpu, - layout::{attributes, PointAttributeDataType, PointAttributeDefinition, PointLayout}, + containers::{PointBuffer, PointBufferExt}, + layout::{attributes, PointLayout}, nalgebra::Vector3, }; use pasture_derive::PointType; +use std::borrow::Cow; use std::convert::TryInto; -use std::mem; -use wgpu; +use std::fs::File; +use std::io::prelude::*; #[repr(C)] #[derive(PointType, Debug)] @@ -42,10 +40,10 @@ struct MyPointType { #[pasture(BUILTIN_GPS_TIME)] pub gps_time: f64, } - +#[derive(Debug, Clone)] struct OctreeNode { bounds: AABB, - children: Option<[Box; 8]>, + children: Option>, node_partitioning: [u32; 8], points_per_partition: [u32; 8], point_start: u32, @@ -53,6 +51,8 @@ struct OctreeNode { } pub struct GpuOctree<'a> { + gpu_device: wgpu::Device, + gpu_queue: wgpu::Queue, point_buffer: &'a dyn PointBuffer, point_partitioning: Vec, root_node: Option, @@ -61,21 +61,25 @@ pub struct GpuOctree<'a> { } impl OctreeNode { - fn is_leaf(&self) -> bool { - return self.children.is_none(); + fn is_leaf(&self, points_per_node: u32) -> bool { + // println!( + // "\npoint start: {}, point end: {}\n", + // self.point_start, self.point_end + // ); + return self.point_end - self.point_start <= points_per_node; } fn into_raw(&self) -> Vec { let mut raw_node: Vec = Vec::new(); for coord in self.bounds.min().iter() { - raw_node.append(&mut coord.to_ne_bytes().to_vec()); + raw_node.append(&mut coord.to_le_bytes().to_vec()); } for coord in self.bounds.max().iter() { - raw_node.append(&mut coord.to_ne_bytes().to_vec()); + raw_node.append(&mut coord.to_le_bytes().to_vec()); } raw_node.append( &mut self .node_partitioning - .map(|x| x.to_ne_bytes()) + .map(|x| x.to_le_bytes()) .to_vec() .into_iter() .flatten() @@ -84,15 +88,16 @@ impl OctreeNode { raw_node.append( &mut self .points_per_partition - .map(|x| x.to_ne_bytes()) + .map(|x| x.to_le_bytes()) .to_vec() .into_iter() .flatten() .collect(), ); - 
raw_node.append(&mut self.point_start.to_ne_bytes().to_vec()); - raw_node.append(&mut self.point_end.to_ne_bytes().to_vec()); - + raw_node.append(&mut self.point_start.to_le_bytes().to_vec()); + //[0u8; 4].iter().for_each(|&x| raw_node.push(x)); + raw_node.append(&mut self.point_end.to_le_bytes().to_vec()); + //[0u8; 4].iter().for_each(|&x| raw_node.push(x)); raw_node } fn from_raw(mut data: Vec) -> Self { @@ -102,7 +107,7 @@ impl OctreeNode { coords: Vector3::from_vec( bounds_iter .take(3) - .map(|b| f64::from_ne_bytes(b.try_into().unwrap())) + .map(|b| f64::from_le_bytes(b.try_into().unwrap())) .collect(), ), }; @@ -112,13 +117,13 @@ impl OctreeNode { coords: Vector3::from_vec( bounds_iter .take(3) - .map(|b| f64::from_ne_bytes(b.try_into().unwrap())) + .map(|b| f64::from_le_bytes(b.try_into().unwrap())) .collect(), ), }; let mut rest_data: Vec = data .chunks_exact(4) - .map(|b| u32::from_ne_bytes(b.try_into().unwrap())) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); let mut rest_iter = rest_data.iter_mut(); let mut node_partitioning = [0u32; 8]; @@ -130,6 +135,7 @@ impl OctreeNode { points_per_partition[i] = *rest_iter.next().unwrap(); } let points_start = *rest_iter.next().unwrap(); + //rest_iter.next(); let points_end = *rest_iter.next().unwrap(); OctreeNode { @@ -144,18 +150,43 @@ impl OctreeNode { } impl<'a> GpuOctree<'a> { - pub fn new( + pub async fn new( point_buffer: &'a dyn PointBuffer, max_bounds: AABB, points_per_node: u32, - ) -> Self { - GpuOctree { + ) -> Result, wgpu::RequestDeviceError> { + let instance = wgpu::Instance::new(wgpu::Backends::all()); + let adapter = instance + .request_adapter(&wgpu::RequestAdapterOptions { + power_preference: wgpu::PowerPreference::HighPerformance, + compatible_surface: None, + force_fallback_adapter: false, + }) + .await + .unwrap(); + let (device, queue) = adapter + .request_device( + &wgpu::DeviceDescriptor { + features: adapter.features(), + limits: adapter.limits(), + label: 
Some("Octree_Device"), + }, + None, + ) + .await?; + println!("GPU Adapter limits: {:?}", adapter.limits()); + Ok(GpuOctree { + gpu_device: device, + gpu_queue: queue, point_buffer, point_partitioning: (0..point_buffer.len() as u32).collect(), root_node: None, bounds: max_bounds, points_per_node, - } + }) + } + pub fn print_tree(&self) { + println!("{:?}", self.root_node); } pub async fn construct(&mut self, layout: PointLayout) { let point_count = self.point_buffer.len(); @@ -172,72 +203,41 @@ impl<'a> GpuOctree<'a> { points.push(point); } - let device = gpu::Device::new(gpu::DeviceOptions { - device_power: gpu::DevicePower::High, - device_backend: gpu::DeviceBackend::Vulkan, - use_adapter_features: true, - use_adapter_limits: true, - }) - .await; - - let mut device = match device { - Ok(d) => d, - Err(_) => { - println!("Failed to request device. Aborting."); - return; - } - }; - - let gpu_point_buffer = wgpu::util::DeviceExt::create_buffer_init( - &device.wgpu_device, - &wgpu::util::BufferInitDescriptor { - label: Some("PointBuffer"), - contents: raw_points.as_slice(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }, - ); - let raw_indeces: Vec = (0u32..(point_count - 1) as u32) - .flat_map(|x| x.to_le_bytes().to_vec()) + let mut construction_shader = File::open( + "/home/jnoice/dev/pasture/pasture-tools/src/acceleration_structures/shaders/comp.spv", + ) + .unwrap(); + let mut shader_bytes: Vec = Vec::new(); + for byte in construction_shader.bytes() { + shader_bytes.push(byte.unwrap()); + } + let shader_words: Vec = shader_bytes + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); - let point_index_buffer = wgpu::util::DeviceExt::create_buffer_init( - &device.wgpu_device, - &wgpu::util::BufferInitDescriptor { - label: Some("IndexBuffer"), - contents: raw_indeces.as_slice(), - usage: 
wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }, - ); - let mut root_node = OctreeNode { - bounds: self.bounds, - children: None, - node_partitioning: [1; 8], - points_per_partition: [2; 8], - point_start: 0, - point_end: point_count as u32 - 1, - }; - let xdiff = &root_node.bounds.max().x - &root_node.bounds.min().x; - let ydiff = &root_node.bounds.max().y - &root_node.bounds.min().y; - let zdiff = &root_node.bounds.max().z - &root_node.bounds.min().z; - println!("xdiff {}", xdiff); - println!("ydiff {}", ydiff); - println!("zdiff {}", zdiff); - let xpartition = &root_node.bounds.min().x + 0.5 * xdiff; - let ypartition = &root_node.bounds.min().y + 0.5 * ydiff; - let zpartition = &root_node.bounds.min().z + 0.5 * zdiff; - println!("x_partition {}", xpartition); - println!("y_partition {}", ypartition); - println!("z_partition {}", zpartition); + let mut compiler = shaderc::Compiler::new().unwrap(); + // let shader = self + // .gpu_device + // .create_shader_module(&wgpu::ShaderModuleDescriptor { + // label: Some("NodeGenerationShader"), + // source: wgpu::ShaderSource::SpirV(Cow::from(shader_words.as_slice())), + // }); + let comp_shader = include_str!("shaders/generate_nodes.comp"); + let comp_spirv = compiler.compile_into_spirv( + comp_shader, + shaderc::ShaderKind::Compute, + "ComputeShader", + "main", + None + ) + .unwrap(); + let comp_data = wgpu::util::make_spirv(comp_spirv.as_binary_u8()); + let shader = self.gpu_device.create_shader_module(&wgpu::ShaderModuleDescriptor{ + label: Some("ModeGenerationShader"), + source: comp_data, + }); let points_bind_group_layout = - device - .wgpu_device + self.gpu_device .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { entries: &[ wgpu::BindGroupLayoutEntry { @@ -260,58 +260,23 @@ impl<'a> GpuOctree<'a> { }, count: None, }, + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: 
wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, ], label: Some("PointBufferBindGroupLayout"), }); - let points_bind_group = device - .wgpu_device - .create_bind_group(&wgpu::BindGroupDescriptor { - label: Some("PointBufferBindGroup"), - layout: &points_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding: 0, - resource: gpu_point_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 1, - resource: point_index_buffer.as_entire_binding(), - }, - ], - }); - device.set_bind_group(1, &points_bind_group_layout, &points_bind_group); - - let tree_depth = 1; - let num_leaves: u32 = 0; - let num_nodes: u32 = 1; - let nodes_counter = wgpu::util::DeviceExt::create_buffer_init( - &device.wgpu_device, - &wgpu::util::BufferInitDescriptor { - label: Some("NodeCounterBuffer"), - contents: &num_nodes.to_le_bytes(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }, - ); - let leaves_counter = wgpu::util::DeviceExt::create_buffer_init( - &device.wgpu_device, - &wgpu::util::BufferInitDescriptor { - label: Some("LeafCounterBuffer"), - contents: &num_leaves.to_le_bytes(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }, - ); - let counter_bind_group_layout = - device - .wgpu_device + let mut nodes_bind_group_layout = + self.gpu_device .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("NodesBindGroupLayout"), entries: &[ wgpu::BindGroupLayoutEntry { binding: 0, @@ -334,33 +299,104 @@ impl<'a> GpuOctree<'a> { count: None, }, ], - label: Some("CounterBindGroupLayout"), }); - let counter_bind_group = device - .wgpu_device - 
.create_bind_group(&wgpu::BindGroupDescriptor { - label: Some("CounterBindGroup"), - layout: &counter_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding: 0, - resource: nodes_counter.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 1, - resource: leaves_counter.as_entire_binding(), - }, - ], + + let compute_pipeline_layout = + self.gpu_device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("ConstructionPipelineLayout"), + bind_group_layouts: &[&points_bind_group_layout, &nodes_bind_group_layout], + push_constant_ranges: &[], + }); + let compute_pipeline = + self.gpu_device + .create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("ConstructionPipeline"), + layout: Some(&compute_pipeline_layout), + module: &shader, + entry_point: "main", + }); + + let gpu_point_buffer = wgpu::util::DeviceExt::create_buffer_init( + &self.gpu_device, + &wgpu::util::BufferInitDescriptor { + label: Some("PointBuffer"), + contents: raw_points.as_slice(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }, + ); + + let mut root_node = OctreeNode { + bounds: self.bounds, + children: None, + node_partitioning: [0; 8], + points_per_partition: [0; 8], + point_start: 0, + point_end: point_count as u32 - 1, + }; + root_node.node_partitioning[0] = point_count as u32 - 1; + root_node.points_per_partition[0] = point_count as u32 - 1; + let xdiff = &root_node.bounds.max().x - &root_node.bounds.min().x; + let ydiff = &root_node.bounds.max().y - &root_node.bounds.min().y; + let zdiff = &root_node.bounds.max().z - &root_node.bounds.min().z; + println!("Point count: {}", point_count); + println!("xdiff {}", xdiff); + println!("ydiff {}", ydiff); + println!("zdiff {}", zdiff); + let xpartition = &root_node.bounds.min().x + 0.5 * xdiff; + let ypartition = &root_node.bounds.min().y + 0.5 * ydiff; + let 
zpartition = &root_node.bounds.min().z + 0.5 * zdiff; + println!("x_partition {}", xpartition); + println!("y_partition {}", ypartition); + println!("z_partition {}", zpartition); + + let mut tree_depth = 1; + let mut num_leaves: u32 = 0; + let mut num_nodes: u32 = 1; + + let mut current_nodes = vec![&mut root_node]; + let mut children_nodes: Vec> = Vec::new(); + + let mut raw_indeces: Vec = (0u32..(point_count - 1) as u32) + .flat_map(|x| x.to_le_bytes().to_vec()) + .collect(); + + while !current_nodes.is_empty() { + //for i in 0..2 { + //let num_new_nodes = 8u64.pow(tree_depth) - num_leaves as u64; + + let point_index_buffer = wgpu::util::DeviceExt::create_buffer_init( + &self.gpu_device, + &wgpu::util::BufferInitDescriptor { + label: Some("IndexBuffer"), + contents: raw_indeces.as_slice(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }, + ); + let debug_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("DebugBuffer"), + size: (3 * 4 + 8 * 4 + 4 + 2 * 4) * current_nodes.len() as u64, + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, }); - device.set_bind_group(0, &counter_bind_group_layout, &counter_bind_group); - let mut current_nodes = vec![&root_node]; - loop { - let num_new_nodes = 8u32.pow(tree_depth) - num_leaves; - let new_nodes_buffer = device.wgpu_device.create_buffer(&wgpu::BufferDescriptor { + + let new_nodes_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: Some("NewNodesBuffer"), - size: (mem::size_of::() as u64 - - mem::size_of::<[Box; 8]>() as u64) - * num_new_nodes as u64, + size: //(mem::size_of::() - mem::size_of::>()) as u64 + 128 + * current_nodes.len() as u64 * 8 as u64, usage: wgpu::BufferUsages::MAP_READ | 
wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC @@ -368,13 +404,33 @@ impl<'a> GpuOctree<'a> { | wgpu::BufferUsages::STORAGE, mapped_at_creation: false, }); - let mut parent_nodes_raw: Vec = Vec::new(); + let points_bind_group = self + .gpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("PointBufferBindGroup"), + layout: &points_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: gpu_point_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: point_index_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: debug_buffer.as_entire_binding(), + }, + ], + }); + let mut parent_nodes_raw = Vec::new(); for node in ¤t_nodes { parent_nodes_raw.append(&mut node.into_raw()); } let parent_nodes_buffer = wgpu::util::DeviceExt::create_buffer_init( - &device.wgpu_device, + &self.gpu_device, &wgpu::util::BufferInitDescriptor { label: Some("ParentNodesBuffer"), contents: parent_nodes_raw.as_slice(), @@ -385,122 +441,165 @@ impl<'a> GpuOctree<'a> { | wgpu::BufferUsages::STORAGE, }, ); - let nodes_bind_group_layout = - device - .wgpu_device - .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { - label: Some("NodesBindGroupLayout"), - entries: &[ - wgpu::BindGroupLayoutEntry { - binding: 1, - visibility: wgpu::ShaderStages::COMPUTE, - ty: wgpu::BindingType::Buffer { - ty: wgpu::BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: None, - }, - count: None, - }, - wgpu::BindGroupLayoutEntry { - binding: 0, - visibility: wgpu::ShaderStages::COMPUTE, - ty: wgpu::BindingType::Buffer { - ty: wgpu::BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: None, - }, - count: None, - }, - ], - }); - let nodes_bind_group = - device - .wgpu_device - .create_bind_group(&wgpu::BindGroupDescriptor { - label: Some("NodesBindGroup"), - layout: &nodes_bind_group_layout, - entries: 
&[ - wgpu::BindGroupEntry { - binding: 1, - resource: parent_nodes_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 0, - resource: new_nodes_buffer.as_entire_binding(), - }, - ], - }); - device.set_bind_group(2, &nodes_bind_group_layout, &nodes_bind_group); - device.set_compute_shader_glsl(include_str!("shaders/generate_nodes.comp")); - device.compute(current_nodes.len() as u32, 1, 1); + let nodes_bind_group = self + .gpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("NodesBindGroup"), + layout: &nodes_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: parent_nodes_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: new_nodes_buffer.as_entire_binding(), + }, + ], + }); + let mut encoder = + self.gpu_device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("CommandEncoder"), + }); + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("ConstructionComputePass"), + }); + compute_pass.set_pipeline(&compute_pipeline); + compute_pass.set_bind_group(0, &points_bind_group, &[]); + compute_pass.set_bind_group(1, &nodes_bind_group, &[]); + println!( + "Starting gpu computation with {} threads", + current_nodes.len() + ); + compute_pass.insert_debug_marker("Pasture Compute Debug"); + compute_pass.dispatch(current_nodes.len() as u32, 1, 1); + } + self.gpu_queue.submit(Some(encoder.finish())); let mapped_future = point_index_buffer.slice(..).map_async(wgpu::MapMode::Read); - device.wgpu_device.poll(wgpu::Maintain::Wait); - + self.gpu_device.poll(wgpu::Maintain::Wait); if let Ok(()) = mapped_future.await { let mapped_index_buffer = point_index_buffer.slice(..).get_mapped_range().to_vec(); let indices: Vec = mapped_index_buffer .chunks_exact(4) - .map(|b| u32::from_ne_bytes(b.try_into().unwrap())) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); - self.point_partitioning = 
indices.clone(); + // self.point_partitioning = indices.clone(); + // indices.iter().for_each(|x| { + // println!("{}", x); + // }); + raw_indeces = mapped_index_buffer.clone(); + } + point_index_buffer.unmap(); + let mapped_debug = debug_buffer.slice(..).map_async(wgpu::MapMode::Read); + self.gpu_device.poll(wgpu::Maintain::Wait); + if let Ok(()) = mapped_debug.await { + let mut debug: Vec = debug_buffer + .slice(..) + .get_mapped_range() + .to_vec() + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + .collect(); + for k in 0..current_nodes.len() { + let partition_order: Vec = debug.drain(..3).collect(); + let borders: Vec = debug.drain(..8).collect(); + let thread_id: Vec = debug.drain(..1).collect(); + let start_end: Vec = debug.drain(..2).collect(); + println!( + " Node {} | Partition Order: {:?} \n Partition borders: {:?}\n thread index: {}\n start/end: {:?}", + k, partition_order, borders, thread_id.first().unwrap(), start_end, + ); + } + } + + let mapped_children = new_nodes_buffer.slice(..).map_async(wgpu::MapMode::Read); + self.gpu_device.poll(wgpu::Maintain::Wait); + + let mapped_parents = parent_nodes_buffer.slice(..).map_async(wgpu::MapMode::Read); + self.gpu_device.poll(wgpu::Maintain::Wait); - let mapped_counter = nodes_counter.slice(..).map_async(wgpu::MapMode::Read); - device.wgpu_device.poll(wgpu::Maintain::Wait); - if let Ok(()) = mapped_counter.await { - let mapped_counter = nodes_counter.slice(..).get_mapped_range().to_vec(); - let x: Vec = mapped_counter - .chunks_exact(4) - .map(|b| u32::from_ne_bytes(b.try_into().unwrap())) - .collect(); + if matches!(mapped_parents.await, Ok(())) && matches!(mapped_children.await, Ok(())) { + let mapped_node_buffer = parent_nodes_buffer.slice(..).get_mapped_range().to_vec(); + let nodes: Vec = mapped_node_buffer + .chunks_exact(120) + .map(|b| OctreeNode::from_raw(b.to_vec())) + .collect(); + let mapped_children = new_nodes_buffer.slice(..).get_mapped_range().to_vec(); + let mut 
children: Vec = mapped_children + .chunks_exact(120) + .map(|b| OctreeNode::from_raw(b.to_vec())) + .collect(); + let mut generated_children: Vec<&mut OctreeNode> = Vec::new(); + for mut node in nodes { + let children_sizes = node.points_per_partition.clone(); + // println!("=========== Node ==========="); + // println!("{:?}", node); + let mut local_children: Vec = children.drain(..8).collect(); + + node.children = Some(local_children.into_boxed_slice()); + let mut node_ref = current_nodes.swap_remove(0); + // println!("Number of parents left: {}", current_nodes.len()); + *node_ref = node; // println!( - // "Point at middle: {:?}", - // points[indices[x[0] as usize] as usize] + // "Parent node point start: {}, point end: {}", + // &node_ref.point_start, &node_ref.point_end // ); // println!( - // "Point after middle: {:?}", - // points[indices[x[0] as usize + 1] as usize] + // "Points per partition in parent : {:?}", + // &node_ref.points_per_partition // ); - let mut index = 0; - for i in indices.iter().take(200) { - let p = points[*i as usize]; - println!("index: {}, partition_index: {}, point: {:?}", index, i, p); - index += 1; - } - } - } - let mapped_future = parent_nodes_buffer.slice(..).map_async(wgpu::MapMode::Read); - device.wgpu_device.poll(wgpu::Maintain::Wait); + // println!("Parent node bounds: {:?}", &node_ref.bounds); + let mut children: &mut Box<[OctreeNode]> = node_ref.children.as_mut().unwrap(); + let iter = (*children).iter_mut(); - if let Ok(()) = mapped_future.await { - let mapped_node_buffer = parent_nodes_buffer.slice(..).get_mapped_range().to_vec(); - let nodes = OctreeNode::from_raw(mapped_node_buffer); - println!("{:?}", nodes.node_partitioning); - println!("{:?}", nodes.points_per_partition); - let indices = self.point_partitioning.as_slice(); - for i in 0..nodes.node_partitioning[0] as usize { - let point = points[indices[i] as usize]; - if point.x > xpartition || point.y > ypartition || point.z > zpartition { - println!("ERROR at point 
{} {:?}", indices[i], point); - } - } - for i in self.point_partitioning[0] as usize..nodes.node_partitioning[1] as usize { - let point = points[indices[i] as usize]; - if point.x > xpartition || point.y > ypartition || point.z < zpartition { - println!("ERROR at point {} {:?}", indices[i], point); - } - } - for i in self.point_partitioning[1] as usize..nodes.node_partitioning[2] as usize { - let point = points[indices[i] as usize]; - if point.x < xpartition || point.y > ypartition || point.z > zpartition { - println!("ERROR at point {} {:?}", indices[i], point); + let mut child_index = 0; + for child in iter { + // println!("========== Child ==========="); + // println!("{:?}", child); + // println!( + // "{}, {}", + // children_sizes[child_index] != 0, + // !child.is_leaf(self.points_per_node) + // ); + if children_sizes[child_index] != 0 && !child.is_leaf(self.points_per_node) + { + generated_children.push(child); + } else { + num_leaves += 1; + // println!( + // "Child {} is LEAF. Point Count: {}", + // child_index, children_sizes[child_index] + // ); + } + + num_nodes += 1; + child_index += 1; } } + current_nodes.append(&mut generated_children); } + parent_nodes_buffer.unmap(); + new_nodes_buffer.unmap(); + // println!( + // "Num Nodes: {}, Num New Parents: {}, Num Leaves: {}", + // num_nodes, + // current_nodes.len(), + // num_leaves + // ); - break; + tree_depth += 1; + + // println!("{}", tree_depth); + // println!("{:?}", current_nodes); } + + self.root_node = Some(root_node); + println!("{:?}", self.root_node); + // println!("Tree depth = {}", tree_depth); } } diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp index 7efd59d..614043d 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -1,32 +1,37 @@ #version 450 struct Node { - double[3] 
bounds_min; - double[3] bounds_max; + double bounds_min[3]; + double bounds_max[3]; uint node_partitioning[8]; uint points_per_partition[8]; uint points_start; uint points_end; }; - -layout(std430, binding=0) buffer NodeCounter{ - uint num_nodes; - }; -layout(std430, binding=1) buffer LeafCounter{ - uint num_leaves; +struct Debug { + uint debug_order[3]; + uint debug_borders[8]; + uint thread_id; + uint points_start; + uint points_end; }; -layout(std430, set=1, binding=0) buffer PointBuffer { - double[3] points[]; + +layout(std430, set=0, binding=0) buffer PointBuffer { + double points[][3]; }; -layout(std430, set=1, binding=1) buffer Partitioning { +layout(std430, set=0, binding=1) buffer Partitioning { uint indeces[]; }; -layout(std430, set=2, binding=0) buffer ChildNodes{ - Node children[]; +layout(std430, set=0, binding=2) buffer DebugBuffer { + Debug debug[]; }; -layout(std430, set=2, binding=1) buffer ParentNodes{ +layout(std430, set=1, binding=0) buffer ParentNodes{ Node parents[]; }; +layout(std430, set=1, binding=1) buffer ChildNodes{ + Node children[]; +}; +layout (local_size_x=1, local_size_y=1, local_size_z=1) in; uint[3] partitioning_order(double x, double y, double z){ uint[] order = uint[3](0,1,2); @@ -55,48 +60,80 @@ void swap( uint a, uint b){ indeces[b] = tmp; } -void partitioning(uint axis, double threshold, uint iteration, inout uint[7] partition_borders, Node parent){ - uint start = parent.points_start; - uint end = parent.points_end; - uint iteration_end = 1; - if(iteration > 0) { - end = partition_borders[iteration - 1]; - iteration_end = iteration == 1 ? 
2 : 4; - } - for(uint k = 0; k < iteration_end; ++k){ - while(start < end){ - if(points[indeces[start]][axis] <= threshold){ - ++start; - if(points[indeces[end]][axis] > threshold){ - --end; +void partitioning(uint axis, double threshold, uint iteration, uint id){ + // uint start = parent.points_start; + // uint end = parent.points_end; + // uint iteration_end = 1; + // if(iteration > 0) { + // end = partition_borders[iteration - 1]; + // iteration_end = iteration == 1 ? 2 : 4; + // } + // for(uint k = 0; k < iteration_end; ++k){ + // while(start < end){ + // if(points[indeces[start]][axis] <= threshold){ + // ++start; + // if(points[indeces[end]][axis] > threshold){ + // --end; + // } + // } else { + // if(points[indeces[end]][axis] <= threshold){ + // swap(start, end); + // ++start; + // --end; + // } else { + // --end; + // } + // } + // } + // //num_nodes = start; + // if(iteration_end - k <= 2){ + // end = parent.points_end; + // } else { + // end = partition_borders[k * 2]; + // } + // partition_borders[iteration_end - 1 + k] = start; + // if(iteration == 1) { + // start = partition_borders[0] + 1; + // } + // else { + // if(k == 0) start = partition_borders[1] + 1; + // else if(k == 1) start = partition_borders[0] + 1; + // else if(k == 2) start = partition_borders[2] + 1; + // } + // } + uint start = parents[id].points_start; + debug[id].thread_id = 9999; + uint[8] local_partitioning; + for(uint k = 0; k < 8; ++k) { + if(parents[id].node_partitioning[k] != 0 && k < 7 && start < parents[id].points_end){ + uint end = parents[id].node_partitioning[k]; + while(start < end) { + debug[id].thread_id = 8888; + debug[id].points_start = start; + debug[id].points_end = end; + if(points[indeces[start]][axis] <= threshold){ + ++start; } - } else { - if(points[indeces[end]][axis] <= threshold){ + else if(points[indeces[end]][axis] <= threshold){ swap(start, end); ++start; --end; - } else { + } + if(points[indeces[end]][axis] > threshold){ --end; } } + + 
local_partitioning[k * 2] = start; + local_partitioning[k * 2 + 1] = parents[id].node_partitioning[k]; + start = parents[id].node_partitioning[k] ; } - //num_nodes = start; - start = end - 1; - if(iteration_end - k <= 2){ - end = parent.points_end; - } else { - end = partition_borders[k * 2]; - } - partition_borders[iteration_end - 1 + k] = start; - num_nodes = start; - if(iteration == 1) { - start = partition_borders[0]; - } - else { - if(k == 0) start = partition_borders[1]; - else if(k == 1) start = partition_borders[0]; - else if(k == 2) start = partition_borders[2]; + else if(start >= parents[id].points_end){ + local_partitioning[k * 2] = parents[id].points_end; + local_partitioning[k * 2 + 1] = parents[id].points_end; + start = parents[id].points_end; } + parents[id].node_partitioning[k] = local_partitioning[k]; } } @@ -134,7 +171,7 @@ double[2][3] get_childs_bounds(uint[3] partition_order, double[3] partition_axes void main() { uint idx = gl_GlobalInvocationID.x; - + debug[idx].thread_id = 1111; Node parent = parents[idx]; double x_diff = parent.bounds_max[0] - parent.bounds_min[0]; @@ -143,53 +180,63 @@ void main() { double x_partition = parent.bounds_min[0] + 0.5 * abs(x_diff); double y_partition = parent.bounds_min[1] + 0.5 * abs(y_diff); double z_partition = parent.bounds_min[2] + 0.5 * abs(z_diff); + debug[idx].thread_id = 2222; uint[3] partition_order = partitioning_order(abs(x_diff), abs(y_diff), abs(z_diff)); - uint[7] partition_borders; for(uint i = 0; i < 3; ++i){ uint partition_axis = partition_order[i]; switch(partition_axis){ case 0: - partitioning(partition_axis, x_partition, i, partition_borders, parent); + partitioning(partition_axis, x_partition, i, idx); break; case 1: - partitioning(partition_axis, y_partition, i, partition_borders, parent); + partitioning(partition_axis, y_partition, i, idx); break; case 2: - partitioning(partition_axis, z_partition, i, partition_borders, parent); + partitioning(partition_axis, z_partition, i, idx); } } - 
parents[idx].node_partitioning[0] = partition_borders[3]; - parents[idx].node_partitioning[1] = partition_borders[1]; - parents[idx].node_partitioning[2] = partition_borders[4]; - parents[idx].node_partitioning[3] = partition_borders[0]; - parents[idx].node_partitioning[4] = partition_borders[5]; - parents[idx].node_partitioning[5] = partition_borders[2]; - parents[idx].node_partitioning[6] = partition_borders[6]; - parents[idx].node_partitioning[7] = parents[idx].points_end; + debug[idx].thread_id = 3333; + // parents[idx].node_partitioning[0] = partition_borders[3]; + // parents[idx].node_partitioning[1] = partition_borders[1]; + // parents[idx].node_partitioning[2] = partition_borders[4]; + // parents[idx].node_partitioning[3] = partition_borders[0]; + // parents[idx].node_partitioning[4] = partition_borders[5]; + // parents[idx].node_partitioning[5] = partition_borders[2]; + // parents[idx].node_partitioning[6] = partition_borders[6]; + // parents[idx].node_partitioning[7] = parents[idx].points_end; for(uint i = 0; i < 8; ++i){ if(i == 0) { - parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i]; + parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i] - parents[idx].points_start; } else { parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i] - parents[idx].node_partitioning[i-1]; } } + debug[idx].thread_id = 4444; for(uint i = 0; i < 8; ++i){ if(i == 0) { - children[idx + i].points_start = 0; + children[idx * 8 + i].points_start = parents[idx].points_start; } else { - children[idx + i].points_start = parents[idx].node_partitioning[i - 1]; + children[idx * 8 + i].points_start = parents[idx].node_partitioning[i - 1]; } - children[idx + i].points_end = parents[idx].node_partitioning[i]; + children[idx * 8 + i].points_end = parents[idx].node_partitioning[i]; + children[idx * 8 + i].node_partitioning[0] = children[idx * 8 + i].points_end; + children[idx * 8 + i].points_per_partition[0] = 
parents[idx].points_per_partition[i]; double[2][3] child_bounds = get_childs_bounds( partition_order, double[3](x_partition, y_partition, z_partition), i, parent ); - children[idx + i].bounds_min = child_bounds[0]; - children[idx + i].bounds_max = child_bounds[1]; + children[idx * 8 + i].bounds_min = child_bounds[0]; + children[idx* 8 + i].bounds_max = child_bounds[1]; + } + debug[idx].thread_id = 55555; + debug[idx].debug_order = partition_order; + for(uint i = 0; i < 8; ++i){ + debug[idx].debug_borders[i] = parents[idx].node_partitioning[i]; } - + debug[idx].points_start = parents[idx].points_start; + debug[idx].points_end = parents[idx].points_end; } diff --git a/pasture-tools/src/lib.rs b/pasture-tools/src/lib.rs index 1214913..ae7ea54 100644 --- a/pasture-tools/src/lib.rs +++ b/pasture-tools/src/lib.rs @@ -1,3 +1,3 @@ extern crate self as pasture_tools; - +#[cfg(feature = "gpu")] pub mod acceleration_structures; diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs index 75df6ac..1e41790 100644 --- a/pasture-tools/src/main.rs +++ b/pasture-tools/src/main.rs @@ -1,4 +1,3 @@ -#[cfg(feature = "gpu")] mod ex { use pasture_core::containers::{InterleavedVecPointStorage, PointBuffer, PointBufferExt}; @@ -102,36 +101,103 @@ mod ex { let mut reader = LASReader::from_path( //"/home/jnoice/dev/pasture/pasture-io/examples/in/10_points_format_1.las", - "/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", + //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", //"/home/jnoice/Downloads/interesting.las", + "/home/jnoice/Downloads/20150930_matsch_flight2_rgb_densified_point_cloud_part_1 - Cloud.las", + //"/home/jnoice/Downloads/45123H3316.laz", + //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", )?; let count = reader.remaining_points(); let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); reader.read_into(&mut buffer, count)?; - for point in buffer.iter_point::().take(5) { - println!("{:?}", point); - } let 
bounds = reader.get_metadata().bounds().unwrap(); - let device = gpu::Device::new(gpu::DeviceOptions { - device_power: gpu::DevicePower::High, - device_backend: gpu::DeviceBackend::Vulkan, - use_adapter_features: true, - use_adapter_limits: true, - }) - .await; + // let device = gpu::Device::new(gpu::DeviceOptions { + // device_power: gpu::DevicePower::High, + // device_backend: gpu::DeviceBackend::Vulkan, + // use_adapter_features: true, + // use_adapter_limits: true, + // }) + // .await; + // + // let mut device = match device { + // Ok(d) => d, + // Err(_) => { + // println!("Failed to request device. Aborting."); + // return Ok(()); + // } + // }; + // + // device.print_device_info(); + // device.print_active_features(); + // device.print_active_limits(); + // println!("\n"); + // + // let attribs = &[attributes::POSITION_3D]; + // + // let buffer_info_interleaved = gpu::BufferInfoInterleaved { + // attributes: attribs, + // binding: 0, + // }; + // + // let mut gpu_point_buffer = GpuPointBufferInterleaved::new(); + // gpu_point_buffer.malloc( + // count as u64, + // &buffer_info_interleaved, + // &mut device.wgpu_device, + // ); + // gpu_point_buffer.upload( + // &buffer, + // 0..buffer.len(), + // &buffer_info_interleaved, + // &mut device.wgpu_device, + // &device.wgpu_queue, + // ); + // + // device.set_bind_group( + // 0, + // gpu_point_buffer.bind_group_layout.as_ref().unwrap(), + // gpu_point_buffer.bind_group.as_ref().unwrap(), + // ); + // device.set_compute_shader_glsl(include_str!( + // "acceleration_structures/shaders/interleaved.comp" + // )); + // device.compute(1, 1, 1); + // + // println!("\n===== COMPUTE =====\n"); + // + // println!("Before:"); + // for point in point_buffer.iter_point::().take(5) { + // println!("{:?}", point); + // } + // println!(); + // + // gpu_point_buffer + // .download_into_interleaved( + // &mut buffer, + // 0..count, + // &buffer_info_interleaved, + // &device.wgpu_device, + // ) + // .await; + // + // 
println!("After:"); + // for point in point_buffer.iter_point::().take(5) { + // println!("{:?}", point); + // } - let mut device = match device { - Ok(d) => d, - Err(_) => { - println!("Failed to request device. Aborting."); + let mut octree = + pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 500).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); return Ok(()); } }; - let mut octree = pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 2); octree.construct(MyPointType::layout()).await; Ok(()) } From a82085b9cddc7dbe06eaa69ffc789638dbbcdd71 Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Fri, 28 Jan 2022 12:14:51 +0100 Subject: [PATCH 05/15] large files vulkan error --- pasture-tools/Cargo.toml | 3 +- .../src/acceleration_structures/gpu_octree.rs | 302 ++++++++---------- pasture-tools/src/main.rs | 10 +- 3 files changed, 148 insertions(+), 167 deletions(-) diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index 28ba0dd..587f139 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -19,12 +19,13 @@ pasture-derive = {version = "=0.2.0", path = "../pasture-derive" } anyhow = "1.0.34" clap = "2.33.3" log = "0.4" +env_logger = "0.9.0" pretty_env_logger = "0.4.0" plotters = "^0.3.0" rand = {version = "0.8.3", features = ["small_rng"] } #gpu related -wgpu = { version = "0.11.0", features = ["spirv"], optional = true } +wgpu = { version = "0.12.0", features = ["spirv"], optional = true } shaderc = { version = "0.7.2", optional = true } futures = { version = "0.3", optional = true } bytemuck = { version = "1.5.1", optional = true } diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 5f4176b..702bb98 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -11,6 +11,7 @@ use std::borrow::Cow; use 
std::convert::TryInto; use std::fs::File; use std::io::prelude::*; +use wgpu::util::DeviceExt; #[repr(C)] #[derive(PointType, Debug)] @@ -203,39 +204,24 @@ impl<'a> GpuOctree<'a> { points.push(point); } - let mut construction_shader = File::open( - "/home/jnoice/dev/pasture/pasture-tools/src/acceleration_structures/shaders/comp.spv", - ) - .unwrap(); - let mut shader_bytes: Vec = Vec::new(); - for byte in construction_shader.bytes() { - shader_bytes.push(byte.unwrap()); - } - let shader_words: Vec = shader_bytes - .chunks_exact(4) - .map(|b| u32::from_le_bytes(b.try_into().unwrap())) - .collect(); let mut compiler = shaderc::Compiler::new().unwrap(); - // let shader = self - // .gpu_device - // .create_shader_module(&wgpu::ShaderModuleDescriptor { - // label: Some("NodeGenerationShader"), - // source: wgpu::ShaderSource::SpirV(Cow::from(shader_words.as_slice())), - // }); let comp_shader = include_str!("shaders/generate_nodes.comp"); - let comp_spirv = compiler.compile_into_spirv( - comp_shader, - shaderc::ShaderKind::Compute, - "ComputeShader", - "main", - None - ) - .unwrap(); + let comp_spirv = compiler + .compile_into_spirv( + comp_shader, + shaderc::ShaderKind::Compute, + "ComputeShader", + "main", + None, + ) + .unwrap(); let comp_data = wgpu::util::make_spirv(comp_spirv.as_binary_u8()); - let shader = self.gpu_device.create_shader_module(&wgpu::ShaderModuleDescriptor{ - label: Some("ModeGenerationShader"), - source: comp_data, - }); + let shader = self + .gpu_device + .create_shader_module(&wgpu::ShaderModuleDescriptor { + label: Some("ModeGenerationShader"), + source: comp_data, + }); let points_bind_group_layout = self.gpu_device .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { @@ -317,18 +303,15 @@ impl<'a> GpuOctree<'a> { entry_point: "main", }); - let gpu_point_buffer = wgpu::util::DeviceExt::create_buffer_init( - &self.gpu_device, - &wgpu::util::BufferInitDescriptor { - label: Some("PointBuffer"), - contents: raw_points.as_slice(), - usage: 
wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }, - ); + let gpu_point_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("PointBuffer"), + contents: &raw_points.as_slice(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }); let mut root_node = OctreeNode { bounds: self.bounds, @@ -365,83 +348,87 @@ impl<'a> GpuOctree<'a> { .flat_map(|x| x.to_le_bytes().to_vec()) .collect(); - while !current_nodes.is_empty() { - //for i in 0..2 { - //let num_new_nodes = 8u64.pow(tree_depth) - num_leaves as u64; + let point_index_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("IndexBuffer"), + contents: &raw_indeces.as_slice(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }); - let point_index_buffer = wgpu::util::DeviceExt::create_buffer_init( - &self.gpu_device, - &wgpu::util::BufferInitDescriptor { - label: Some("IndexBuffer"), - contents: raw_indeces.as_slice(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }, - ); - let debug_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - label: Some("DebugBuffer"), - size: (3 * 4 + 8 * 4 + 4 + 2 * 4) * current_nodes.len() as u64, - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, + let debug_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("DebugBuffer"), + size: (3 * 4 + 8 * 4 + 4 + 2 * 4) as u64, + 
usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, + }); + + let points_bind_group = self + .gpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("PointBufferBindGroup"), + layout: &points_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: gpu_point_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: point_index_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: debug_buffer.as_entire_binding(), + }, + ], + }); + let mut iterations = current_nodes.len(); + //while !current_nodes.is_empty() { + for i in 0..iterations { + //let num_new_nodes = 8u64.pow(tree_depth) - num_leaves as u64; + let child_buffer_size = 120 * current_nodes.len() as u64 * 8 as u64; + let child_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: child_buffer_size, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); - let new_nodes_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: Some("NewNodesBuffer"), size: //(mem::size_of::() - mem::size_of::>()) as u64 - 128 - * current_nodes.len() as u64 * 8 as u64, - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC + child_buffer_size, + usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, mapped_at_creation: false, }); - let points_bind_group = self - .gpu_device - .create_bind_group(&wgpu::BindGroupDescriptor { - label: Some("PointBufferBindGroup"), - layout: &points_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding: 0, - resource: gpu_point_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 1, - resource: 
point_index_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 2, - resource: debug_buffer.as_entire_binding(), - }, - ], - }); + let mut parent_nodes_raw = Vec::new(); for node in ¤t_nodes { parent_nodes_raw.append(&mut node.into_raw()); } - - let parent_nodes_buffer = wgpu::util::DeviceExt::create_buffer_init( - &self.gpu_device, + let parent_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: parent_nodes_raw.len() as u64, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + let parent_nodes_buffer = self.gpu_device.create_buffer_init( &wgpu::util::BufferInitDescriptor { label: Some("ParentNodesBuffer"), contents: parent_nodes_raw.as_slice(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC + usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, }, ); - let nodes_bind_group = self .gpu_device .create_bind_group(&wgpu::BindGroupDescriptor { @@ -477,104 +464,95 @@ impl<'a> GpuOctree<'a> { compute_pass.insert_debug_marker("Pasture Compute Debug"); compute_pass.dispatch(current_nodes.len() as u32, 1, 1); } + encoder.copy_buffer_to_buffer(&new_nodes_buffer, 0, &child_nodes_buffer_staging, 0, child_buffer_size); + encoder.copy_buffer_to_buffer(&parent_nodes_buffer, 0, &parent_nodes_buffer_staging, 0, parent_nodes_raw.len() as u64); self.gpu_queue.submit(Some(encoder.finish())); - let mapped_future = point_index_buffer.slice(..).map_async(wgpu::MapMode::Read); + let index_slice = point_index_buffer.slice(..); + let mapped_future = index_slice.map_async(wgpu::MapMode::Read); self.gpu_device.poll(wgpu::Maintain::Wait); if let Ok(()) = mapped_future.await { - let mapped_index_buffer = point_index_buffer.slice(..).get_mapped_range().to_vec(); - let indices: Vec = mapped_index_buffer + let mapped_index_buffer = index_slice.get_mapped_range(); + let 
index_vec = mapped_index_buffer.to_vec(); + let indices: Vec = index_vec .chunks_exact(4) .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); - // self.point_partitioning = indices.clone(); - // indices.iter().for_each(|x| { - // println!("{}", x); - // }); - raw_indeces = mapped_index_buffer.clone(); + + raw_indeces = index_vec.clone(); + drop(mapped_index_buffer); + point_index_buffer.unmap(); } - point_index_buffer.unmap(); - let mapped_debug = debug_buffer.slice(..).map_async(wgpu::MapMode::Read); + + let debug_slice = debug_buffer.slice(..); + let mapped_debug = debug_slice.map_async(wgpu::MapMode::Read); self.gpu_device.poll(wgpu::Maintain::Wait); if let Ok(()) = mapped_debug.await { - let mut debug: Vec = debug_buffer - .slice(..) - .get_mapped_range() + let debug_data = debug_slice.get_mapped_range(); + let mut debug: Vec = debug_data .to_vec() .chunks_exact(4) .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); - for k in 0..current_nodes.len() { - let partition_order: Vec = debug.drain(..3).collect(); - let borders: Vec = debug.drain(..8).collect(); - let thread_id: Vec = debug.drain(..1).collect(); - let start_end: Vec = debug.drain(..2).collect(); - println!( - " Node {} | Partition Order: {:?} \n Partition borders: {:?}\n thread index: {}\n start/end: {:?}", - k, partition_order, borders, thread_id.first().unwrap(), start_end, + let partition_order: Vec = debug.drain(..3).collect(); + let borders: Vec = debug.drain(..8).collect(); + let thread_id: Vec = debug.drain(..1).collect(); + let start_end: Vec = debug.drain(..2).collect(); + println!( + " Partition Order: {:?} \n Partition borders: {:?}\n thread index: {}\n start/end: {:?}", + partition_order, borders, thread_id.first().unwrap(), start_end, ); - } + drop(debug_data); + debug_buffer.unmap(); } - let mapped_children = new_nodes_buffer.slice(..).map_async(wgpu::MapMode::Read); - self.gpu_device.poll(wgpu::Maintain::Wait); - let mapped_parents = 
parent_nodes_buffer.slice(..).map_async(wgpu::MapMode::Read); + let parents_slice = parent_nodes_buffer_staging.slice(..); + let parents_future = parents_slice.map_async(wgpu::MapMode::Read); self.gpu_device.poll(wgpu::Maintain::Wait); - if matches!(mapped_parents.await, Ok(())) && matches!(mapped_children.await, Ok(())) { - let mapped_node_buffer = parent_nodes_buffer.slice(..).get_mapped_range().to_vec(); + //if matches!(mapped_parents.await, Ok(())) && matches!(mapped_children.await, Ok(())) { + if let Ok(()) =parents_future.await { + let mapped_nodes_data = parents_slice.get_mapped_range(); + let mapped_node_buffer = mapped_nodes_data.to_vec(); let nodes: Vec = mapped_node_buffer .chunks_exact(120) .map(|b| OctreeNode::from_raw(b.to_vec())) .collect(); - let mapped_children = new_nodes_buffer.slice(..).get_mapped_range().to_vec(); - let mut children: Vec = mapped_children + + let children_slice = child_nodes_buffer_staging.slice(..); + let children_future = children_slice.map_async(wgpu::MapMode::Read); + self.gpu_device.poll(wgpu::Maintain::Wait); + + if let Ok(()) = children_future.await { + + let mapped_children_data = children_slice.get_mapped_range(); + let mapped_children_buffer = mapped_children_data.to_vec(); + let mut children: Vec = mapped_children_buffer .chunks_exact(120) .map(|b| OctreeNode::from_raw(b.to_vec())) .collect(); let mut generated_children: Vec<&mut OctreeNode> = Vec::new(); for mut node in nodes { let children_sizes = node.points_per_partition.clone(); - // println!("=========== Node ==========="); - // println!("{:?}", node); + let mut local_children: Vec = children.drain(..8).collect(); node.children = Some(local_children.into_boxed_slice()); let mut node_ref = current_nodes.swap_remove(0); - // println!("Number of parents left: {}", current_nodes.len()); *node_ref = node; - // println!( - // "Parent node point start: {}, point end: {}", - // &node_ref.point_start, &node_ref.point_end - // ); - // println!( - // "Points per partition 
in parent : {:?}", - // &node_ref.points_per_partition - // ); - // println!("Parent node bounds: {:?}", &node_ref.bounds); let mut children: &mut Box<[OctreeNode]> = node_ref.children.as_mut().unwrap(); let iter = (*children).iter_mut(); let mut child_index = 0; for child in iter { - // println!("========== Child ==========="); - // println!("{:?}", child); - // println!( - // "{}, {}", - // children_sizes[child_index] != 0, - // !child.is_leaf(self.points_per_node) - // ); + if children_sizes[child_index] != 0 && !child.is_leaf(self.points_per_node) { generated_children.push(child); } else { num_leaves += 1; - // println!( - // "Child {} is LEAF. Point Count: {}", - // child_index, children_sizes[child_index] - // ); } num_nodes += 1; @@ -582,24 +560,22 @@ impl<'a> GpuOctree<'a> { } } current_nodes.append(&mut generated_children); + drop(mapped_nodes_data); + parent_nodes_buffer_staging.unmap(); + drop(mapped_children_data); + child_nodes_buffer_staging.unmap(); + // parent_nodes_buffer.destroy(); + // new_nodes_buffer.destroy(); } - parent_nodes_buffer.unmap(); - new_nodes_buffer.unmap(); - // println!( - // "Num Nodes: {}, Num New Parents: {}, Num Leaves: {}", - // num_nodes, - // current_nodes.len(), - // num_leaves - // ); - + } + let work_done = self.gpu_queue.on_submitted_work_done(); + work_done.await; tree_depth += 1; + iterations = current_nodes.len(); - // println!("{}", tree_depth); - // println!("{:?}", current_nodes); } self.root_node = Some(root_node); - println!("{:?}", self.root_node); - // println!("Tree depth = {}", tree_depth); + //println!("{:?}", self.root_node); } } diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs index 1e41790..5a8d527 100644 --- a/pasture-tools/src/main.rs +++ b/pasture-tools/src/main.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate log; + mod ex { use pasture_core::containers::{InterleavedVecPointStorage, PointBuffer, PointBufferExt}; @@ -49,7 +52,8 @@ mod ex { async fn run() -> Result<()> { // == Init 
point buffer ====================================================================== - + env_logger::init(); + info!("starting up"); let points = vec![ MyPointType { position: Vector3::new(1.0, 0.0, 0.0), @@ -103,9 +107,9 @@ mod ex { //"/home/jnoice/dev/pasture/pasture-io/examples/in/10_points_format_1.las", //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", //"/home/jnoice/Downloads/interesting.las", - "/home/jnoice/Downloads/20150930_matsch_flight2_rgb_densified_point_cloud_part_1 - Cloud.las", + //"/home/jnoice/Downloads/20150930_matsch_flight2_rgb_densified_point_cloud_part_1 - Cloud.las", //"/home/jnoice/Downloads/45123H3316.laz", - //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", + "/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", )?; let count = reader.remaining_points(); let mut buffer = From 467974283c6d87af5729499e171621c3d2ff21c9 Mon Sep 17 00:00:00 2001 From: jneus Date: Fri, 4 Feb 2022 09:17:22 +0100 Subject: [PATCH 06/15] Testing Laptop --- pasture-tools/Cargo.toml | 3 +- .../src/acceleration_structures/gpu_octree.rs | 91 +++++++++++++++++-- .../shaders/generate_nodes.comp | 21 +++-- pasture-tools/src/main.rs | 14 +-- 4 files changed, 100 insertions(+), 29 deletions(-) diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index 587f139..15de7f1 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -23,9 +23,10 @@ env_logger = "0.9.0" pretty_env_logger = "0.4.0" plotters = "^0.3.0" rand = {version = "0.8.3", features = ["small_rng"] } +tokio = { version = "1.16.1", features = ["full"] } #gpu related -wgpu = { version = "0.12.0", features = ["spirv"], optional = true } +wgpu = { version = "0.11.0", features = ["spirv"], optional = true } shaderc = { version = "0.7.2", optional = true } futures = { version = "0.3", optional = true } bytemuck = { version = "1.5.1", optional = true } diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs 
b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 702bb98..014e6d8 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -42,7 +42,7 @@ struct MyPointType { pub gps_time: f64, } #[derive(Debug, Clone)] -struct OctreeNode { +pub struct OctreeNode { bounds: AABB, children: Option>, node_partitioning: [u32; 8], @@ -156,7 +156,7 @@ impl<'a> GpuOctree<'a> { max_bounds: AABB, points_per_node: u32, ) -> Result, wgpu::RequestDeviceError> { - let instance = wgpu::Instance::new(wgpu::Backends::all()); + let instance = wgpu::Instance::new(wgpu::Backends::VULKAN); let adapter = instance .request_adapter(&wgpu::RequestAdapterOptions { power_preference: wgpu::PowerPreference::HighPerformance, @@ -189,7 +189,7 @@ impl<'a> GpuOctree<'a> { pub fn print_tree(&self) { println!("{:?}", self.root_node); } - pub async fn construct(&mut self, layout: PointLayout) { + pub async fn construct(&mut self) { let point_count = self.point_buffer.len(); let mut points: Vec> = Vec::new(); let point_iterator: AttributeIteratorByValue, dyn PointBuffer> = @@ -291,7 +291,7 @@ impl<'a> GpuOctree<'a> { self.gpu_device .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { label: Some("ConstructionPipelineLayout"), - bind_group_layouts: &[&points_bind_group_layout, &nodes_bind_group_layout], + bind_group_layouts: &[&nodes_bind_group_layout, &points_bind_group_layout], push_constant_ranges: &[], }); let compute_pipeline = @@ -390,8 +390,8 @@ impl<'a> GpuOctree<'a> { ], }); let mut iterations = current_nodes.len(); - //while !current_nodes.is_empty() { - for i in 0..iterations { + while !current_nodes.is_empty() { + //for i in 0..iterations { //let num_new_nodes = 8u64.pow(tree_depth) - num_leaves as u64; let child_buffer_size = 120 * current_nodes.len() as u64 * 8 as u64; let child_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { @@ -455,8 +455,9 @@ impl<'a> GpuOctree<'a> { label: 
Some("ConstructionComputePass"), }); compute_pass.set_pipeline(&compute_pipeline); - compute_pass.set_bind_group(0, &points_bind_group, &[]); - compute_pass.set_bind_group(1, &nodes_bind_group, &[]); + + compute_pass.set_bind_group(0, &nodes_bind_group, &[]); + compute_pass.set_bind_group(1, &points_bind_group, &[]); println!( "Starting gpu computation with {} threads", current_nodes.len() @@ -466,6 +467,7 @@ impl<'a> GpuOctree<'a> { } encoder.copy_buffer_to_buffer(&new_nodes_buffer, 0, &child_nodes_buffer_staging, 0, child_buffer_size); encoder.copy_buffer_to_buffer(&parent_nodes_buffer, 0, &parent_nodes_buffer_staging, 0, parent_nodes_raw.len() as u64); + self.gpu_queue.submit(Some(encoder.finish())); let index_slice = point_index_buffer.slice(..); @@ -479,7 +481,7 @@ impl<'a> GpuOctree<'a> { .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); - + self.point_partitioning = indices.clone(); raw_indeces = index_vec.clone(); drop(mapped_index_buffer); point_index_buffer.unmap(); @@ -578,4 +580,75 @@ impl<'a> GpuOctree<'a> { self.root_node = Some(root_node); //println!("{:?}", self.root_node); } + + fn get_points(&self, node: &OctreeNode) -> Vec { + let indices = self.point_partitioning[node.point_start as usize..node.point_end as usize].to_vec(); + return indices; + } +} + +#[cfg(test)] +mod tests { + use pasture_core::containers::InterleavedVecPointStorage; + use pasture_core::containers::PointBufferExt; + use pasture_io::base::PointReader; + use pasture_io::las::LasPointFormat0; + use pasture_io::las::LASReader; + use pasture_core::layout::PointType; + use crate::acceleration_structures::GpuOctree; + use crate::acceleration_structures::OctreeNode; + use pasture_core::nalgebra::Vector3; + use pasture_core::layout::attributes; + use std::convert::TryInto; + use std::error::Error; + + use tokio; + + #[tokio::test] + async fn check_correct_bounds() { + let mut reader = LASReader::from_path("/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz"); + let mut 
reader = match reader { + Ok(a) => a, + Err(b) => panic!("Could not create LAS Reader"), + }; + let count = reader.remaining_points(); + let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(b) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); + + let mut octree = GpuOctree::new(&buffer, bounds, 500).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let mut node = octree.root_node.as_ref().unwrap(); + let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; + while !nodes_to_visit.is_empty() { + let current_node = nodes_to_visit.pop().unwrap(); + let current_bounds = current_node.bounds; + let point_ids = octree.get_points(¤t_node).into_iter(); + for id in point_ids { + let point = buffer.get_point::(id as usize); + let pos: Vector3 = Vector3::from(point.position); + println!("Bounds: {:?}", current_bounds); + println!("Point: {:?}, id: {}", pos, id); + assert!(current_bounds.min().x <= pos.x + && current_bounds.max().x >= pos.x + && current_bounds.min().y <= pos.y + && current_bounds.max().y >= pos.y + && current_bounds.min().z <= pos.z + && current_bounds.max().z >= pos.z); + } + let children = current_node.children.as_ref().unwrap(); + (*children).iter().for_each(|x| nodes_to_visit.push(x)); + + } + } } diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp index 614043d..8685038 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -16,21 +16,22 @@ struct Debug { uint points_end; }; -layout(std430, set=0, binding=0) buffer PointBuffer { +layout(std430, 
set=0, binding=0) buffer ParentNodes{ + Node parents[]; +}; +layout(std430, set=0, binding=1) buffer ChildNodes{ + Node children[]; +}; +layout(std430, set=1, binding=0) buffer PointBuffer { double points[][3]; }; -layout(std430, set=0, binding=1) buffer Partitioning { +layout(std430, set=1, binding=1) buffer Partitioning { uint indeces[]; }; -layout(std430, set=0, binding=2) buffer DebugBuffer { +layout(std430, set=1, binding=2) buffer DebugBuffer { Debug debug[]; }; -layout(std430, set=1, binding=0) buffer ParentNodes{ - Node parents[]; -}; -layout(std430, set=1, binding=1) buffer ChildNodes{ - Node children[]; -}; + layout (local_size_x=1, local_size_y=1, local_size_z=1) in; uint[3] partitioning_order(double x, double y, double z){ @@ -170,7 +171,7 @@ double[2][3] get_childs_bounds(uint[3] partition_order, double[3] partition_axes } void main() { - uint idx = gl_GlobalInvocationID.x; + uint idx = gl_WorkGroupID.x; debug[idx].thread_id = 1111; Node parent = parents[idx]; diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs index 5a8d527..e4f9a97 100644 --- a/pasture-tools/src/main.rs +++ b/pasture-tools/src/main.rs @@ -3,16 +3,11 @@ extern crate log; mod ex { - use pasture_core::containers::{InterleavedVecPointStorage, PointBuffer, PointBufferExt}; - use pasture_core::gpu; - use pasture_core::gpu::GpuPointBufferInterleaved; + use pasture_core::containers::InterleavedVecPointStorage; use pasture_core::layout::PointType; - use pasture_core::layout::{attributes, PointAttributeDataType, PointAttributeDefinition}; - use pasture_core::meta::Metadata; use pasture_core::nalgebra::Vector3; use pasture_derive::PointType; use pasture_io::base::PointReader; - use pasture_io::las::las_bounds_to_pasture_bounds; use pasture_io::las::LASReader; use pasture_io::las::LasPointFormat0; @@ -105,11 +100,12 @@ mod ex { let mut reader = LASReader::from_path( //"/home/jnoice/dev/pasture/pasture-io/examples/in/10_points_format_1.las", - 
//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", + "/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", //"/home/jnoice/Downloads/interesting.las", //"/home/jnoice/Downloads/20150930_matsch_flight2_rgb_densified_point_cloud_part_1 - Cloud.las", //"/home/jnoice/Downloads/45123H3316.laz", - "/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", + //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", + //"/home/jnoice/Downloads/tirol.las", )?; let count = reader.remaining_points(); let mut buffer = @@ -202,7 +198,7 @@ mod ex { } }; - octree.construct(MyPointType::layout()).await; + octree.construct().await; Ok(()) } } From abf28d8556fc5042f83ff72fdae5027dd8c1543f Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Mon, 14 Feb 2022 10:47:06 +0100 Subject: [PATCH 07/15] tests passed --- .../src/acceleration_structures/gpu_octree.rs | 296 ++++++++++++++---- .../acceleration_structures/shaders/comp.spv | Bin 2076 -> 0 bytes .../shaders/generate_nodes.comp | 196 +++++++----- pasture-tools/src/main.rs | 6 +- 4 files changed, 346 insertions(+), 152 deletions(-) delete mode 100644 pasture-tools/src/acceleration_structures/shaders/comp.spv diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 014e6d8..3c56bed 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -12,6 +12,7 @@ use std::convert::TryInto; use std::fs::File; use std::io::prelude::*; use wgpu::util::DeviceExt; +use std::fmt; #[repr(C)] #[derive(PointType, Debug)] @@ -44,7 +45,7 @@ struct MyPointType { #[derive(Debug, Clone)] pub struct OctreeNode { bounds: AABB, - children: Option>, + children: Option>, node_partitioning: [u32; 8], points_per_partition: [u32; 8], point_start: u32, @@ -63,11 +64,16 @@ pub struct GpuOctree<'a> { impl OctreeNode { fn is_leaf(&self, points_per_node: u32) -> bool { - // println!( - // "\npoint start: {}, point 
end: {}\n", - // self.point_start, self.point_end - // ); - return self.point_end - self.point_start <= points_per_node; + // println!( + // "\npoint start: {}, point end: {}\n", + // self.point_start, self.point_end + // ); + let diff: i64 = self.point_end as i64 - self.point_start as i64; + return diff <= points_per_node as i64; + } + fn is_empty(&self) -> bool { + let diff: i64 = self.point_end as i64 - self.point_start as i64; + diff < 0 } fn into_raw(&self) -> Vec { let mut raw_node: Vec = Vec::new(); @@ -150,6 +156,24 @@ impl OctreeNode { } } +impl fmt::Display for OctreeNode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f,"####### Octree Node #######\n"); + write!(f, "Bounds: {:?}\n", self.bounds); + write!(f, "Start: {}, End: {}\n", self.point_start, self.point_end); + write!(f, "Node Partitioning: {:?}\n", self.node_partitioning); + write!(f, "Points per partition: {:?}\n", self.points_per_partition); + write!(f, "Chilren: "); + if let Some(c) = &self.children { + c.iter().for_each(|x| {write!(f, " {}", x);}); + } + else { + write!(f, "None\n"); + } + write!(f, "##########\n") + } +} + impl<'a> GpuOctree<'a> { pub async fn new( point_buffer: &'a dyn PointBuffer, @@ -319,10 +343,10 @@ impl<'a> GpuOctree<'a> { node_partitioning: [0; 8], points_per_partition: [0; 8], point_start: 0, - point_end: point_count as u32 - 1, + point_end: point_count as u32, }; - root_node.node_partitioning[0] = point_count as u32 - 1; - root_node.points_per_partition[0] = point_count as u32 - 1; + root_node.node_partitioning[0] = point_count as u32; + root_node.points_per_partition[0] = point_count as u32; let xdiff = &root_node.bounds.max().x - &root_node.bounds.min().x; let ydiff = &root_node.bounds.max().y - &root_node.bounds.min().y; let zdiff = &root_node.bounds.max().z - &root_node.bounds.min().z; @@ -344,19 +368,11 @@ impl<'a> GpuOctree<'a> { let mut current_nodes = vec![&mut root_node]; let mut children_nodes: Vec> = Vec::new(); - let mut 
raw_indeces: Vec = (0u32..(point_count - 1) as u32) + let mut raw_indeces: Vec = (0u32..point_count as u32) .flat_map(|x| x.to_le_bytes().to_vec()) .collect(); - let point_index_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { - label: Some("IndexBuffer"), - contents: &raw_indeces.as_slice(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }); + let debug_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: Some("DebugBuffer"), @@ -369,30 +385,42 @@ impl<'a> GpuOctree<'a> { mapped_at_creation: false, }); - let points_bind_group = self - .gpu_device - .create_bind_group(&wgpu::BindGroupDescriptor { - label: Some("PointBufferBindGroup"), - layout: &points_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding: 0, - resource: gpu_point_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 1, - resource: point_index_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 2, - resource: debug_buffer.as_entire_binding(), - }, - ], - }); + let mut iterations = current_nodes.len(); while !current_nodes.is_empty() { //for i in 0..iterations { //let num_new_nodes = 8u64.pow(tree_depth) - num_leaves as u64; + let point_index_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("IndexBuffer"), + contents: &raw_indeces.as_slice(), + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::MAP_WRITE + | wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }); + + let points_bind_group = self + .gpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("PointBufferBindGroup"), + layout: &points_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: gpu_point_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + 
resource: point_index_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: debug_buffer.as_entire_binding(), + }, + ], + }); + let child_buffer_size = 120 * current_nodes.len() as u64 * 8 as u64; let child_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: None, @@ -476,12 +504,15 @@ impl<'a> GpuOctree<'a> { if let Ok(()) = mapped_future.await { let mapped_index_buffer = index_slice.get_mapped_range(); let index_vec = mapped_index_buffer.to_vec(); - let indices: Vec = index_vec + let mut indices: Vec = index_vec .chunks_exact(4) .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); self.point_partitioning = indices.clone(); + indices.sort(); + indices.dedup(); + //println!("{:?}", self.point_partitioning.len() - indices.len()); raw_indeces = index_vec.clone(); drop(mapped_index_buffer); point_index_buffer.unmap(); @@ -515,7 +546,7 @@ impl<'a> GpuOctree<'a> { self.gpu_device.poll(wgpu::Maintain::Wait); //if matches!(mapped_parents.await, Ok(())) && matches!(mapped_children.await, Ok(())) { - if let Ok(()) =parents_future.await { + if let Ok(()) = parents_future.await { let mapped_nodes_data = parents_slice.get_mapped_range(); let mapped_node_buffer = mapped_nodes_data.to_vec(); let nodes: Vec = mapped_node_buffer @@ -541,17 +572,25 @@ impl<'a> GpuOctree<'a> { let mut local_children: Vec = children.drain(..8).collect(); - node.children = Some(local_children.into_boxed_slice()); - let mut node_ref = current_nodes.swap_remove(0); + let child_array: [OctreeNode; 8] = local_children.try_into().unwrap(); + node.children = Some(Box::new(child_array)); + + let mut node_ref = current_nodes.remove(0); *node_ref = node; - let mut children: &mut Box<[OctreeNode]> = node_ref.children.as_mut().unwrap(); - let iter = (*children).iter_mut(); + println!("{}", node_ref); + let mut children: &mut Box<[OctreeNode; 8]> = node_ref.children.as_mut().unwrap(); + + let iter = children.iter_mut(); let mut child_index 
= 0; + + //println!("Child Range: {} - {}", node_ref.point_start, node_ref.point_end); for child in iter { + //println!("Node: {}", &child); if children_sizes[child_index] != 0 && !child.is_leaf(self.points_per_node) { + generated_children.push(child); } else { num_leaves += 1; @@ -578,7 +617,11 @@ impl<'a> GpuOctree<'a> { } self.root_node = Some(root_node); - //println!("{:?}", self.root_node); + //println!("Root Bounds {:?}", self.root_node.as_ref().unwrap().bounds); + //println!("{}", self.root_node.as_ref().unwrap()); + + //println!("{:?}", self.point_partitioning.len()); + //println!("{:?}",a); } fn get_points(&self, node: &OctreeNode) -> Vec { @@ -606,7 +649,9 @@ mod tests { #[tokio::test] async fn check_correct_bounds() { - let mut reader = LASReader::from_path("/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz"); + let mut reader = LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" + "/home/jnoice/Downloads/interesting.las" + ); let mut reader = match reader { Ok(a) => a, Err(b) => panic!("Could not create LAS Reader"), @@ -619,7 +664,7 @@ mod tests { }; let bounds = reader.get_metadata().bounds().unwrap(); - let mut octree = GpuOctree::new(&buffer, bounds, 500).await; + let mut octree = GpuOctree::new(&buffer, bounds, 50).await; let mut octree = match octree { Ok(a) => a, Err(b) => { @@ -630,25 +675,150 @@ mod tests { octree.construct().await; let mut node = octree.root_node.as_ref().unwrap(); let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; + while !nodes_to_visit.is_empty() { + let current_node = nodes_to_visit.remove(0); + //if let None = current_node.children{ + + let current_bounds = current_node.bounds; + let point_ids = octree.get_points(¤t_node).into_iter(); + let mut i = 0; + let current_start = current_node.point_start; + for id in point_ids { + let point = buffer.get_point::(id as usize); + let pos: Vector3 = Vector3::from(point.position); + println!("Bounds: {:?}", current_bounds); + // println!("Start: {}, End 
{}", current_node.point_start, current_node.point_end); + // println!("Node Partitioning {:?}", current_node.node_partitioning); + println!("Point: {:?}, id: {} in [{}, {}]", pos,current_start + i, current_node.point_start, current_node.point_end-1); + //println!("{:?}", current_node); + // current_node.children.as_ref().unwrap().iter().for_each(|x| println!("{:?}", x)); + assert!(current_bounds.min().x <= pos.x + && current_bounds.max().x >= pos.x + && current_bounds.min().y <= pos.y + && current_bounds.max().y >= pos.y + && current_bounds.min().z <= pos.z + && current_bounds.max().z >= pos.z); + i+=1; + } + //} + //else { + if let Some(children) = current_node.children.as_ref() { + //let children = current_node.children.as_ref().unwrap(); + (*children).iter().for_each(|x| nodes_to_visit.push(x)); + } + //} + } + } + + #[tokio::test] + async fn check_point_count() { + let mut reader = LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" + "/home/jnoice/Downloads/interesting.las" + ); + let mut reader = match reader { + Ok(a) => a, + Err(b) => panic!("Could not create LAS Reader"), + }; + let count = reader.remaining_points(); + let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(b) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); + + let mut octree = GpuOctree::new(&buffer, bounds, 50).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let mut node = octree.root_node.as_ref().unwrap(); + let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; + let mut point_count: usize = 0; while !nodes_to_visit.is_empty() { let current_node = nodes_to_visit.pop().unwrap(); - let current_bounds = current_node.bounds; - let point_ids = 
octree.get_points(¤t_node).into_iter(); - for id in point_ids { - let point = buffer.get_point::(id as usize); - let pos: Vector3 = Vector3::from(point.position); - println!("Bounds: {:?}", current_bounds); - println!("Point: {:?}, id: {}", pos, id); - assert!(current_bounds.min().x <= pos.x - && current_bounds.max().x >= pos.x - && current_bounds.min().y <= pos.y - && current_bounds.max().y >= pos.y - && current_bounds.min().z <= pos.z - && current_bounds.max().z >= pos.z); + if let None = current_node.children { + println!("{}", current_node); + //println!("{:?}", current_node.points_per_partition); + point_count += current_node.points_per_partition[0] as usize; + } + else { + let children = current_node.children.as_ref().unwrap(); + (*children).iter().for_each(|x| nodes_to_visit.push(x)); } - let children = current_node.children.as_ref().unwrap(); - (*children).iter().for_each(|x| nodes_to_visit.push(x)); + } + println!("Point count of octree: {}, Point Count of Buffer {}", point_count, count); + assert!(point_count == count); + } + #[tokio::test] + async fn check_point_partitioning_duplicates() { + let mut reader = LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" + "/home/jnoice/Downloads/interesting.las" + ); + let mut reader = match reader { + Ok(a) => a, + Err(b) => panic!("Could not create LAS Reader"), + }; + let count = reader.remaining_points(); + let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(b) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); + let mut octree = GpuOctree::new(&buffer, bounds, 50).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let mut indices = octree.point_partitioning.clone(); + indices.sort(); 
+ indices.dedup(); + assert!(indices.len() == octree.point_partitioning.len()); + } + #[tokio::test] + async fn check_node_overflows() { + let mut reader = LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" + "/home/jnoice/Downloads/interesting.las" + ); + let mut reader = match reader { + Ok(a) => a, + Err(b) => panic!("Could not create LAS Reader"), + }; + let count = reader.remaining_points(); + let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(b) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); + + let mut octree = GpuOctree::new(&buffer, bounds, 50).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let mut node = octree.root_node.as_ref().unwrap(); + let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; + while !nodes_to_visit.is_empty() { + let current_node = nodes_to_visit.pop().unwrap(); + assert!(current_node.point_start <= current_node.point_end); + if let Some(children) = ¤t_node.children { + (*children).iter().for_each(|x| nodes_to_visit.push(x)); + } } } } diff --git a/pasture-tools/src/acceleration_structures/shaders/comp.spv b/pasture-tools/src/acceleration_structures/shaders/comp.spv deleted file mode 100644 index 406c88ed51514b6d81bb2158dc2b2a7de6f3cdd3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2076 zcmZ9M=}uHZ5QSS{h7nLeS=>MtcLY%ZQ3Tm!Kv7X2fDlql^k~*ds`d(2dhWb++Z&EshXR}IWzhrq|Z8Tn=&)sNqiD7c8|PoM?Lcs)z~|m8z8o}33FM03NL5A1C&#?#NPR!;0^LT4L%eV{g+q5V3 zjD0ue+NkWu$6aoM)s4DAuyw&d0IMCm?<&tP5SI|}PJS!T?4e>A?>=+xHTHX%lYdpk z){pUa%&Y5iUcbor+jOtr%-@!NhO9F`jrm>t7L(Md(~Oy)Lcx0D;Ld@~&7k7EE?~~s zIm(5OGxUASH=$~L33EpFCl{KR!Ok@_d%&S7m)GnimJiK7aA?YXg--NZ^eE^e>zHfu0Z@JJM 
z0{eDi7I(m*DVNtACYBG)yWr52i&@+Qd!D?r7{R-oh1`A2oU;%|FQed?g*_QNi>TE{ z?O3%o`W`2azU6%XF^h*_`RMx*IQo_g&BtJWC!skB4o$hd<`l7fXg&dlrd-To8ti%U z&SDntau(D08O)rs5JxX_U}y1zcVEQMWA^6v@O{ch?Ne~f%^G9p7P|WHoM->q0v3DA zVE2BhzT-u(InS`)XLy(W$}M5$>{lE*FTuyZsY3QYVujfM8=aVM+t?cWiv9Y(Ol=(d E2Uoj+KmY&$ diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp index 8685038..d5e4f1c 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -60,82 +60,103 @@ void swap( uint a, uint b){ indeces[a] = indeces[b]; indeces[b] = tmp; } +uint partition_run(uint start, uint end, double pivot, uint axis) { + uint i = start; + uint j = start; -void partitioning(uint axis, double threshold, uint iteration, uint id){ - // uint start = parent.points_start; - // uint end = parent.points_end; - // uint iteration_end = 1; - // if(iteration > 0) { - // end = partition_borders[iteration - 1]; - // iteration_end = iteration == 1 ? 2 : 4; - // } - // for(uint k = 0; k < iteration_end; ++k){ - // while(start < end){ - // if(points[indeces[start]][axis] <= threshold){ - // ++start; - // if(points[indeces[end]][axis] > threshold){ - // --end; + while(i < end) { + if(points[indeces[i]][axis] > pivot){ + ++i; + } + else if(points[indeces[i]][axis] <= pivot){ + swap(i, j); + ++i; + ++j; + } + } + + return j; +} + +void partitioning(uint[3] axes, double[3] pivots, uint id){ + uint start = parents[id].points_start; + uint end = parents[id].node_partitioning[0]; + // for(uint k = 0; k < 8; ++k) { + // if(parents[id].node_partitioning[k] != 0 && k < 4 && start < parents[id].points_end){ + // uint end = parents[id].node_partitioning[k]; + // end = end > parents[id].points_start ? 
end - 1 : end; + // while(start <= end) { + // + // debug[id].points_start = start; + // debug[id].points_end = end; + // if(points[indeces[start]][axis] <= threshold){ + // ++start; // } - // } else { - // if(points[indeces[end]][axis] <= threshold){ + // else if(points[indeces[end]][axis] <= threshold){ // swap(start, end); - // ++start; - // --end; - // } else { + // + // } + // else { // --end; // } // } + // + // + // local_partitioning[k * 2] = points[indeces[start]][axis] <= threshold ? start : start; + // local_partitioning[k * 2 + 1] = parents[id].node_partitioning[k]; + // start = parents[id].node_partitioning[k]; // } - // //num_nodes = start; - // if(iteration_end - k <= 2){ - // end = parent.points_end; - // } else { - // end = partition_borders[k * 2]; + // // else if(start >= parents[id].points_end && k < 4){ + // // local_partitioning[k * 2] = parents[id].points_end; + // // local_partitioning[k * 2 + 1] = parents[id].points_end; + // // start = parents[id].points_end; + // // } + // parents[id].node_partitioning[k] = local_partitioning[k]; + //} + // for(uint k = 0; k < 4; ++k) { + // uint end = parents[id].node_partitioning[k]; + // if(end > 0){ + // start = partition_run(start, end, threshold, axis); + // local_partitioning[k * 2] = start; + // local_partitioning[k * 2 + 1] = parents[id].node_partitioning[k]; // } - // partition_borders[iteration_end - 1 + k] = start; - // if(iteration == 1) { - // start = partition_borders[0] + 1; + // } + // for(uint i = 0; i < 8; ++i) { + // parents[id].node_partitioning[i] = local_partitioning[i]; + // } + parents[id].node_partitioning[3] = partition_run(start, end, pivots[axes[0]], axes[0]); + // if(parents[id].node_partitioning[3] > start){ + // parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); + // if(parents[id].node_partitioning[1] > start) { + // parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], 
pivots[axes[2]], axes[2]); // } // else { - // if(k == 0) start = partition_borders[1] + 1; - // else if(k == 1) start = partition_borders[0] + 1; - // else if(k == 2) start = partition_borders[2] + 1; + // parents[id].node_partitioning[0] = start; // } + // if(parents[id].node_partitioning[1] == parents[id].node_partitioning[3]) { + // parents[id].node_partitioning[2] = parents[id].node_partitioning[1]; + // } + // parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); // } - uint start = parents[id].points_start; - debug[id].thread_id = 9999; - uint[8] local_partitioning; - for(uint k = 0; k < 8; ++k) { - if(parents[id].node_partitioning[k] != 0 && k < 7 && start < parents[id].points_end){ - uint end = parents[id].node_partitioning[k]; - while(start < end) { - debug[id].thread_id = 8888; - debug[id].points_start = start; - debug[id].points_end = end; - if(points[indeces[start]][axis] <= threshold){ - ++start; - } - else if(points[indeces[end]][axis] <= threshold){ - swap(start, end); - ++start; - --end; - } - if(points[indeces[end]][axis] > threshold){ - --end; - } - } - - local_partitioning[k * 2] = start; - local_partitioning[k * 2 + 1] = parents[id].node_partitioning[k]; - start = parents[id].node_partitioning[k] ; - } - else if(start >= parents[id].points_end){ - local_partitioning[k * 2] = parents[id].points_end; - local_partitioning[k * 2 + 1] = parents[id].points_end; - start = parents[id].points_end; - } - parents[id].node_partitioning[k] = local_partitioning[k]; - } + // else { + // parents[id].node_partitioning[0] = start; + // parents[id].node_partitioning[1] = start; + // parents[id].node_partitioning[2] = start; + // } + parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); + parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivots[axes[2]], axes[2]); + 
parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); + parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivots[axes[1]], axes[1]); + // if(parents[id].node_partitioning[5] == parents[id].node_partitioning[3]) { + // parents[id].node_partitioning[4] = parents[id].node_partitioning[3]; + // } + // else { + // parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); + // + // } + parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); + parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivots[axes[2]], axes[2]); + parents[id].node_partitioning[7] = end; } bool[3] partitioned_to_right(uint[3] partition_order, uint index){ @@ -143,7 +164,7 @@ bool[3] partitioned_to_right(uint[3] partition_order, uint index){ if(index % 2 != 0){ on_right_side[partition_order[2]] = true; } - else if( index % 4 != 0){ + if(index >= 2 && index <= 3 || index >= 6){ on_right_side[partition_order[1]] = true; } if(index >= 4){ @@ -172,7 +193,6 @@ double[2][3] get_childs_bounds(uint[3] partition_order, double[3] partition_axes void main() { uint idx = gl_WorkGroupID.x; - debug[idx].thread_id = 1111; Node parent = parents[idx]; double x_diff = parent.bounds_max[0] - parent.bounds_min[0]; @@ -181,22 +201,22 @@ void main() { double x_partition = parent.bounds_min[0] + 0.5 * abs(x_diff); double y_partition = parent.bounds_min[1] + 0.5 * abs(y_diff); double z_partition = parent.bounds_min[2] + 0.5 * abs(z_diff); - debug[idx].thread_id = 2222; + double[3] partition_pivots = double[3](x_partition, y_partition, z_partition); uint[3] partition_order = partitioning_order(abs(x_diff), abs(y_diff), abs(z_diff)); - for(uint i = 0; i < 3; ++i){ - uint partition_axis = 
partition_order[i]; - switch(partition_axis){ - case 0: - partitioning(partition_axis, x_partition, i, idx); - break; - case 1: - partitioning(partition_axis, y_partition, i, idx); - break; - case 2: - partitioning(partition_axis, z_partition, i, idx); - } - } - debug[idx].thread_id = 3333; + // for(uint i = 0; i < 3; ++i){ + // uint partition_axis = partition_order[i]; + // switch(partition_axis){ + // case 0: + // partitioning(partition_axis, x_partition, i, idx); + // break; + // case 1: + // partitioning(partition_axis, y_partition, i, idx); + // break; + // case 2: + // partitioning(partition_axis, z_partition, i, idx); + // } + // } + partitioning(partition_order, partition_pivots, idx); // parents[idx].node_partitioning[0] = partition_borders[3]; // parents[idx].node_partitioning[1] = partition_borders[1]; // parents[idx].node_partitioning[2] = partition_borders[4]; @@ -213,16 +233,21 @@ void main() { parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i] - parents[idx].node_partitioning[i-1]; } } - debug[idx].thread_id = 4444; for(uint i = 0; i < 8; ++i){ if(i == 0) { children[idx * 8 + i].points_start = parents[idx].points_start; } else { children[idx * 8 + i].points_start = parents[idx].node_partitioning[i - 1]; } + // if(parents[idx].points_per_partition[i] == 0 || i == 7) { + // children[idx * 8 + i].points_end = parents[idx].node_partitioning[i]; + // } + // else { + // children[idx * 8 + i].points_end = parents[idx].node_partitioning[i] - 1; + // } + children[idx * 8 + i].points_per_partition[0] = parents[idx].points_per_partition[i]; children[idx * 8 + i].points_end = parents[idx].node_partitioning[i]; children[idx * 8 + i].node_partitioning[0] = children[idx * 8 + i].points_end; - children[idx * 8 + i].points_per_partition[0] = parents[idx].points_per_partition[i]; double[2][3] child_bounds = get_childs_bounds( partition_order, double[3](x_partition, y_partition, z_partition), @@ -232,12 +257,11 @@ void main() { children[idx * 8 + 
i].bounds_min = child_bounds[0]; children[idx* 8 + i].bounds_max = child_bounds[1]; } - debug[idx].thread_id = 55555; debug[idx].debug_order = partition_order; for(uint i = 0; i < 8; ++i){ debug[idx].debug_borders[i] = parents[idx].node_partitioning[i]; } - debug[idx].points_start = parents[idx].points_start; - debug[idx].points_end = parents[idx].points_end; + //debug[idx].points_start = parents[idx].points_start; + //debug[idx].points_end = parents[idx].points_end; } diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs index e4f9a97..55e5bf3 100644 --- a/pasture-tools/src/main.rs +++ b/pasture-tools/src/main.rs @@ -100,8 +100,8 @@ mod ex { let mut reader = LASReader::from_path( //"/home/jnoice/dev/pasture/pasture-io/examples/in/10_points_format_1.las", - "/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", - //"/home/jnoice/Downloads/interesting.las", + //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", + "/home/jnoice/Downloads/interesting.las", //"/home/jnoice/Downloads/20150930_matsch_flight2_rgb_densified_point_cloud_part_1 - Cloud.las", //"/home/jnoice/Downloads/45123H3316.laz", //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", @@ -189,7 +189,7 @@ mod ex { // } let mut octree = - pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 500).await; + pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 50).await; let mut octree = match octree { Ok(a) => a, Err(b) => { From 9b3a567c91afedd20c1ce166d24688e6108c5617 Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Tue, 15 Feb 2022 16:38:51 +0100 Subject: [PATCH 08/15] cleaned up code --- .../src/acceleration_structures/gpu_octree.rs | 294 ++++++++---------- .../shaders/generate_nodes.comp | 28 +- pasture-tools/src/main.rs | 6 +- 3 files changed, 151 insertions(+), 177 deletions(-) diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 3c56bed..1658afa 100644 --- 
a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -10,38 +10,12 @@ use pasture_derive::PointType; use std::borrow::Cow; use std::convert::TryInto; use std::fs::File; -use std::io::prelude::*; use wgpu::util::DeviceExt; use std::fmt; +use std::sync::Arc; +use std::thread; +use std::mem; -#[repr(C)] -#[derive(PointType, Debug)] -struct MyPointType { - #[pasture(BUILTIN_POSITION_3D)] - pub position: Vector3, - #[pasture(BUILTIN_COLOR_RGB)] - pub icolor: Vector3, - #[pasture(attribute = "MyColorF32")] - pub fcolor: Vector3, - #[pasture(attribute = "MyVec3U8")] - pub byte_vec: Vector3, - #[pasture(BUILTIN_CLASSIFICATION)] - pub classification: u8, - #[pasture(BUILTIN_INTENSITY)] - pub intensity: u16, - #[pasture(BUILTIN_SCAN_ANGLE)] - pub scan_angle: i16, - #[pasture(BUILTIN_SCAN_DIRECTION_FLAG)] - pub scan_dir_flag: bool, - #[pasture(attribute = "MyInt32")] - pub my_int: i32, - #[pasture(BUILTIN_WAVEFORM_PACKET_SIZE)] - pub packet_size: u32, - #[pasture(BUILTIN_RETURN_POINT_WAVEFORM_LOCATION)] - pub ret_point_loc: f32, - #[pasture(BUILTIN_GPS_TIME)] - pub gps_time: f64, -} #[derive(Debug, Clone)] pub struct OctreeNode { bounds: AABB, @@ -63,11 +37,12 @@ pub struct GpuOctree<'a> { } impl OctreeNode { + fn size() -> usize { + let mut size = mem::size_of::(); + size -= mem::size_of::>>(); + size + } fn is_leaf(&self, points_per_node: u32) -> bool { - // println!( - // "\npoint start: {}, point end: {}\n", - // self.point_start, self.point_end - // ); let diff: i64 = self.point_end as i64 - self.point_start as i64; return diff <= points_per_node as i64; } @@ -102,9 +77,8 @@ impl OctreeNode { .collect(), ); raw_node.append(&mut self.point_start.to_le_bytes().to_vec()); - //[0u8; 4].iter().for_each(|&x| raw_node.push(x)); raw_node.append(&mut self.point_end.to_le_bytes().to_vec()); - //[0u8; 4].iter().for_each(|&x| raw_node.push(x)); + raw_node } fn from_raw(mut data: Vec) -> Self { @@ -165,7 +139,7 @@ 
impl fmt::Display for OctreeNode { write!(f, "Points per partition: {:?}\n", self.points_per_partition); write!(f, "Chilren: "); if let Some(c) = &self.children { - c.iter().for_each(|x| {write!(f, " {}", x);}); + c.iter().for_each(|x| {write!(f, " {}", x);}); } else { write!(f, "None\n"); @@ -210,20 +184,26 @@ impl<'a> GpuOctree<'a> { points_per_node, }) } + pub fn print_tree(&self) { - println!("{:?}", self.root_node); + println!("{}", self.root_node.as_ref().unwrap()); } + pub async fn construct(&mut self) { let point_count = self.point_buffer.len(); let mut points: Vec> = Vec::new(); + let point_iterator: AttributeIteratorByValue, dyn PointBuffer> = self.point_buffer.iter_attribute(&attributes::POSITION_3D); + let mut raw_points = vec![0u8; 24 * point_count]; + self.point_buffer.get_raw_attribute_range( 0..point_count, &attributes::POSITION_3D, raw_points.as_mut_slice(), ); + for point in point_iterator { points.push(point); } @@ -239,6 +219,7 @@ impl<'a> GpuOctree<'a> { None, ) .unwrap(); + let comp_data = wgpu::util::make_spirv(comp_spirv.as_binary_u8()); let shader = self .gpu_device @@ -246,6 +227,7 @@ impl<'a> GpuOctree<'a> { label: Some("ModeGenerationShader"), source: comp_data, }); + let points_bind_group_layout = self.gpu_device .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { @@ -270,19 +252,20 @@ impl<'a> GpuOctree<'a> { }, count: None, }, - wgpu::BindGroupLayoutEntry { - binding: 2, - visibility: wgpu::ShaderStages::COMPUTE, - ty: wgpu::BindingType::Buffer { - ty: wgpu::BufferBindingType::Storage { read_only: false }, - has_dynamic_offset: false, - min_binding_size: None, - }, - count: None, - }, + // wgpu::BindGroupLayoutEntry { + // binding: 2, + // visibility: wgpu::ShaderStages::COMPUTE, + // ty: wgpu::BindingType::Buffer { + // ty: wgpu::BufferBindingType::Storage { read_only: false }, + // has_dynamic_offset: false, + // min_binding_size: None, + // }, + // count: None, + // }, ], label: Some("PointBufferBindGroupLayout"), }); + 
let mut nodes_bind_group_layout = self.gpu_device .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { @@ -318,6 +301,7 @@ impl<'a> GpuOctree<'a> { bind_group_layouts: &[&nodes_bind_group_layout, &points_bind_group_layout], push_constant_ranges: &[], }); + let compute_pipeline = self.gpu_device .create_compute_pipeline(&wgpu::ComputePipelineDescriptor { @@ -330,10 +314,7 @@ impl<'a> GpuOctree<'a> { let gpu_point_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { label: Some("PointBuffer"), contents: &raw_points.as_slice(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, }); @@ -372,63 +353,61 @@ impl<'a> GpuOctree<'a> { .flat_map(|x| x.to_le_bytes().to_vec()) .collect(); - - - let debug_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - label: Some("DebugBuffer"), - size: (3 * 4 + 8 * 4 + 4 + 2 * 4) as u64, - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC + // let debug_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + // label: Some("DebugBuffer"), + // size: (3 * 4 + 8 * 4 + 4 + 2 * 4) as u64, + // usage: wgpu::BufferUsages::MAP_READ + // | wgpu::BufferUsages::STORAGE, + // mapped_at_creation: false, + // }); + + let point_index_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("IndexBuffer"), + contents: &raw_indeces.as_slice(), + usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, - mapped_at_creation: false, }); - + let index_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("CPU_IndexBuffer"), + size: raw_indeces.len() as u64, + usage: wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + let points_bind_group 
= self + .gpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("PointBufferBindGroup"), + layout: &points_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: gpu_point_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: point_index_buffer.as_entire_binding(), + }, + // wgpu::BindGroupEntry { + // binding: 2, + // resource: debug_buffer.as_entire_binding(), + // }, + ], + }); let mut iterations = current_nodes.len(); - while !current_nodes.is_empty() { - //for i in 0..iterations { - //let num_new_nodes = 8u64.pow(tree_depth) - num_leaves as u64; - let point_index_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { - label: Some("IndexBuffer"), - contents: &raw_indeces.as_slice(), - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::MAP_WRITE - | wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }); - let points_bind_group = self - .gpu_device - .create_bind_group(&wgpu::BindGroupDescriptor { - label: Some("PointBufferBindGroup"), - layout: &points_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding: 0, - resource: gpu_point_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 1, - resource: point_index_buffer.as_entire_binding(), - }, - wgpu::BindGroupEntry { - binding: 2, - resource: debug_buffer.as_entire_binding(), - }, - ], - }); + while !current_nodes.is_empty() { - let child_buffer_size = 120 * current_nodes.len() as u64 * 8 as u64; + let child_buffer_size = 8 * (OctreeNode::size() * current_nodes.len()) as u64; //8 * 120 * current_nodes.len() as u64; let child_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - label: None, + label: Some("CPU_NewNodesBuffer"), size: child_buffer_size, usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); - let new_nodes_buffer = 
self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + let child_nodes_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: Some("NewNodesBuffer"), size: //(mem::size_of::() - mem::size_of::>()) as u64 child_buffer_size, @@ -443,9 +422,10 @@ impl<'a> GpuOctree<'a> { parent_nodes_raw.append(&mut node.into_raw()); } let parent_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - label: None, + label: Some("CPU_ParentNodesBuffer"), size: parent_nodes_raw.len() as u64, - usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + usage: wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); let parent_nodes_buffer = self.gpu_device.create_buffer_init( @@ -469,7 +449,7 @@ impl<'a> GpuOctree<'a> { }, wgpu::BindGroupEntry { binding: 1, - resource: new_nodes_buffer.as_entire_binding(), + resource: child_nodes_buffer.as_entire_binding(), }, ], }); @@ -486,21 +466,21 @@ impl<'a> GpuOctree<'a> { compute_pass.set_bind_group(0, &nodes_bind_group, &[]); compute_pass.set_bind_group(1, &points_bind_group, &[]); - println!( - "Starting gpu computation with {} threads", - current_nodes.len() - ); + compute_pass.insert_debug_marker("Pasture Compute Debug"); compute_pass.dispatch(current_nodes.len() as u32, 1, 1); } - encoder.copy_buffer_to_buffer(&new_nodes_buffer, 0, &child_nodes_buffer_staging, 0, child_buffer_size); + encoder.copy_buffer_to_buffer(&child_nodes_buffer, 0, &child_nodes_buffer_staging, 0, child_buffer_size); encoder.copy_buffer_to_buffer(&parent_nodes_buffer, 0, &parent_nodes_buffer_staging, 0, parent_nodes_raw.len() as u64); + encoder.copy_buffer_to_buffer(&point_index_buffer, 0, &index_buffer_staging, 0, raw_indeces.len() as u64); self.gpu_queue.submit(Some(encoder.finish())); - let index_slice = point_index_buffer.slice(..); + let index_slice = index_buffer_staging.slice(..); let mapped_future = index_slice.map_async(wgpu::MapMode::Read); + 
self.gpu_device.poll(wgpu::Maintain::Wait); + if let Ok(()) = mapped_future.await { let mapped_index_buffer = index_slice.get_mapped_range(); let index_vec = mapped_index_buffer.to_vec(); @@ -510,47 +490,43 @@ impl<'a> GpuOctree<'a> { .collect(); self.point_partitioning = indices.clone(); - indices.sort(); - indices.dedup(); - //println!("{:?}", self.point_partitioning.len() - indices.len()); + raw_indeces = index_vec.clone(); drop(mapped_index_buffer); - point_index_buffer.unmap(); - } - - let debug_slice = debug_buffer.slice(..); - let mapped_debug = debug_slice.map_async(wgpu::MapMode::Read); - self.gpu_device.poll(wgpu::Maintain::Wait); - if let Ok(()) = mapped_debug.await { - let debug_data = debug_slice.get_mapped_range(); - let mut debug: Vec = debug_data - .to_vec() - .chunks_exact(4) - .map(|b| u32::from_le_bytes(b.try_into().unwrap())) - .collect(); - let partition_order: Vec = debug.drain(..3).collect(); - let borders: Vec = debug.drain(..8).collect(); - let thread_id: Vec = debug.drain(..1).collect(); - let start_end: Vec = debug.drain(..2).collect(); - println!( - " Partition Order: {:?} \n Partition borders: {:?}\n thread index: {}\n start/end: {:?}", - partition_order, borders, thread_id.first().unwrap(), start_end, - ); - drop(debug_data); - debug_buffer.unmap(); + index_buffer_staging.unmap(); } + // let debug_slice = debug_buffer.slice(..); + // let mapped_debug = debug_slice.map_async(wgpu::MapMode::Read); + // self.gpu_device.poll(wgpu::Maintain::Wait); + // if let Ok(()) = mapped_debug.await { + // let debug_data = debug_slice.get_mapped_range(); + // let mut debug: Vec = debug_data + // .to_vec() + // .chunks_exact(4) + // .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + // .collect(); + // let partition_order: Vec = debug.drain(..3).collect(); + // let borders: Vec = debug.drain(..8).collect(); + // let thread_id: Vec = debug.drain(..1).collect(); + // let start_end: Vec = debug.drain(..2).collect(); + // println!( + // " Partition 
Order: {:?} \n Partition borders: {:?}\n thread index: {}\n start/end: {:?}", + // partition_order, borders, thread_id.first().unwrap(), start_end, + // ); + // drop(debug_data); + // debug_buffer.unmap(); + // } let parents_slice = parent_nodes_buffer_staging.slice(..); let parents_future = parents_slice.map_async(wgpu::MapMode::Read); - self.gpu_device.poll(wgpu::Maintain::Wait); - //if matches!(mapped_parents.await, Ok(())) && matches!(mapped_children.await, Ok(())) { + self.gpu_device.poll(wgpu::Maintain::Wait); if let Ok(()) = parents_future.await { let mapped_nodes_data = parents_slice.get_mapped_range(); let mapped_node_buffer = mapped_nodes_data.to_vec(); let nodes: Vec = mapped_node_buffer - .chunks_exact(120) + .chunks_exact(OctreeNode::size()) .map(|b| OctreeNode::from_raw(b.to_vec())) .collect(); @@ -563,7 +539,7 @@ impl<'a> GpuOctree<'a> { let mapped_children_data = children_slice.get_mapped_range(); let mapped_children_buffer = mapped_children_data.to_vec(); let mut children: Vec = mapped_children_buffer - .chunks_exact(120) + .chunks_exact(OctreeNode::size()) .map(|b| OctreeNode::from_raw(b.to_vec())) .collect(); let mut generated_children: Vec<&mut OctreeNode> = Vec::new(); @@ -577,22 +553,18 @@ impl<'a> GpuOctree<'a> { let mut node_ref = current_nodes.remove(0); *node_ref = node; - println!("{}", node_ref); + let mut children: &mut Box<[OctreeNode; 8]> = node_ref.children.as_mut().unwrap(); let iter = children.iter_mut(); let mut child_index = 0; - //println!("Child Range: {} - {}", node_ref.point_start, node_ref.point_end); for child in iter { - //println!("Node: {}", &child); - - if children_sizes[child_index] != 0 && !child.is_leaf(self.points_per_node) - { - + if children_sizes[child_index] != 0 && !child.is_leaf(self.points_per_node) { generated_children.push(child); - } else { + } + else { num_leaves += 1; } @@ -605,17 +577,22 @@ impl<'a> GpuOctree<'a> { parent_nodes_buffer_staging.unmap(); drop(mapped_children_data); 
child_nodes_buffer_staging.unmap(); - // parent_nodes_buffer.destroy(); - // new_nodes_buffer.destroy(); + parent_nodes_buffer.destroy(); + child_nodes_buffer.destroy(); + parent_nodes_buffer_staging.destroy(); + child_nodes_buffer_staging.destroy(); } } let work_done = self.gpu_queue.on_submitted_work_done(); work_done.await; + //println!("====== PASS FINISHED ======", ); tree_depth += 1; iterations = current_nodes.len(); } - + gpu_point_buffer.destroy(); + point_index_buffer.destroy(); + index_buffer_staging.destroy(); self.root_node = Some(root_node); //println!("Root Bounds {:?}", self.root_node.as_ref().unwrap().bounds); //println!("{}", self.root_node.as_ref().unwrap()); @@ -647,11 +624,14 @@ mod tests { use tokio; + static FILE: &'static str = //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" + //"/home/jnoice/Downloads/interesting.las" + //"/home/jnoice/Downloads/45123H3316.laz" + "/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz" + ; #[tokio::test] async fn check_correct_bounds() { - let mut reader = LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" - "/home/jnoice/Downloads/interesting.las" - ); + let mut reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, Err(b) => panic!("Could not create LAS Reader"), @@ -664,7 +644,7 @@ mod tests { }; let bounds = reader.get_metadata().bounds().unwrap(); - let mut octree = GpuOctree::new(&buffer, bounds, 50).await; + let mut octree = GpuOctree::new(&buffer, bounds, 12341).await; let mut octree = match octree { Ok(a) => a, Err(b) => { @@ -678,7 +658,7 @@ mod tests { while !nodes_to_visit.is_empty() { let current_node = nodes_to_visit.remove(0); //if let None = current_node.children{ - + assert_ne!(current_node.node_partitioning, [0; 8]); let current_bounds = current_node.bounds; let point_ids = octree.get_points(¤t_node).into_iter(); let mut i = 0; @@ -712,9 +692,7 @@ mod tests { #[tokio::test] async fn check_point_count() { - let mut reader = 
LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" - "/home/jnoice/Downloads/interesting.las" - ); + let mut reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, Err(b) => panic!("Could not create LAS Reader"), @@ -756,9 +734,7 @@ mod tests { } #[tokio::test] async fn check_point_partitioning_duplicates() { - let mut reader = LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" - "/home/jnoice/Downloads/interesting.las" - ); + let mut reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, Err(b) => panic!("Could not create LAS Reader"), @@ -787,9 +763,7 @@ mod tests { } #[tokio::test] async fn check_node_overflows() { - let mut reader = LASReader::from_path(//"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" - "/home/jnoice/Downloads/interesting.las" - ); + let mut reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, Err(b) => panic!("Could not create LAS Reader"), diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp index d5e4f1c..a3298ff 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -8,13 +8,13 @@ struct Node { uint points_start; uint points_end; }; -struct Debug { - uint debug_order[3]; - uint debug_borders[8]; - uint thread_id; - uint points_start; - uint points_end; -}; +// struct Debug { +// uint debug_order[3]; +// uint debug_borders[8]; +// uint thread_id; +// uint points_start; +// uint points_end; +// }; layout(std430, set=0, binding=0) buffer ParentNodes{ Node parents[]; @@ -28,9 +28,9 @@ layout(std430, set=1, binding=0) buffer PointBuffer { layout(std430, set=1, binding=1) buffer Partitioning { uint indeces[]; }; -layout(std430, set=1, binding=2) buffer DebugBuffer { - Debug debug[]; -}; +// layout(std430, 
set=1, binding=2) buffer DebugBuffer { +// Debug debug[]; +// }; layout (local_size_x=1, local_size_y=1, local_size_z=1) in; @@ -257,10 +257,10 @@ void main() { children[idx * 8 + i].bounds_min = child_bounds[0]; children[idx* 8 + i].bounds_max = child_bounds[1]; } - debug[idx].debug_order = partition_order; - for(uint i = 0; i < 8; ++i){ - debug[idx].debug_borders[i] = parents[idx].node_partitioning[i]; - } + // debug[idx].debug_order = partition_order; + // for(uint i = 0; i < 8; ++i){ + // debug[idx].debug_borders[i] = parents[idx].node_partitioning[i]; + // } //debug[idx].points_start = parents[idx].points_start; //debug[idx].points_end = parents[idx].points_end; diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs index 55e5bf3..e87ac22 100644 --- a/pasture-tools/src/main.rs +++ b/pasture-tools/src/main.rs @@ -101,10 +101,10 @@ mod ex { let mut reader = LASReader::from_path( //"/home/jnoice/dev/pasture/pasture-io/examples/in/10_points_format_1.las", //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", - "/home/jnoice/Downloads/interesting.las", + //"/home/jnoice/Downloads/interesting.las", //"/home/jnoice/Downloads/20150930_matsch_flight2_rgb_densified_point_cloud_part_1 - Cloud.las", //"/home/jnoice/Downloads/45123H3316.laz", - //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", + "/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", //"/home/jnoice/Downloads/tirol.las", )?; let count = reader.remaining_points(); @@ -189,7 +189,7 @@ mod ex { // } let mut octree = - pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 50).await; + pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 500).await; let mut octree = match octree { Ok(a) => a, Err(b) => { From ac50ed60e30848d22d15c0b112c96b908afee9a4 Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Mon, 21 Feb 2022 17:54:08 +0100 Subject: [PATCH 09/15] Octree Node Construction --- .../src/acceleration_structures/gpu_octree.rs | 507 
+++++++++--------- .../src/acceleration_structures/mod.rs | 2 +- .../shaders/generate_nodes.comp | 7 - pasture-tools/src/main.rs | 214 -------- 4 files changed, 247 insertions(+), 483 deletions(-) delete mode 100644 pasture-tools/src/main.rs diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 1658afa..e2ccca1 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -1,20 +1,14 @@ -use pasture_core::containers::attr1::AttributeIteratorByValue; -use pasture_core::math::AABB; -use pasture_core::nalgebra::Point3; use pasture_core::{ - containers::{PointBuffer, PointBufferExt}, - layout::{attributes, PointLayout}, - nalgebra::Vector3, + containers::{attr1::AttributeIteratorByValue, PointBuffer, PointBufferExt, PerAttributePointBufferExt}, + layout::attributes, + math::AABB, + nalgebra::{Point3, Vector3}, }; -use pasture_derive::PointType; -use std::borrow::Cow; + use std::convert::TryInto; -use std::fs::File; -use wgpu::util::DeviceExt; use std::fmt; -use std::sync::Arc; -use std::thread; use std::mem; +use wgpu::util::DeviceExt; #[derive(Debug, Clone)] pub struct OctreeNode { @@ -32,24 +26,28 @@ pub struct GpuOctree<'a> { point_buffer: &'a dyn PointBuffer, point_partitioning: Vec, root_node: Option, + depth: u32, bounds: AABB, points_per_node: u32, } impl OctreeNode { + /// Get the number of bytes, a node allocates on the gpu. + /// Because the `children` pointer is not required for GPU node creation, + /// it's size is neglected. fn size() -> usize { let mut size = mem::size_of::(); size -= mem::size_of::>>(); size } + /// Checks if the given node has less than or equal to `points_per_node` points. + /// If yes, the node is a leaf. 
fn is_leaf(&self, points_per_node: u32) -> bool { let diff: i64 = self.point_end as i64 - self.point_start as i64; return diff <= points_per_node as i64; } - fn is_empty(&self) -> bool { - let diff: i64 = self.point_end as i64 - self.point_start as i64; - diff < 0 - } + /// Returns a vector of the nodes raw data. As with `size(), the field + /// `children`is not included, as it is not necessary for GPU computation. fn into_raw(&self) -> Vec { let mut raw_node: Vec = Vec::new(); for coord in self.bounds.min().iter() { @@ -81,6 +79,8 @@ impl OctreeNode { raw_node } + /// Converts a vector of raw data back into `OctreeNode`. + /// Panics, if the vector has not enough data. fn from_raw(mut data: Vec) -> Self { let raw_bounds: Vec = data.drain(..24).collect(); let bounds_iter = raw_bounds.chunks_exact(8); @@ -116,7 +116,7 @@ impl OctreeNode { points_per_partition[i] = *rest_iter.next().unwrap(); } let points_start = *rest_iter.next().unwrap(); - //rest_iter.next(); + let points_end = *rest_iter.next().unwrap(); OctreeNode { @@ -128,20 +128,26 @@ impl OctreeNode { point_end: points_end, } } + /// Checks if `pos` is within the bounds of the node. 
+ fn contains(&self, pos: Vector3) -> bool { + let point: Point3 = Point3::new(pos.x, pos.y, pos.z); + self.bounds.contains(&point) + } } impl fmt::Display for OctreeNode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f,"####### Octree Node #######\n"); + write!(f, "####### Octree Node #######\n"); write!(f, "Bounds: {:?}\n", self.bounds); write!(f, "Start: {}, End: {}\n", self.point_start, self.point_end); write!(f, "Node Partitioning: {:?}\n", self.node_partitioning); write!(f, "Points per partition: {:?}\n", self.points_per_partition); write!(f, "Chilren: "); if let Some(c) = &self.children { - c.iter().for_each(|x| {write!(f, " {}", x);}); - } - else { + c.iter().for_each(|x| { + write!(f, " {}", x); + }); + } else { write!(f, "None\n"); } write!(f, "##########\n") @@ -149,11 +155,24 @@ impl fmt::Display for OctreeNode { } impl<'a> GpuOctree<'a> { + /// Creates an empty Octree accelerated by the GPU. + /// + /// `point_buffer`: pasture buffer containing the point cloud data + /// + /// `max_bounds`: boundary of the point cloud + /// + /// `points_per_node`: threshold for a node becoming a leaf + /// + /// The generated instance has no constructed octree. To get the octree, + /// run `construct()`. 
pub async fn new( point_buffer: &'a dyn PointBuffer, max_bounds: AABB, points_per_node: u32, ) -> Result, wgpu::RequestDeviceError> { + if points_per_node < 1 { + panic!("Cannot build octree with less than 1 point per node!") + } let instance = wgpu::Instance::new(wgpu::Backends::VULKAN); let adapter = instance .request_adapter(&wgpu::RequestAdapterOptions { @@ -173,13 +192,14 @@ impl<'a> GpuOctree<'a> { None, ) .await?; - println!("GPU Adapter limits: {:?}", adapter.limits()); + Ok(GpuOctree { gpu_device: device, gpu_queue: queue, point_buffer, point_partitioning: (0..point_buffer.len() as u32).collect(), root_node: None, + depth: 0, bounds: max_bounds, points_per_node, }) @@ -188,14 +208,14 @@ impl<'a> GpuOctree<'a> { pub fn print_tree(&self) { println!("{}", self.root_node.as_ref().unwrap()); } - + /// Run top-down construction of the octree. + /// + /// Starting from the root, on each level the children of all current leaves + /// are computed and put into the next compute stage, if these children are big enough. 
pub async fn construct(&mut self) { let point_count = self.point_buffer.len(); - let mut points: Vec> = Vec::new(); - - let point_iterator: AttributeIteratorByValue, dyn PointBuffer> = - self.point_buffer.iter_attribute(&attributes::POSITION_3D); - + + // point cloud data, later uploaded to GPU let mut raw_points = vec![0u8; 24 * point_count]; self.point_buffer.get_raw_attribute_range( @@ -204,10 +224,6 @@ impl<'a> GpuOctree<'a> { raw_points.as_mut_slice(), ); - for point in point_iterator { - points.push(point); - } - let mut compiler = shaderc::Compiler::new().unwrap(); let comp_shader = include_str!("shaders/generate_nodes.comp"); let comp_spirv = compiler @@ -224,10 +240,13 @@ impl<'a> GpuOctree<'a> { let shader = self .gpu_device .create_shader_module(&wgpu::ShaderModuleDescriptor { - label: Some("ModeGenerationShader"), + label: Some("NodeGenerationShader"), source: comp_data, }); + // 2 Bind groups are used + // - points_bind_group for point cloud data and point indices + // - nodes_bind_group for parent nodes and children nodes computed by GPU let points_bind_group_layout = self.gpu_device .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { @@ -252,21 +271,11 @@ impl<'a> GpuOctree<'a> { }, count: None, }, - // wgpu::BindGroupLayoutEntry { - // binding: 2, - // visibility: wgpu::ShaderStages::COMPUTE, - // ty: wgpu::BindingType::Buffer { - // ty: wgpu::BufferBindingType::Storage { read_only: false }, - // has_dynamic_offset: false, - // min_binding_size: None, - // }, - // count: None, - // }, ], label: Some("PointBufferBindGroupLayout"), }); - let mut nodes_bind_group_layout = + let nodes_bind_group_layout = self.gpu_device .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { label: Some("NodesBindGroupLayout"), @@ -311,12 +320,13 @@ impl<'a> GpuOctree<'a> { entry_point: "main", }); - let gpu_point_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { - label: Some("PointBuffer"), - contents: 
&raw_points.as_slice(), - usage: wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }); + let gpu_point_buffer = + self.gpu_device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("PointBuffer"), + contents: &raw_points.as_slice(), + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, + }); let mut root_node = OctreeNode { bounds: self.bounds, @@ -328,52 +338,31 @@ impl<'a> GpuOctree<'a> { }; root_node.node_partitioning[0] = point_count as u32; root_node.points_per_partition[0] = point_count as u32; - let xdiff = &root_node.bounds.max().x - &root_node.bounds.min().x; - let ydiff = &root_node.bounds.max().y - &root_node.bounds.min().y; - let zdiff = &root_node.bounds.max().z - &root_node.bounds.min().z; - println!("Point count: {}", point_count); - println!("xdiff {}", xdiff); - println!("ydiff {}", ydiff); - println!("zdiff {}", zdiff); - let xpartition = &root_node.bounds.min().x + 0.5 * xdiff; - let ypartition = &root_node.bounds.min().y + 0.5 * ydiff; - let zpartition = &root_node.bounds.min().z + 0.5 * zdiff; - println!("x_partition {}", xpartition); - println!("y_partition {}", ypartition); - println!("z_partition {}", zpartition); let mut tree_depth = 1; let mut num_leaves: u32 = 0; let mut num_nodes: u32 = 1; let mut current_nodes = vec![&mut root_node]; - let mut children_nodes: Vec> = Vec::new(); - let mut raw_indeces: Vec = (0u32..point_count as u32) + let raw_indeces: Vec = (0u32..point_count as u32) .flat_map(|x| x.to_le_bytes().to_vec()) .collect(); - // let debug_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - // label: Some("DebugBuffer"), - // size: (3 * 4 + 8 * 4 + 4 + 2 * 4) as u64, - // usage: wgpu::BufferUsages::MAP_READ - // | wgpu::BufferUsages::STORAGE, - // mapped_at_creation: false, - // }); - - let point_index_buffer = self.gpu_device.create_buffer_init(&wgpu::util::BufferInitDescriptor { - label: Some("IndexBuffer"), - contents: &raw_indeces.as_slice(), - usage: 
wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }); + let point_index_buffer = + self.gpu_device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("IndexBuffer"), + contents: &raw_indeces.as_slice(), + usage: wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }); let index_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: Some("CPU_IndexBuffer"), size: raw_indeces.len() as u64, - usage: wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::MAP_READ, - mapped_at_creation: false, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, }); let points_bind_group = self .gpu_device @@ -389,27 +378,22 @@ impl<'a> GpuOctree<'a> { binding: 1, resource: point_index_buffer.as_entire_binding(), }, - // wgpu::BindGroupEntry { - // binding: 2, - // resource: debug_buffer.as_entire_binding(), - // }, ], }); - let mut iterations = current_nodes.len(); - while !current_nodes.is_empty() { - - let child_buffer_size = 8 * (OctreeNode::size() * current_nodes.len()) as u64; //8 * 120 * current_nodes.len() as u64; - let child_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - label: Some("CPU_NewNodesBuffer"), - size: child_buffer_size, - usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }); + // Nodes buffers are created inside the loop, as their size changes per iteration + let child_buffer_size = 8 * (OctreeNode::size() * current_nodes.len()) as u64; + let child_nodes_buffer_staging = + self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("CPU_NewNodesBuffer"), + size: child_buffer_size, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); let child_nodes_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: Some("NewNodesBuffer"), - 
size: //(mem::size_of::() - mem::size_of::>()) as u64 + size: child_buffer_size, usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST @@ -421,22 +405,22 @@ impl<'a> GpuOctree<'a> { for node in ¤t_nodes { parent_nodes_raw.append(&mut node.into_raw()); } - let parent_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - label: Some("CPU_ParentNodesBuffer"), - size: parent_nodes_raw.len() as u64, - usage: wgpu::BufferUsages::MAP_READ - | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }); - let parent_nodes_buffer = self.gpu_device.create_buffer_init( - &wgpu::util::BufferInitDescriptor { - label: Some("ParentNodesBuffer"), - contents: parent_nodes_raw.as_slice(), - usage: wgpu::BufferUsages::COPY_SRC - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::STORAGE, - }, - ); + let parent_nodes_buffer_staging = + self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("CPU_ParentNodesBuffer"), + size: parent_nodes_raw.len() as u64, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + let parent_nodes_buffer = + self.gpu_device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("ParentNodesBuffer"), + contents: parent_nodes_raw.as_slice(), + usage: wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + }); let nodes_bind_group = self .gpu_device .create_bind_group(&wgpu::BindGroupDescriptor { @@ -470,9 +454,27 @@ impl<'a> GpuOctree<'a> { compute_pass.insert_debug_marker("Pasture Compute Debug"); compute_pass.dispatch(current_nodes.len() as u32, 1, 1); } - encoder.copy_buffer_to_buffer(&child_nodes_buffer, 0, &child_nodes_buffer_staging, 0, child_buffer_size); - encoder.copy_buffer_to_buffer(&parent_nodes_buffer, 0, &parent_nodes_buffer_staging, 0, parent_nodes_raw.len() as u64); - encoder.copy_buffer_to_buffer(&point_index_buffer, 0, &index_buffer_staging, 0, raw_indeces.len() as u64); + 
encoder.copy_buffer_to_buffer( + &child_nodes_buffer, + 0, + &child_nodes_buffer_staging, + 0, + child_buffer_size, + ); + encoder.copy_buffer_to_buffer( + &parent_nodes_buffer, + 0, + &parent_nodes_buffer_staging, + 0, + parent_nodes_raw.len() as u64, + ); + encoder.copy_buffer_to_buffer( + &point_index_buffer, + 0, + &index_buffer_staging, + 0, + raw_indeces.len() as u64, + ); self.gpu_queue.submit(Some(encoder.finish())); @@ -480,44 +482,21 @@ impl<'a> GpuOctree<'a> { let mapped_future = index_slice.map_async(wgpu::MapMode::Read); self.gpu_device.poll(wgpu::Maintain::Wait); - + // Read in the changes of the global point partitioning if let Ok(()) = mapped_future.await { let mapped_index_buffer = index_slice.get_mapped_range(); let index_vec = mapped_index_buffer.to_vec(); - let mut indices: Vec = index_vec + let indices: Vec = index_vec .chunks_exact(4) .map(|b| u32::from_le_bytes(b.try_into().unwrap())) .collect(); self.point_partitioning = indices.clone(); - raw_indeces = index_vec.clone(); drop(mapped_index_buffer); index_buffer_staging.unmap(); } - // let debug_slice = debug_buffer.slice(..); - // let mapped_debug = debug_slice.map_async(wgpu::MapMode::Read); - // self.gpu_device.poll(wgpu::Maintain::Wait); - // if let Ok(()) = mapped_debug.await { - // let debug_data = debug_slice.get_mapped_range(); - // let mut debug: Vec = debug_data - // .to_vec() - // .chunks_exact(4) - // .map(|b| u32::from_le_bytes(b.try_into().unwrap())) - // .collect(); - // let partition_order: Vec = debug.drain(..3).collect(); - // let borders: Vec = debug.drain(..8).collect(); - // let thread_id: Vec = debug.drain(..1).collect(); - // let start_end: Vec = debug.drain(..2).collect(); - // println!( - // " Partition Order: {:?} \n Partition borders: {:?}\n thread index: {}\n start/end: {:?}", - // partition_order, borders, thread_id.first().unwrap(), start_end, - // ); - // drop(debug_data); - // debug_buffer.unmap(); - // } - let parents_slice = 
parent_nodes_buffer_staging.slice(..); let parents_future = parents_slice.map_async(wgpu::MapMode::Read); @@ -535,116 +514,123 @@ impl<'a> GpuOctree<'a> { self.gpu_device.poll(wgpu::Maintain::Wait); if let Ok(()) = children_future.await { - - let mapped_children_data = children_slice.get_mapped_range(); - let mapped_children_buffer = mapped_children_data.to_vec(); - let mut children: Vec = mapped_children_buffer - .chunks_exact(OctreeNode::size()) - .map(|b| OctreeNode::from_raw(b.to_vec())) - .collect(); - let mut generated_children: Vec<&mut OctreeNode> = Vec::new(); - for mut node in nodes { - let children_sizes = node.points_per_partition.clone(); - - let mut local_children: Vec = children.drain(..8).collect(); - - let child_array: [OctreeNode; 8] = local_children.try_into().unwrap(); - node.children = Some(Box::new(child_array)); - - let mut node_ref = current_nodes.remove(0); - *node_ref = node; - - let mut children: &mut Box<[OctreeNode; 8]> = node_ref.children.as_mut().unwrap(); - - let iter = children.iter_mut(); - - let mut child_index = 0; - - for child in iter { - if children_sizes[child_index] != 0 && !child.is_leaf(self.points_per_node) { - generated_children.push(child); - } - else { - num_leaves += 1; + let mapped_children_data = children_slice.get_mapped_range(); + let mapped_children_buffer = mapped_children_data.to_vec(); + let mut children: Vec = mapped_children_buffer + .chunks_exact(OctreeNode::size()) + .map(|b| OctreeNode::from_raw(b.to_vec())) + .collect(); + let mut generated_children: Vec<&mut OctreeNode> = Vec::new(); + for mut node in nodes { + let children_sizes = node.points_per_partition.clone(); + + let local_children: Vec = children.drain(..8).collect(); + + let child_array: [OctreeNode; 8] = local_children.try_into().unwrap(); + node.children = Some(Box::new(child_array)); + + let node_ref = current_nodes.remove(0); + *node_ref = node; + + let children: &mut Box<[OctreeNode; 8]> = + node_ref.children.as_mut().unwrap(); + + let 
iter = children.iter_mut(); + + let mut child_index = 0; + + for child in iter { + if children_sizes[child_index] != 0 + && !child.is_leaf(self.points_per_node) + { + generated_children.push(child); + } else { + num_leaves += 1; + } + + num_nodes += 1; + child_index += 1; } - - num_nodes += 1; - child_index += 1; } + current_nodes.append(&mut generated_children); + drop(mapped_nodes_data); + parent_nodes_buffer_staging.unmap(); + drop(mapped_children_data); + child_nodes_buffer_staging.unmap(); + parent_nodes_buffer.destroy(); + child_nodes_buffer.destroy(); + parent_nodes_buffer_staging.destroy(); + child_nodes_buffer_staging.destroy(); } - current_nodes.append(&mut generated_children); - drop(mapped_nodes_data); - parent_nodes_buffer_staging.unmap(); - drop(mapped_children_data); - child_nodes_buffer_staging.unmap(); - parent_nodes_buffer.destroy(); - child_nodes_buffer.destroy(); - parent_nodes_buffer_staging.destroy(); - child_nodes_buffer_staging.destroy(); - } } let work_done = self.gpu_queue.on_submitted_work_done(); work_done.await; - //println!("====== PASS FINISHED ======", ); - tree_depth += 1; - iterations = current_nodes.len(); + tree_depth += 1; } + gpu_point_buffer.destroy(); point_index_buffer.destroy(); index_buffer_staging.destroy(); self.root_node = Some(root_node); - //println!("Root Bounds {:?}", self.root_node.as_ref().unwrap().bounds); - //println!("{}", self.root_node.as_ref().unwrap()); - - //println!("{:?}", self.point_partitioning.len()); - //println!("{:?}",a); + self.depth = tree_depth; } fn get_points(&self, node: &OctreeNode) -> Vec { - let indices = self.point_partitioning[node.point_start as usize..node.point_end as usize].to_vec(); + let indices = + self.point_partitioning[node.point_start as usize..node.point_end as usize].to_vec(); return indices; } + + fn deepest_octant(&self, node: &'a OctreeNode, pos: Vector3) -> &'a OctreeNode { + if let Some(children) = node.children.as_ref() { + for child in children.iter() { + if 
!child.is_leaf(self.points_per_node) && child.contains(pos) { + return self.deepest_octant(child, pos); + } + } + } + node + } + } #[cfg(test)] mod tests { + use crate::acceleration_structures::GpuOctree; + use crate::acceleration_structures::gpu_octree::OctreeNode; use pasture_core::containers::InterleavedVecPointStorage; use pasture_core::containers::PointBufferExt; - use pasture_io::base::PointReader; - use pasture_io::las::LasPointFormat0; - use pasture_io::las::LASReader; use pasture_core::layout::PointType; - use crate::acceleration_structures::GpuOctree; - use crate::acceleration_structures::OctreeNode; use pasture_core::nalgebra::Vector3; - use pasture_core::layout::attributes; - use std::convert::TryInto; - use std::error::Error; + use pasture_io::base::PointReader; + use pasture_io::las::LASReader; + use pasture_io::las::LasPointFormat0; use tokio; static FILE: &'static str = //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" - //"/home/jnoice/Downloads/interesting.las" + "/home/jnoice/Downloads/interesting.las" //"/home/jnoice/Downloads/45123H3316.laz" - "/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz" + //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz" ; #[tokio::test] async fn check_correct_bounds() { - let mut reader = LASReader::from_path(FILE); + let reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, - Err(b) => panic!("Could not create LAS Reader"), + Err(_) => panic!("Could not create LAS Reader"), }; let count = reader.remaining_points(); - let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); - let data_read = match reader.read_into(&mut buffer, count) { + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut buffer, count) { Ok(a) => a, - Err(b) => panic!("Could not write Point Buffer"), + Err(_) => panic!("Could not write Point Buffer"), }; let bounds = 
reader.get_metadata().bounds().unwrap(); - let mut octree = GpuOctree::new(&buffer, bounds, 12341).await; + let octree = GpuOctree::new(&buffer, bounds, 75).await; let mut octree = match octree { Ok(a) => a, Err(b) => { @@ -653,59 +639,54 @@ mod tests { } }; octree.construct().await; - let mut node = octree.root_node.as_ref().unwrap(); + let node = octree.root_node.as_ref().unwrap(); let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; while !nodes_to_visit.is_empty() { let current_node = nodes_to_visit.remove(0); - //if let None = current_node.children{ - assert_ne!(current_node.node_partitioning, [0; 8]); - let current_bounds = current_node.bounds; - let point_ids = octree.get_points(¤t_node).into_iter(); - let mut i = 0; - let current_start = current_node.point_start; - for id in point_ids { - let point = buffer.get_point::(id as usize); - let pos: Vector3 = Vector3::from(point.position); - println!("Bounds: {:?}", current_bounds); - // println!("Start: {}, End {}", current_node.point_start, current_node.point_end); - // println!("Node Partitioning {:?}", current_node.node_partitioning); - println!("Point: {:?}, id: {} in [{}, {}]", pos,current_start + i, current_node.point_start, current_node.point_end-1); - //println!("{:?}", current_node); - // current_node.children.as_ref().unwrap().iter().for_each(|x| println!("{:?}", x)); - assert!(current_bounds.min().x <= pos.x + println!("Partition {:?}", current_node.node_partitioning); + assert!((current_node.point_start == 0 && + current_node.point_end == 0 && + current_node.node_partitioning == [0; 8]) || + current_node.node_partitioning != [0; 8]); + let current_bounds = current_node.bounds; + let point_ids = octree.get_points(¤t_node).into_iter(); + for id in point_ids { + let point = buffer.get_point::(id as usize); + let pos: Vector3 = Vector3::from(point.position); + + assert!( + current_bounds.min().x <= pos.x && current_bounds.max().x >= pos.x && current_bounds.min().y <= pos.y && current_bounds.max().y >= 
pos.y && current_bounds.min().z <= pos.z - && current_bounds.max().z >= pos.z); - i+=1; - } - //} - //else { - if let Some(children) = current_node.children.as_ref() { - //let children = current_node.children.as_ref().unwrap(); + && current_bounds.max().z >= pos.z + ); + i += 1; + } + if let Some(children) = current_node.children.as_ref() { (*children).iter().for_each(|x| nodes_to_visit.push(x)); - } - //} + } } } #[tokio::test] async fn check_point_count() { - let mut reader = LASReader::from_path(FILE); + let reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, - Err(b) => panic!("Could not create LAS Reader"), + Err(_) => panic!("Could not create LAS Reader"), }; let count = reader.remaining_points(); - let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); - let data_read = match reader.read_into(&mut buffer, count) { + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut buffer, count) { Ok(a) => a, - Err(b) => panic!("Could not write Point Buffer"), + Err(_) => panic!("Could not write Point Buffer"), }; let bounds = reader.get_metadata().bounds().unwrap(); - let mut octree = GpuOctree::new(&buffer, bounds, 50).await; + let octree = GpuOctree::new(&buffer, bounds, 50).await; let mut octree = match octree { Ok(a) => a, Err(b) => { @@ -714,40 +695,42 @@ mod tests { } }; octree.construct().await; - let mut node = octree.root_node.as_ref().unwrap(); + let node = octree.root_node.as_ref().unwrap(); let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; let mut point_count: usize = 0; while !nodes_to_visit.is_empty() { let current_node = nodes_to_visit.pop().unwrap(); if let None = current_node.children { println!("{}", current_node); - //println!("{:?}", current_node.points_per_partition); point_count += current_node.points_per_partition[0] as usize; - } - else { + } else { let children = 
current_node.children.as_ref().unwrap(); (*children).iter().for_each(|x| nodes_to_visit.push(x)); } } - println!("Point count of octree: {}, Point Count of Buffer {}", point_count, count); + println!( + "Point count of octree: {}, Point Count of Buffer {}", + point_count, count + ); assert!(point_count == count); } #[tokio::test] async fn check_point_partitioning_duplicates() { - let mut reader = LASReader::from_path(FILE); + let reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, - Err(b) => panic!("Could not create LAS Reader"), + Err(_) => panic!("Could not create LAS Reader"), }; let count = reader.remaining_points(); - let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); - let data_read = match reader.read_into(&mut buffer, count) { + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut buffer, count) { Ok(a) => a, - Err(b) => panic!("Could not write Point Buffer"), + Err(_) => panic!("Could not write Point Buffer"), }; let bounds = reader.get_metadata().bounds().unwrap(); - let mut octree = GpuOctree::new(&buffer, bounds, 50).await; + let octree = GpuOctree::new(&buffer, bounds, 50).await; let mut octree = match octree { Ok(a) => a, Err(b) => { @@ -763,20 +746,21 @@ mod tests { } #[tokio::test] async fn check_node_overflows() { - let mut reader = LASReader::from_path(FILE); + let reader = LASReader::from_path(FILE); let mut reader = match reader { Ok(a) => a, - Err(b) => panic!("Could not create LAS Reader"), + Err(_) => panic!("Could not create LAS Reader"), }; let count = reader.remaining_points(); - let mut buffer = InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); - let data_read = match reader.read_into(&mut buffer, count) { + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut 
buffer, count) { Ok(a) => a, - Err(b) => panic!("Could not write Point Buffer"), + Err(_) => panic!("Could not write Point Buffer"), }; let bounds = reader.get_metadata().bounds().unwrap(); - let mut octree = GpuOctree::new(&buffer, bounds, 50).await; + let octree = GpuOctree::new(&buffer, bounds, 50).await; let mut octree = match octree { Ok(a) => a, Err(b) => { @@ -785,7 +769,7 @@ mod tests { } }; octree.construct().await; - let mut node = octree.root_node.as_ref().unwrap(); + let node = octree.root_node.as_ref().unwrap(); let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; while !nodes_to_visit.is_empty() { let current_node = nodes_to_visit.pop().unwrap(); @@ -795,4 +779,5 @@ mod tests { } } } + } diff --git a/pasture-tools/src/acceleration_structures/mod.rs b/pasture-tools/src/acceleration_structures/mod.rs index a30ce70..6181ef8 100644 --- a/pasture-tools/src/acceleration_structures/mod.rs +++ b/pasture-tools/src/acceleration_structures/mod.rs @@ -1,2 +1,2 @@ mod gpu_octree; -pub use self::gpu_octree::*; +pub use gpu_octree::GpuOctree; diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp index a3298ff..c10d6be 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -8,13 +8,6 @@ struct Node { uint points_start; uint points_end; }; -// struct Debug { -// uint debug_order[3]; -// uint debug_borders[8]; -// uint thread_id; -// uint points_start; -// uint points_end; -// }; layout(std430, set=0, binding=0) buffer ParentNodes{ Node parents[]; diff --git a/pasture-tools/src/main.rs b/pasture-tools/src/main.rs deleted file mode 100644 index e87ac22..0000000 --- a/pasture-tools/src/main.rs +++ /dev/null @@ -1,214 +0,0 @@ -#[macro_use] -extern crate log; - -mod ex { - - use pasture_core::containers::InterleavedVecPointStorage; - use 
pasture_core::layout::PointType; - use pasture_core::nalgebra::Vector3; - use pasture_derive::PointType; - use pasture_io::base::PointReader; - use pasture_io::las::LASReader; - use pasture_io::las::LasPointFormat0; - - use anyhow::Result; - #[repr(C)] - #[derive(PointType, Debug)] - struct MyPointType { - #[pasture(BUILTIN_POSITION_3D)] - pub position: Vector3, - #[pasture(BUILTIN_COLOR_RGB)] - pub icolor: Vector3, - #[pasture(attribute = "MyColorF32")] - pub fcolor: Vector3, - #[pasture(attribute = "MyVec3U8")] - pub byte_vec: Vector3, - #[pasture(BUILTIN_CLASSIFICATION)] - pub classification: u8, - #[pasture(BUILTIN_INTENSITY)] - pub intensity: u16, - #[pasture(BUILTIN_SCAN_ANGLE)] - pub scan_angle: i16, - #[pasture(BUILTIN_SCAN_DIRECTION_FLAG)] - pub scan_dir_flag: bool, - #[pasture(attribute = "MyInt32")] - pub my_int: i32, - #[pasture(BUILTIN_WAVEFORM_PACKET_SIZE)] - pub packet_size: u32, - #[pasture(BUILTIN_RETURN_POINT_WAVEFORM_LOCATION)] - pub ret_point_loc: f32, - #[pasture(BUILTIN_GPS_TIME)] - pub gps_time: f64, - } - - pub fn main() { - futures::executor::block_on(run()); - } - - async fn run() -> Result<()> { - // == Init point buffer ====================================================================== - env_logger::init(); - info!("starting up"); - let points = vec![ - MyPointType { - position: Vector3::new(1.0, 0.0, 0.0), - icolor: Vector3::new(255, 0, 0), - fcolor: Vector3::new(1.0, 1.0, 1.0), - byte_vec: Vector3::new(1, 0, 0), - classification: 1, - intensity: 1, - scan_angle: -1, - scan_dir_flag: true, - my_int: -100000, - packet_size: 1, - ret_point_loc: 1.0, - gps_time: 1.0, - }, - MyPointType { - position: Vector3::new(0.0, 1.0, 0.0), - icolor: Vector3::new(0, 255, 0), - fcolor: Vector3::new(0.0, 1.0, 0.0), - byte_vec: Vector3::new(0, 1, 0), - classification: 2, - intensity: 2, - scan_angle: -2, - scan_dir_flag: false, - my_int: -200000, - packet_size: 2, - ret_point_loc: 2.0, - gps_time: 2.0, - }, - MyPointType { - position: 
Vector3::new(0.0, 0.0, 1.0), - icolor: Vector3::new(0, 0, 255), - fcolor: Vector3::new(0.0, 0.0, 1.0), - byte_vec: Vector3::new(0, 0, 1), - classification: 3, - intensity: 3, - scan_angle: -3, - scan_dir_flag: true, - my_int: -300000, - packet_size: 3, - ret_point_loc: 3.0, - gps_time: 3.0, - }, - ]; - - let layout = MyPointType::layout(); - let mut point_buffer = InterleavedVecPointStorage::new(layout); - point_buffer.push_points(points.as_slice()); - - let mut reader = LASReader::from_path( - //"/home/jnoice/dev/pasture/pasture-io/examples/in/10_points_format_1.las", - //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz", - //"/home/jnoice/Downloads/interesting.las", - //"/home/jnoice/Downloads/20150930_matsch_flight2_rgb_densified_point_cloud_part_1 - Cloud.las", - //"/home/jnoice/Downloads/45123H3316.laz", - "/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz", - //"/home/jnoice/Downloads/tirol.las", - )?; - let count = reader.remaining_points(); - let mut buffer = - InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); - reader.read_into(&mut buffer, count)?; - - let bounds = reader.get_metadata().bounds().unwrap(); - - // let device = gpu::Device::new(gpu::DeviceOptions { - // device_power: gpu::DevicePower::High, - // device_backend: gpu::DeviceBackend::Vulkan, - // use_adapter_features: true, - // use_adapter_limits: true, - // }) - // .await; - // - // let mut device = match device { - // Ok(d) => d, - // Err(_) => { - // println!("Failed to request device. 
Aborting."); - // return Ok(()); - // } - // }; - // - // device.print_device_info(); - // device.print_active_features(); - // device.print_active_limits(); - // println!("\n"); - // - // let attribs = &[attributes::POSITION_3D]; - // - // let buffer_info_interleaved = gpu::BufferInfoInterleaved { - // attributes: attribs, - // binding: 0, - // }; - // - // let mut gpu_point_buffer = GpuPointBufferInterleaved::new(); - // gpu_point_buffer.malloc( - // count as u64, - // &buffer_info_interleaved, - // &mut device.wgpu_device, - // ); - // gpu_point_buffer.upload( - // &buffer, - // 0..buffer.len(), - // &buffer_info_interleaved, - // &mut device.wgpu_device, - // &device.wgpu_queue, - // ); - // - // device.set_bind_group( - // 0, - // gpu_point_buffer.bind_group_layout.as_ref().unwrap(), - // gpu_point_buffer.bind_group.as_ref().unwrap(), - // ); - // device.set_compute_shader_glsl(include_str!( - // "acceleration_structures/shaders/interleaved.comp" - // )); - // device.compute(1, 1, 1); - // - // println!("\n===== COMPUTE =====\n"); - // - // println!("Before:"); - // for point in point_buffer.iter_point::().take(5) { - // println!("{:?}", point); - // } - // println!(); - // - // gpu_point_buffer - // .download_into_interleaved( - // &mut buffer, - // 0..count, - // &buffer_info_interleaved, - // &device.wgpu_device, - // ) - // .await; - // - // println!("After:"); - // for point in point_buffer.iter_point::().take(5) { - // println!("{:?}", point); - // } - - let mut octree = - pasture_tools::acceleration_structures::GpuOctree::new(&buffer, bounds, 500).await; - let mut octree = match octree { - Ok(a) => a, - Err(b) => { - println!("{:?}", b); - return Ok(()); - } - }; - - octree.construct().await; - Ok(()) - } -} - -#[cfg(feature = "gpu")] -fn main() { - ex::main(); -} - -#[cfg(not(feature = "gpu"))] -fn main() { - println!("Whoops"); -} From 913815fb5b4bcd20320518b9d463003e8d8a3539 Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Mon, 21 Feb 2022 19:09:35 
+0100 Subject: [PATCH 10/15] Removed left over expression --- pasture-tools/src/acceleration_structures/gpu_octree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index e2ccca1..a80f16e 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -662,7 +662,7 @@ mod tests { && current_bounds.min().z <= pos.z && current_bounds.max().z >= pos.z ); - i += 1; + } if let Some(children) = current_node.children.as_ref() { (*children).iter().for_each(|x| nodes_to_visit.push(x)); From 8353ffe86d80f8eb954bd64da931e2ff661a6c71 Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Mon, 21 Feb 2022 19:14:46 +0100 Subject: [PATCH 11/15] Updated Upstream wgpu version --- pasture-core/Cargo.toml | 2 +- pasture-tools/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pasture-core/Cargo.toml b/pasture-core/Cargo.toml index b07b68b..db75f22 100644 --- a/pasture-core/Cargo.toml +++ b/pasture-core/Cargo.toml @@ -24,7 +24,7 @@ itertools = "0.10.0" byteorder = "1.4.2" # GPU related -wgpu = { version = "0.11.0", features = ["spirv"], optional = true } +wgpu = { version = "0.12.0", features = ["spirv"], optional = true } shaderc = { version = "0.7.2", optional = true } futures = { version = "0.3", optional = true } bytemuck = { version = "1.5.1", optional = true } diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index 15de7f1..9481922 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -26,7 +26,7 @@ rand = {version = "0.8.3", features = ["small_rng"] } tokio = { version = "1.16.1", features = ["full"] } #gpu related -wgpu = { version = "0.11.0", features = ["spirv"], optional = true } +wgpu = { version = "0.12.0", features = ["spirv"], optional = true } shaderc = { version = "0.7.2", optional = true } futures = { version = 
"0.3", optional = true } bytemuck = { version = "1.5.1", optional = true } From 4de54e3f7cd7152507bdaddeab58293ce7dffa68 Mon Sep 17 00:00:00 2001 From: Jannis Neus Date: Wed, 23 Feb 2022 14:39:54 +0100 Subject: [PATCH 12/15] Added single NNS --- .../src/acceleration_structures/gpu_octree.rs | 111 ++++++++++++++++-- .../shaders/find_max_values.comp | 35 ------ .../shaders/generate_nodes.comp | 109 +---------------- 3 files changed, 106 insertions(+), 149 deletions(-) delete mode 100644 pasture-tools/src/acceleration_structures/shaders/find_max_values.comp diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index a80f16e..2620f7d 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -8,7 +8,9 @@ use pasture_core::{ use std::convert::TryInto; use std::fmt; use std::mem; +use std::time; use wgpu::util::DeviceExt; +use bitvec::prelude::*; #[derive(Debug, Clone)] pub struct OctreeNode { @@ -46,6 +48,9 @@ impl OctreeNode { let diff: i64 = self.point_end as i64 - self.point_start as i64; return diff <= points_per_node as i64; } + fn is_empty(&self) -> bool { + self.point_start == self.point_end && self.points_per_partition[0] == 0 + } /// Returns a vector of the nodes raw data. As with `size(), the field /// `children`is not included, as it is not necessary for GPU computation. fn into_raw(&self) -> Vec { @@ -129,9 +134,14 @@ impl OctreeNode { } } /// Checks if `pos` is within the bounds of the node. 
- fn contains(&self, pos: Vector3) -> bool { - let point: Point3 = Point3::new(pos.x, pos.y, pos.z); - self.bounds.contains(&point) + fn contains(&self, pos: Point3) -> bool { + self.bounds.contains(&pos) + } + + fn get_closest_child(pos: Point3) -> u32 { + let child_id = 0; + + child_id } } @@ -206,6 +216,8 @@ impl<'a> GpuOctree<'a> { } pub fn print_tree(&self) { + println!("Num Points: {}", self.point_buffer.len()); + println!("Tree Depth: {}", self.depth); println!("{}", self.root_node.as_ref().unwrap()); } /// Run top-down construction of the octree. @@ -217,13 +229,16 @@ impl<'a> GpuOctree<'a> { // point cloud data, later uploaded to GPU let mut raw_points = vec![0u8; 24 * point_count]; - + let now = time::Instant::now(); self.point_buffer.get_raw_attribute_range( 0..point_count, &attributes::POSITION_3D, raw_points.as_mut_slice(), ); + let elapsed = now.elapsed(); + println!("Octree - Getting raw point data took {} ms", elapsed.as_millis()); + let now = time::Instant::now(); let mut compiler = shaderc::Compiler::new().unwrap(); let comp_shader = include_str!("shaders/generate_nodes.comp"); let comp_spirv = compiler @@ -380,8 +395,11 @@ impl<'a> GpuOctree<'a> { }, ], }); - + let elapsed = now.elapsed(); + println!("Octree - GPU prep took {} ms", elapsed.as_millis()); + let now_compute = time::Instant::now(); while !current_nodes.is_empty() { + let now = time::Instant::now(); // Nodes buffers are created inside the loop, as their size changes per iteration let child_buffer_size = 8 * (OctreeNode::size() * current_nodes.len()) as u64; let child_nodes_buffer_staging = @@ -567,8 +585,11 @@ impl<'a> GpuOctree<'a> { work_done.await; tree_depth += 1; + let elapsed = now.elapsed(); + println!("Octree - Compute Pass took {} ms", elapsed.as_millis()); } - + let elapsed = now_compute.elapsed(); + println!("Octree - Whole Computation loop took {} ms", elapsed.as_millis()); gpu_point_buffer.destroy(); point_index_buffer.destroy(); index_buffer_staging.destroy(); @@ -582,16 
+603,88 @@ impl<'a> GpuOctree<'a> { return indices; } - fn deepest_octant(&self, node: &'a OctreeNode, pos: Vector3) -> &'a OctreeNode { + fn deepest_octant(&self, node: &'a OctreeNode, pos: Point3, max_distance: f64) -> &'a OctreeNode { if let Some(children) = node.children.as_ref() { for child in children.iter() { - if !child.is_leaf(self.points_per_node) && child.contains(pos) { - return self.deepest_octant(child, pos); + let bounds_extent = child.bounds.extent(); + if !child.is_leaf(self.points_per_node) + && child.contains(pos) + && bounds_extent.x >= max_distance * 2.0 + && bounds_extent.y >= max_distance * 2.0 + && bounds_extent.z >= max_distance * 2.0 + { + return self.deepest_octant(child, pos, max_distance); } } } node } + fn nearest_neighbor_helper(&self, pos: &Point3, dist: f64, node: &OctreeNode) -> Option { + let mut axes: Vector3 = 0.5 * (node.bounds.max() - node.bounds.min()); + axes += Vector3::new(node.bounds.min().x, node.bounds.min().y, node.bounds.min().z); + let pos_vector = Vector3::new(pos.x, pos.y, pos.z); + let mut nearest_index: Option = None; + let mut shortest_distance = f64::MAX; + if let Some(children) = node.children.as_ref() { + // Sort children according to proximity to pos + let mut sorted_children: Vec<&OctreeNode> = Vec::new(); + for i in 0..8 { + let mut k = 0; + let mut center = 0.5 * (children[i].bounds.max() - children[i].bounds.min()); + center += Vector3::new(children[i].bounds.min().x, children[i].bounds.min().y, children[i].bounds.min().z); + let curr_dist = center.metric_distance(&pos_vector); + for c in sorted_children.iter(){ + let sorted_center = 0.5 * (c.bounds.max() - c.bounds.min()) + Vector3::new(c.bounds.min().x, c.bounds.min().y, c.bounds.min().z); + let sorted_dist = sorted_center.metric_distance(&pos_vector); + if sorted_dist < curr_dist { + k += 1; + } + } + sorted_children.insert(k, &children[i]); + + } + for c in sorted_children.iter(){ + if let None = nearest_index { + nearest_index = 
self.nearest_neighbor_helper(pos, dist, c); + if let Some(index) = nearest_index { + let point = self.point_buffer.get_attribute::>(&attributes::POSITION_3D, index as usize); + shortest_distance = point.metric_distance(&pos_vector); + } + } + else { + let current_nearest = self.nearest_neighbor_helper(pos, dist, c); + if let Some(index) = current_nearest { + let point = self.point_buffer.get_attribute::>(&attributes::POSITION_3D, current_nearest.unwrap() as usize); + let curr_dist = point.metric_distance(&pos_vector); + if curr_dist < shortest_distance { + nearest_index = current_nearest; + shortest_distance = curr_dist; + } + } + } + } + + } + else if !node.is_empty(){ + let point_indices = self.get_points(node); + for i in point_indices.iter() { + let point = self.point_buffer.get_attribute::>(&attributes::POSITION_3D, *i as usize); + let curr_dist = point.metric_distance(&pos_vector); + + if curr_dist <= dist && curr_dist < shortest_distance { + shortest_distance = curr_dist; + nearest_index = Some(i.clone()) + } + } + } + nearest_index + } + + pub fn nearest_neighbor(&self, pos: Point3, max_distance: f64) -> Option { + let node = self.deepest_octant(self.root_node.as_ref().unwrap(), pos, max_distance); + let neighbor = self.nearest_neighbor_helper(&pos, max_distance, &node); + neighbor + } } diff --git a/pasture-tools/src/acceleration_structures/shaders/find_max_values.comp b/pasture-tools/src/acceleration_structures/shaders/find_max_values.comp deleted file mode 100644 index aeb7c8e..0000000 --- a/pasture-tools/src/acceleration_structures/shaders/find_max_values.comp +++ /dev/null @@ -1,35 +0,0 @@ -#version 450 - -layout(std430, set=0, binding=0) buffer PointCloud { - dvec3 pointBuffer[]; -}; - -layout(std430, set=0, binding=1) buffer ResultBuffer { - uint resultBuffer; -}; - -shared uint largest_x; -shared uint largest_y; -shared uint largest_z; - -void main() { - uint idx = gl_LocalInvocationID.x; - - if(abs(pointBuffer[idx].x) > 
abs(pointBuffer[largest_x].x)) { - // atomicExchange(largest_x, idx); - largest_x = idx; - } - if(abs(pointBuffer[idx].y) > abs(pointBuffer[largest_y].y)) { - //atomicExchange(largest_y, idx); - largest_y = idx; - } - if(abs(pointBuffer[idx].z) > abs(pointBuffer[largest_z].z)) { - atomicExchange(largest_z, idx); - largest_z = idx; - } - - barrier(); - if(idx == 0){ - resultBuffer = largest_x; - } -} diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp index c10d6be..d227091 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -21,9 +21,6 @@ layout(std430, set=1, binding=0) buffer PointBuffer { layout(std430, set=1, binding=1) buffer Partitioning { uint indeces[]; }; -// layout(std430, set=1, binding=2) buffer DebugBuffer { -// Debug debug[]; -// }; layout (local_size_x=1, local_size_y=1, local_size_z=1) in; @@ -74,79 +71,14 @@ uint partition_run(uint start, uint end, double pivot, uint axis) { void partitioning(uint[3] axes, double[3] pivots, uint id){ uint start = parents[id].points_start; uint end = parents[id].node_partitioning[0]; - // for(uint k = 0; k < 8; ++k) { - // if(parents[id].node_partitioning[k] != 0 && k < 4 && start < parents[id].points_end){ - // uint end = parents[id].node_partitioning[k]; - // end = end > parents[id].points_start ? end - 1 : end; - // while(start <= end) { - // - // debug[id].points_start = start; - // debug[id].points_end = end; - // if(points[indeces[start]][axis] <= threshold){ - // ++start; - // } - // else if(points[indeces[end]][axis] <= threshold){ - // swap(start, end); - // - // } - // else { - // --end; - // } - // } - // - // - // local_partitioning[k * 2] = points[indeces[start]][axis] <= threshold ? 
start : start; - // local_partitioning[k * 2 + 1] = parents[id].node_partitioning[k]; - // start = parents[id].node_partitioning[k]; - // } - // // else if(start >= parents[id].points_end && k < 4){ - // // local_partitioning[k * 2] = parents[id].points_end; - // // local_partitioning[k * 2 + 1] = parents[id].points_end; - // // start = parents[id].points_end; - // // } - // parents[id].node_partitioning[k] = local_partitioning[k]; - //} - // for(uint k = 0; k < 4; ++k) { - // uint end = parents[id].node_partitioning[k]; - // if(end > 0){ - // start = partition_run(start, end, threshold, axis); - // local_partitioning[k * 2] = start; - // local_partitioning[k * 2 + 1] = parents[id].node_partitioning[k]; - // } - // } - // for(uint i = 0; i < 8; ++i) { - // parents[id].node_partitioning[i] = local_partitioning[i]; - // } + parents[id].node_partitioning[3] = partition_run(start, end, pivots[axes[0]], axes[0]); - // if(parents[id].node_partitioning[3] > start){ - // parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); - // if(parents[id].node_partitioning[1] > start) { - // parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivots[axes[2]], axes[2]); - // } - // else { - // parents[id].node_partitioning[0] = start; - // } - // if(parents[id].node_partitioning[1] == parents[id].node_partitioning[3]) { - // parents[id].node_partitioning[2] = parents[id].node_partitioning[1]; - // } - // parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); - // } - // else { - // parents[id].node_partitioning[0] = start; - // parents[id].node_partitioning[1] = start; - // parents[id].node_partitioning[2] = start; - // } + parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); parents[id].node_partitioning[0] = partition_run(start, 
parents[id].node_partitioning[1], pivots[axes[2]], axes[2]); parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivots[axes[1]], axes[1]); - // if(parents[id].node_partitioning[5] == parents[id].node_partitioning[3]) { - // parents[id].node_partitioning[4] = parents[id].node_partitioning[3]; - // } - // else { - // parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); - // - // } + parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivots[axes[2]], axes[2]); parents[id].node_partitioning[7] = end; @@ -196,28 +128,8 @@ void main() { double z_partition = parent.bounds_min[2] + 0.5 * abs(z_diff); double[3] partition_pivots = double[3](x_partition, y_partition, z_partition); uint[3] partition_order = partitioning_order(abs(x_diff), abs(y_diff), abs(z_diff)); - // for(uint i = 0; i < 3; ++i){ - // uint partition_axis = partition_order[i]; - // switch(partition_axis){ - // case 0: - // partitioning(partition_axis, x_partition, i, idx); - // break; - // case 1: - // partitioning(partition_axis, y_partition, i, idx); - // break; - // case 2: - // partitioning(partition_axis, z_partition, i, idx); - // } - // } + partitioning(partition_order, partition_pivots, idx); - // parents[idx].node_partitioning[0] = partition_borders[3]; - // parents[idx].node_partitioning[1] = partition_borders[1]; - // parents[idx].node_partitioning[2] = partition_borders[4]; - // parents[idx].node_partitioning[3] = partition_borders[0]; - // parents[idx].node_partitioning[4] = partition_borders[5]; - // parents[idx].node_partitioning[5] = 
partition_borders[2]; - // parents[idx].node_partitioning[6] = partition_borders[6]; - // parents[idx].node_partitioning[7] = parents[idx].points_end; for(uint i = 0; i < 8; ++i){ if(i == 0) { @@ -232,12 +144,6 @@ void main() { } else { children[idx * 8 + i].points_start = parents[idx].node_partitioning[i - 1]; } - // if(parents[idx].points_per_partition[i] == 0 || i == 7) { - // children[idx * 8 + i].points_end = parents[idx].node_partitioning[i]; - // } - // else { - // children[idx * 8 + i].points_end = parents[idx].node_partitioning[i] - 1; - // } children[idx * 8 + i].points_per_partition[0] = parents[idx].points_per_partition[i]; children[idx * 8 + i].points_end = parents[idx].node_partitioning[i]; children[idx * 8 + i].node_partitioning[0] = children[idx * 8 + i].points_end; @@ -250,11 +156,4 @@ void main() { children[idx * 8 + i].bounds_min = child_bounds[0]; children[idx* 8 + i].bounds_max = child_bounds[1]; } - // debug[idx].debug_order = partition_order; - // for(uint i = 0; i < 8; ++i){ - // debug[idx].debug_borders[i] = parents[idx].node_partitioning[i]; - // } - - //debug[idx].points_start = parents[idx].points_start; - //debug[idx].points_end = parents[idx].points_end; } From c179e68257d366121293fcf89f5b84b45c6aa252 Mon Sep 17 00:00:00 2001 From: jneus Date: Mon, 28 Feb 2022 23:37:55 +0100 Subject: [PATCH 13/15] Enhanced NNS --- pasture-core/src/gpu/device.rs | 206 ++++++----- pasture-tools/Cargo.toml | 2 + .../src/acceleration_structures/gpu_octree.rs | 320 ++++++++++-------- .../shaders/generate_nodes.comp | 144 +++++--- 4 files changed, 390 insertions(+), 282 deletions(-) diff --git a/pasture-core/src/gpu/device.rs b/pasture-core/src/gpu/device.rs index 43ab74c..92b9ffd 100644 --- a/pasture-core/src/gpu/device.rs +++ b/pasture-core/src/gpu/device.rs @@ -1,7 +1,7 @@ use crate::layout; -use wgpu::util::DeviceExt; use std::collections::BTreeMap; use std::ops::BitOr; +use wgpu::util::DeviceExt; /// The base structure used to get access to the GPU. 
In addition it handles things like /// shader compilation and the actual dispatch of work to the GPU. @@ -66,18 +66,19 @@ impl<'a> Device<'a> { /// }; /// }); /// ``` - pub async fn new(device_options: DeviceOptions) -> Result, wgpu::RequestDeviceError> { + pub async fn new( + device_options: DeviceOptions, + ) -> Result, wgpu::RequestDeviceError> { // == Create an instance from the desired backend ========================================= let backend_bits = match device_options.device_backend { // DeviceBackend::Primary => { wgpu::Backends::PRIMARY } // DeviceBackend::Secondary => { wgpu::Backends::SECONDARY } - DeviceBackend::Vulkan => { wgpu::Backends::VULKAN } - // DeviceBackend::Metal => { wgpu::Backends::METAL } - // DeviceBackend::Dx12 => { wgpu::Backends::DX12 } - // DeviceBackend::Dx11 => { wgpu::Backends::DX11 } - // DeviceBackend::OpenGL => { wgpu::Backends::GL } - // DeviceBackend::Browser => { wgpu::Backends::BROWSER_WEBGPU } + DeviceBackend::Vulkan => wgpu::Backends::VULKAN, // DeviceBackend::Metal => { wgpu::Backends::METAL } + // DeviceBackend::Dx12 => { wgpu::Backends::DX12 } + // DeviceBackend::Dx11 => { wgpu::Backends::DX11 } + // DeviceBackend::OpenGL => { wgpu::Backends::GL } + // DeviceBackend::Browser => { wgpu::Backends::BROWSER_WEBGPU } }; let instance = wgpu::Instance::new(backend_bits); @@ -92,13 +93,13 @@ impl<'a> Device<'a> { // The adapter gives us a handle to the actual device. // We can query some GPU information, such as the device name, its type (discrete vs integrated) // or the backend that is being used. 
- let adapter = instance.request_adapter( - &wgpu::RequestAdapterOptions { + let adapter = instance + .request_adapter(&wgpu::RequestAdapterOptions { power_preference: power_pref, compatible_surface: None, force_fallback_adapter: false, - } - ).await; + }) + .await; let adapter = match adapter { Some(a) => a, @@ -107,12 +108,17 @@ impl<'a> Device<'a> { // == Create a device and a queue from the given adapter ================================== - if !adapter.features().contains(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS) { + if !adapter + .features() + .contains(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS) + { return Result::Err(wgpu::RequestDeviceError); } let features = match device_options.use_adapter_features { - true => adapter.features().bitor(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS), + true => adapter + .features() + .bitor(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS), false => wgpu::Features::MAPPABLE_PRIMARY_BUFFERS, }; @@ -121,14 +127,16 @@ impl<'a> Device<'a> { false => wgpu::Limits::default(), }; - let (wgpu_device, wgpu_queue) = adapter.request_device( - &wgpu::DeviceDescriptor { - label: Some("wgpu_device_and_queue"), - features, - limits, - }, - None, - ).await?; + let (wgpu_device, wgpu_queue) = adapter + .request_device( + &wgpu::DeviceDescriptor { + label: Some("wgpu_device_and_queue"), + features, + limits, + }, + None, + ) + .await?; // == Other fields ========================================================================= @@ -161,12 +169,18 @@ impl<'a> Device<'a> { /// Displays the features that the physical GPU is able to support. pub fn print_adapter_features(&self) { - println!("Features supported by the adapter: {:?}", self.adapter.features()); + println!( + "Features supported by the adapter: {:?}", + self.adapter.features() + ); } /// Displays the features that are currently active. 
pub fn print_active_features(&self) { - println!("Currently active features: {:?}", self.wgpu_device.features()); + println!( + "Currently active features: {:?}", + self.wgpu_device.features() + ); } /// Displays the default limits that are likely supported by all devices. @@ -176,7 +190,10 @@ impl<'a> Device<'a> { /// Displays the best limits the physical GPU can support. pub fn print_adapter_limits(&self) { - println!("\"Best\" limits supported by the adapter: {:?}", self.adapter.limits()); + println!( + "\"Best\" limits supported by the adapter: {:?}", + self.adapter.limits() + ); } /// Displays the limits that are currently active. @@ -191,46 +208,46 @@ impl<'a> Device<'a> { /// * `uniform_as_bytes` - the uniform's content as bytes. Make sure it's correctly aligned /// according to the `std140` layout rules. /// * `binding` - the binding at which the uniform buffer object is set in the shader. - pub fn create_uniform_bind_group(&self, uniform_as_bytes: &[u8], binding: u32) -> (wgpu::BindGroupLayout, wgpu::BindGroup) { + pub fn create_uniform_bind_group( + &self, + uniform_as_bytes: &[u8], + binding: u32, + ) -> (wgpu::BindGroupLayout, wgpu::BindGroup) { // TODO: separate buffer from bind group -> should probably become part of Device state - let uniform_buffer = self.wgpu_device.create_buffer_init( - &wgpu::util::BufferInitDescriptor { - label: Some("uniform_buffer"), - contents: uniform_as_bytes, - usage: wgpu::BufferUsages::UNIFORM, - } - ); - - let uniform_bind_group_layout = self.wgpu_device.create_bind_group_layout( - &wgpu::BindGroupLayoutDescriptor { - label: Some("uniform_bind_group_layout"), - entries: &[ - wgpu::BindGroupLayoutEntry { + let uniform_buffer = + self.wgpu_device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("uniform_buffer"), + contents: uniform_as_bytes, + usage: wgpu::BufferUsages::UNIFORM, + }); + + let uniform_bind_group_layout = + self.wgpu_device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor 
{ + label: Some("uniform_bind_group_layout"), + entries: &[wgpu::BindGroupLayoutEntry { binding, visibility: wgpu::ShaderStages::COMPUTE, ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Uniform, has_dynamic_offset: false, - min_binding_size: None + min_binding_size: None, }, - count: None - } - ], - } - ); + count: None, + }], + }); - let uniform_bind_group = self.wgpu_device.create_bind_group( - &wgpu::BindGroupDescriptor { + let uniform_bind_group = self + .wgpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { label: Some("uniform_bind_group"), layout: &uniform_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding, - resource: uniform_buffer.as_entire_binding(), - }, - ], - } - ); + entries: &[wgpu::BindGroupEntry { + binding, + resource: uniform_buffer.as_entire_binding(), + }], + }); (uniform_bind_group_layout, uniform_bind_group) } @@ -238,7 +255,12 @@ impl<'a> Device<'a> { /// Associate a bind group and its layout with a given set on the shader side. /// Eg. if on the shader we have a buffer with `layout(std430, set=2, binding=0)`, /// then the passed in `index` should equal 2. 
- pub fn set_bind_group(&mut self, index: u32, bind_group_layout: &'a wgpu::BindGroupLayout, bind_group: &'a wgpu::BindGroup) { + pub fn set_bind_group( + &mut self, + index: u32, + bind_group_layout: &'a wgpu::BindGroupLayout, + bind_group: &'a wgpu::BindGroup, + ) { let bind_group_pair = BindGroupPair { bind_group_layout, bind_group, @@ -253,7 +275,7 @@ impl<'a> Device<'a> { &wgpu::ShaderModuleDescriptor { label: Some("wgsl_computer_shader_module"), source: wgpu::ShaderSource::Wgsl(wgsl_compute_shader_src.into()), - } + }, )); let pipeline = self.create_compute_pipeline(self.cs_module.as_ref().unwrap()); @@ -270,7 +292,10 @@ impl<'a> Device<'a> { self.compute_pipeline = Some(pipeline); } - fn compile_glsl_and_create_compute_module(&self, compute_shader_src: &str) -> Option { + fn compile_glsl_and_create_compute_module( + &self, + compute_shader_src: &str, + ) -> Option { // WebGPU wants its shaders pre-compiled in binary SPIR-V format. // So we'll take the source code of our compute shader and compile it // with the help of the shaderc crate. @@ -289,35 +314,37 @@ impl<'a> Device<'a> { // Now with the binary data we can create and return our ShaderModule, // which will be executed on the GPU within our compute pipeline. 
Some( - self.wgpu_device.create_shader_module(&wgpu::ShaderModuleDescriptor { - label: Some("glsl_compute_shader_module"), - source: cs_data, - }) + self.wgpu_device + .create_shader_module(&wgpu::ShaderModuleDescriptor { + label: Some("glsl_compute_shader_module"), + source: cs_data, + }), ) } fn create_compute_pipeline(&self, cs_module: &wgpu::ShaderModule) -> wgpu::ComputePipeline { - let layouts = self.bind_group_data + let layouts = self + .bind_group_data .values() .map(|pair| pair.bind_group_layout) .collect::>(); - let compute_pipeline_layout = self.wgpu_device.create_pipeline_layout( - &wgpu::PipelineLayoutDescriptor { - label: Some("compute_pipeline_layout"), - bind_group_layouts: layouts.as_slice(), - push_constant_ranges: &[], - } - ); - - let compute_pipeline = self.wgpu_device.create_compute_pipeline( - &wgpu::ComputePipelineDescriptor { - label: Some("compute_pipeline"), - layout: Some(&compute_pipeline_layout), - module: &cs_module, - entry_point: "main", - } - ); + let compute_pipeline_layout = + self.wgpu_device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("compute_pipeline_layout"), + bind_group_layouts: layouts.as_slice(), + push_constant_ranges: &[], + }); + + let compute_pipeline = + self.wgpu_device + .create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("compute_pipeline"), + layout: Some(&compute_pipeline_layout), + module: &cs_module, + entry_point: "main", + }); compute_pipeline } @@ -333,15 +360,16 @@ impl<'a> Device<'a> { // The resulting CommandBuffer can then be submitted to the GPU via a Queue. // Signal the end of the batch with CommandEncoder#finish(). let mut encoder = - self.wgpu_device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: Some("command_encoder") }); + self.wgpu_device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("command_encoder"), + }); { // The compute pass will start ("dispatch") our compute shader. 
- let mut compute_pass = encoder.begin_compute_pass( - &wgpu::ComputePassDescriptor { - label: Some("compute_pass") - } - ); + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("compute_pass"), + }); compute_pass.set_pipeline(self.compute_pipeline.as_ref().unwrap()); for (i, bind_group_pair) in self.bind_group_data.values().enumerate() { @@ -390,7 +418,9 @@ pub enum DevicePower { impl Default for DevicePower { /// Default is [DevicePower::Low] - fn default() -> Self { Self::Low } + fn default() -> Self { + Self::Low + } } /// Currently only `Vulkan` is supported, because it is the only backend that allows 64-bit floats @@ -411,7 +441,9 @@ pub enum DeviceBackend { impl Default for DeviceBackend { /// Default is `Vulkan` - fn default() -> Self { Self::Vulkan } + fn default() -> Self { + Self::Vulkan + } } // TODO: consider usage (readonly vs read/write, shader stages, ...), size, mapped_at_creation, etc. diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index 9481922..8bdf0db 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -24,6 +24,8 @@ pretty_env_logger = "0.4.0" plotters = "^0.3.0" rand = {version = "0.8.3", features = ["small_rng"] } tokio = { version = "1.16.1", features = ["full"] } +priority-queue = "1.2.1" +ordered-float = "2.10.0" #gpu related wgpu = { version = "0.12.0", features = ["spirv"], optional = true } diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-tools/src/acceleration_structures/gpu_octree.rs index 2620f7d..4c0be27 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-tools/src/acceleration_structures/gpu_octree.rs @@ -4,13 +4,14 @@ use pasture_core::{ math::AABB, nalgebra::{Point3, Vector3}, }; - +use priority_queue::DoublePriorityQueue; +use ordered_float::OrderedFloat; use std::convert::TryInto; use std::fmt; use std::mem; use std::time; +use std::thread; use wgpu::util::DeviceExt; -use 
bitvec::prelude::*; #[derive(Debug, Clone)] pub struct OctreeNode { @@ -22,10 +23,11 @@ pub struct OctreeNode { point_end: u32, } -pub struct GpuOctree<'a> { +pub struct GpuOctree{ gpu_device: wgpu::Device, gpu_queue: wgpu::Queue, - point_buffer: &'a dyn PointBuffer, + point_buffer: Vec>, + raw_points: Vec, point_partitioning: Vec, root_node: Option, depth: u32, @@ -33,6 +35,12 @@ pub struct GpuOctree<'a> { points_per_node: u32, } +enum OctreeRelation { + In, + Out, + Partial, +} + impl OctreeNode { /// Get the number of bytes, a node allocates on the gpu. /// Because the `children` pointer is not required for GPU node creation, @@ -113,6 +121,7 @@ impl OctreeNode { .collect(); let mut rest_iter = rest_data.iter_mut(); let mut node_partitioning = [0u32; 8]; + use std::sync::{mpsc::channel, Arc, Mutex}; for i in 0..8 { node_partitioning[i] = *rest_iter.next().unwrap(); } @@ -133,16 +142,40 @@ impl OctreeNode { point_end: points_end, } } - /// Checks if `pos` is within the bounds of the node. - fn contains(&self, pos: Point3) -> bool { - self.bounds.contains(&pos) - } - - fn get_closest_child(pos: Point3) -> u32 { - let child_id = 0; - child_id + fn relation_to_point(&self, pos: &Vector3, radius: f64) -> OctreeRelation { + let node_extent = self.bounds.extent(); + let node_center = self.bounds.center().coords; + let x_diff = (pos.x - node_center.x).abs(); + let y_diff = (pos.y - node_center.y).abs(); + let z_diff = (pos.z - node_center.z).abs(); + + // Point and radius outside of node + let max_diff = Vector3::new( + node_extent.x / 2. + radius, + node_extent.y / 2. + radius, + node_extent.z / 2. 
+ radius + ); + if x_diff >= max_diff.x || y_diff >= max_diff.y || z_diff >= max_diff.z { + return OctreeRelation::Out; + } + let radius_squared = radius * radius; + if x_diff <= node_extent.x || y_diff <= node_extent.y || z_diff <= node_extent.z { + let radius_squared = radius * radius; + let distance_squared = f64::powi(x_diff + node_extent.x * 0.5, 2) + f64::powi(y_diff + node_extent.y * 0.5, 2) + f64::powi(z_diff + node_extent.z * 0.5, 2); + // Whole Node lies within radius + if radius_squared >= distance_squared { + return OctreeRelation::In; + } + return OctreeRelation::Partial; + } + let distance_squared = f64::powi(x_diff - node_extent.x * 0.5, 2) + f64::powi(y_diff - node_extent.y * 0.5, 2) + f64::powi(z_diff - node_extent.z * 0.5, 2); + if radius_squared >= distance_squared{ + return OctreeRelation::Partial; + } + return OctreeRelation::Out; } + } impl fmt::Display for OctreeNode { @@ -164,7 +197,7 @@ impl fmt::Display for OctreeNode { } } -impl<'a> GpuOctree<'a> { +impl GpuOctree { /// Creates an empty Octree accelerated by the GPU. /// /// `point_buffer`: pasture buffer containing the point cloud data @@ -176,10 +209,10 @@ impl<'a> GpuOctree<'a> { /// The generated instance has no constructed octree. To get the octree, /// run `construct()`. 
pub async fn new( - point_buffer: &'a dyn PointBuffer, + point_buffer: & dyn PointBuffer, max_bounds: AABB, points_per_node: u32, - ) -> Result, wgpu::RequestDeviceError> { + ) -> Result { if points_per_node < 1 { panic!("Cannot build octree with less than 1 point per node!") } @@ -203,10 +236,25 @@ impl<'a> GpuOctree<'a> { ) .await?; + let mut points: Vec> = Vec::new(); + let point_iter = AttributeIteratorByValue::new(point_buffer, &attributes::POSITION_3D); + for point in point_iter { + points.push(point); + } + + let point_count = point_buffer.len(); + let mut raw_points = vec![0u8; 24 * point_count]; + point_buffer.get_raw_attribute_range( + 0..point_count, + &attributes::POSITION_3D, + raw_points.as_mut_slice(), + ); + Ok(GpuOctree { gpu_device: device, gpu_queue: queue, - point_buffer, + point_buffer: points, + raw_points, point_partitioning: (0..point_buffer.len() as u32).collect(), root_node: None, depth: 0, @@ -227,18 +275,8 @@ impl<'a> GpuOctree<'a> { pub async fn construct(&mut self) { let point_count = self.point_buffer.len(); - // point cloud data, later uploaded to GPU - let mut raw_points = vec![0u8; 24 * point_count]; - let now = time::Instant::now(); - self.point_buffer.get_raw_attribute_range( - 0..point_count, - &attributes::POSITION_3D, - raw_points.as_mut_slice(), - ); - let elapsed = now.elapsed(); - println!("Octree - Getting raw point data took {} ms", elapsed.as_millis()); - - let now = time::Instant::now(); + let mut raw_points = &self.raw_points; + let mut compiler = shaderc::Compiler::new().unwrap(); let comp_shader = include_str!("shaders/generate_nodes.comp"); let comp_spirv = compiler @@ -360,25 +398,25 @@ impl<'a> GpuOctree<'a> { let mut current_nodes = vec![&mut root_node]; - let raw_indeces: Vec = (0u32..point_count as u32) - .flat_map(|x| x.to_le_bytes().to_vec()) - .collect(); - + let index_range: Vec = (0u32..point_count as u32).map(u32::from).collect::>(); + let raw_indeces: &[u8] = bytemuck::cast_slice(index_range.as_slice()); 
let point_index_buffer = self.gpu_device .create_buffer_init(&wgpu::util::BufferInitDescriptor { label: Some("IndexBuffer"), - contents: &raw_indeces.as_slice(), + contents: &raw_indeces, usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, }); + let index_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { label: Some("CPU_IndexBuffer"), size: raw_indeces.len() as u64, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, mapped_at_creation: false, }); + let points_bind_group = self .gpu_device .create_bind_group(&wgpu::BindGroupDescriptor { @@ -395,11 +433,8 @@ impl<'a> GpuOctree<'a> { }, ], }); - let elapsed = now.elapsed(); - println!("Octree - GPU prep took {} ms", elapsed.as_millis()); - let now_compute = time::Instant::now(); while !current_nodes.is_empty() { - let now = time::Instant::now(); + let num_threads = current_nodes.len(); // Nodes buffers are created inside the loop, as their size changes per iteration let child_buffer_size = 8 * (OctreeNode::size() * current_nodes.len()) as u64; let child_nodes_buffer_staging = @@ -418,7 +453,7 @@ impl<'a> GpuOctree<'a> { | wgpu::BufferUsages::STORAGE, mapped_at_creation: false, }); - + let mut parent_nodes_raw = Vec::new(); for node in ¤t_nodes { parent_nodes_raw.append(&mut node.into_raw()); @@ -496,30 +531,14 @@ impl<'a> GpuOctree<'a> { self.gpu_queue.submit(Some(encoder.finish())); - let index_slice = index_buffer_staging.slice(..); - let mapped_future = index_slice.map_async(wgpu::MapMode::Read); - - self.gpu_device.poll(wgpu::Maintain::Wait); - // Read in the changes of the global point partitioning - if let Ok(()) = mapped_future.await { - let mapped_index_buffer = index_slice.get_mapped_range(); - let index_vec = mapped_index_buffer.to_vec(); - let indices: Vec = index_vec - .chunks_exact(4) - .map(|b| u32::from_le_bytes(b.try_into().unwrap())) - .collect(); - - self.point_partitioning = indices.clone(); - - 
drop(mapped_index_buffer); - index_buffer_staging.unmap(); - } - let parents_slice = parent_nodes_buffer_staging.slice(..); let parents_future = parents_slice.map_async(wgpu::MapMode::Read); - + let children_slice = child_nodes_buffer_staging.slice(..); + let children_future = children_slice.map_async(wgpu::MapMode::Read); + self.gpu_device.poll(wgpu::Maintain::Wait); if let Ok(()) = parents_future.await { + let download_now = time::Instant::now(); let mapped_nodes_data = parents_slice.get_mapped_range(); let mapped_node_buffer = mapped_nodes_data.to_vec(); let nodes: Vec = mapped_node_buffer @@ -527,10 +546,6 @@ impl<'a> GpuOctree<'a> { .map(|b| OctreeNode::from_raw(b.to_vec())) .collect(); - let children_slice = child_nodes_buffer_staging.slice(..); - let children_future = children_slice.map_async(wgpu::MapMode::Read); - self.gpu_device.poll(wgpu::Maintain::Wait); - if let Ok(()) = children_future.await { let mapped_children_data = children_slice.get_mapped_range(); let mapped_children_buffer = mapped_children_data.to_vec(); @@ -538,7 +553,6 @@ impl<'a> GpuOctree<'a> { .chunks_exact(OctreeNode::size()) .map(|b| OctreeNode::from_raw(b.to_vec())) .collect(); - let mut generated_children: Vec<&mut OctreeNode> = Vec::new(); for mut node in nodes { let children_sizes = node.points_per_partition.clone(); @@ -561,7 +575,7 @@ impl<'a> GpuOctree<'a> { if children_sizes[child_index] != 0 && !child.is_leaf(self.points_per_node) { - generated_children.push(child); + current_nodes.push(child); } else { num_leaves += 1; } @@ -570,7 +584,6 @@ impl<'a> GpuOctree<'a> { child_index += 1; } } - current_nodes.append(&mut generated_children); drop(mapped_nodes_data); parent_nodes_buffer_staging.unmap(); drop(mapped_children_data); @@ -581,15 +594,32 @@ impl<'a> GpuOctree<'a> { child_nodes_buffer_staging.destroy(); } } + let work_done = self.gpu_queue.on_submitted_work_done(); work_done.await; tree_depth += 1; - let elapsed = now.elapsed(); - println!("Octree - Compute Pass took {} 
ms", elapsed.as_millis()); + + } + + let index_slice = index_buffer_staging.slice(..); + let mapped_future = index_slice.map_async(wgpu::MapMode::Read); + + self.gpu_device.poll(wgpu::Maintain::Wait); + // Read in the changes of the global point partitioning + if let Ok(()) = mapped_future.await { + let mapped_index_buffer = index_slice.get_mapped_range(); + let index_vec = mapped_index_buffer.to_vec(); + let indices: Vec = index_vec + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + .collect(); + + self.point_partitioning = indices.clone(); + + drop(mapped_index_buffer); + index_buffer_staging.unmap(); } - let elapsed = now_compute.elapsed(); - println!("Octree - Whole Computation loop took {} ms", elapsed.as_millis()); gpu_point_buffer.destroy(); point_index_buffer.destroy(); index_buffer_staging.destroy(); @@ -603,87 +633,94 @@ impl<'a> GpuOctree<'a> { return indices; } - fn deepest_octant(&self, node: &'a OctreeNode, pos: Point3, max_distance: f64) -> &'a OctreeNode { - if let Some(children) = node.children.as_ref() { - for child in children.iter() { - let bounds_extent = child.bounds.extent(); - if !child.is_leaf(self.points_per_node) - && child.contains(pos) - && bounds_extent.x >= max_distance * 2.0 - && bounds_extent.y >= max_distance * 2.0 - && bounds_extent.z >= max_distance * 2.0 - { - return self.deepest_octant(child, pos, max_distance); - } - } + pub fn k_nearest_neighbors(&self, pos: Vector3, radius: f64, k: usize) -> Vec { + if k < 1 { + return vec![]; } - node - } - fn nearest_neighbor_helper(&self, pos: &Point3, dist: f64, node: &OctreeNode) -> Option { - let mut axes: Vector3 = 0.5 * (node.bounds.max() - node.bounds.min()); - axes += Vector3::new(node.bounds.min().x, node.bounds.min().y, node.bounds.min().z); - let pos_vector = Vector3::new(pos.x, pos.y, pos.z); - let mut nearest_index: Option = None; - let mut shortest_distance = f64::MAX; - if let Some(children) = node.children.as_ref() { - // Sort children according to 
proximity to pos - let mut sorted_children: Vec<&OctreeNode> = Vec::new(); - for i in 0..8 { - let mut k = 0; - let mut center = 0.5 * (children[i].bounds.max() - children[i].bounds.min()); - center += Vector3::new(children[i].bounds.min().x, children[i].bounds.min().y, children[i].bounds.min().z); - let curr_dist = center.metric_distance(&pos_vector); - for c in sorted_children.iter(){ - let sorted_center = 0.5 * (c.bounds.max() - c.bounds.min()) + Vector3::new(c.bounds.min().x, c.bounds.min().y, c.bounds.min().z); - let sorted_dist = sorted_center.metric_distance(&pos_vector); - if sorted_dist < curr_dist { - k += 1; + let node = self.root_node.as_ref().unwrap(); + let mut worklist = vec![node]; + let point_buffer = &self.point_buffer; + let mut points = DoublePriorityQueue::new(); + + if pos.x - radius > node.bounds.max().x || + pos.x + radius < node.bounds.min().x || + pos.y - radius > node.bounds.max().y || + pos.y + radius < node.bounds.min().y || + pos.z - radius > node.bounds.max().z || + pos.z + radius < node.bounds.min().z + { + return vec![]; + } + let radius_squared = radius * radius; + while !worklist.is_empty() { + let node = worklist.pop().unwrap(); + if node.is_leaf(self.points_per_node) { + let point_indices = self.get_points(node); + for i in point_indices.iter() { + let point = point_buffer[*i as usize]; + let curr_dist = point - pos; + let curr_dist = f64::powi(curr_dist.x, 2) + f64::powi(curr_dist.y, 2) + f64::powi(curr_dist.z, 2); + if curr_dist <= radius_squared{ + if points.len() >= k { + let (_, dist) = points.peek_max().unwrap(); + if *dist > OrderedFloat(curr_dist) { + points.pop_max(); + points.push(i.clone(), OrderedFloat(curr_dist)); + } + } + else { + points.push(i.clone(), OrderedFloat(curr_dist)); + } } } - sorted_children.insert(k, &children[i]); - + let point_indices: Vec = self.get_points(node); } - for c in sorted_children.iter(){ - if let None = nearest_index { - nearest_index = self.nearest_neighbor_helper(pos, dist, c); - if 
let Some(index) = nearest_index { - let point = self.point_buffer.get_attribute::>(&attributes::POSITION_3D, index as usize); - shortest_distance = point.metric_distance(&pos_vector); + else { + + match node.relation_to_point(&pos, radius) { + OctreeRelation::In => { + //let now = time::Instant::now(); + let point_indices = self.get_points(node); + for i in point_indices.iter() { + let point = point_buffer[*i as usize]; + let curr_dist = point - pos; + let curr_dist = f64::powi(curr_dist.x, 2) + f64::powi(curr_dist.y, 2) + f64::powi(curr_dist.z, 2); + //let curr_dist = curr_dist.x * curr_dist.x + curr_dist.y * curr_dist.y + curr_dist.z * curr_dist.z; + if curr_dist <= radius_squared { + if points.len() >= k { + let (_, dist) = points.peek_max().unwrap(); + if *dist > OrderedFloat(curr_dist) { + points.pop_max(); + points.push(i.clone(), OrderedFloat(curr_dist)); + } + } + else { + points.push(i.clone(), OrderedFloat(curr_dist)); + } + } + } } - } - else { - let current_nearest = self.nearest_neighbor_helper(pos, dist, c); - if let Some(index) = current_nearest { - let point = self.point_buffer.get_attribute::>(&attributes::POSITION_3D, current_nearest.unwrap() as usize); - let curr_dist = point.metric_distance(&pos_vector); - if curr_dist < shortest_distance { - nearest_index = current_nearest; - shortest_distance = curr_dist; + OctreeRelation::Partial => { + if let Some(children) = node.children.as_ref() { + children.iter().for_each(|c| { + if !c.is_empty(){ + worklist.push(c); + } + }); } } - } + OctreeRelation::Out => {} + }; } - } - else if !node.is_empty(){ - let point_indices = self.get_points(node); - for i in point_indices.iter() { - let point = self.point_buffer.get_attribute::>(&attributes::POSITION_3D, *i as usize); - let curr_dist = point.metric_distance(&pos_vector); - - if curr_dist <= dist && curr_dist < shortest_distance { - shortest_distance = curr_dist; - nearest_index = Some(i.clone()) - } - } + if points.is_empty() { + return vec![]; } - 
nearest_index - } - - pub fn nearest_neighbor(&self, pos: Point3, max_distance: f64) -> Option { - let node = self.deepest_octant(self.root_node.as_ref().unwrap(), pos, max_distance); - let neighbor = self.nearest_neighbor_helper(&pos, max_distance, &node); - neighbor + let mut nearest = points.into_ascending_sorted_vec(); + nearest.truncate(k); + + return nearest; + } } @@ -703,9 +740,10 @@ mod tests { use tokio; static FILE: &'static str = //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" - "/home/jnoice/Downloads/interesting.las" + //"/home/jnoice/Downloads/interesting.las" //"/home/jnoice/Downloads/45123H3316.laz" //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz" + "/home/jnoice/Downloads/portland.laz" ; #[tokio::test] async fn check_correct_bounds() { diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp index d227091..c4a839d 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp @@ -22,7 +22,7 @@ layout(std430, set=1, binding=1) buffer Partitioning { uint indeces[]; }; -layout (local_size_x=1, local_size_y=1, local_size_z=1) in; +layout (local_size_x=8, local_size_y=1, local_size_z=1) in; uint[3] partitioning_order(double x, double y, double z){ uint[] order = uint[3](0,1,2); @@ -68,33 +68,62 @@ uint partition_run(uint start, uint end, double pivot, uint axis) { return j; } -void partitioning(uint[3] axes, double[3] pivots, uint id){ +// void partitioning(uint[3] axes, double[3] pivots, uint id, uint local_thread_id){ +// uint start = parents[id].points_start; +// uint end = parents[id].node_partitioning[0]; + +// parents[id].node_partitioning[3] = partition_run(start, end, pivots[axes[0]], axes[0]); + +// parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); + +// 
parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivots[axes[1]], axes[1]); + +// parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivots[axes[2]], axes[2]); + +// parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); + +// parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); + +// parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivots[axes[2]], axes[2]); + +// parents[id].node_partitioning[7] = end; +// } +void partition_pass_first(uint axis, double pivot, uint id, uint thread_id) { uint start = parents[id].points_start; - uint end = parents[id].node_partitioning[0]; - - parents[id].node_partitioning[3] = partition_run(start, end, pivots[axes[0]], axes[0]); - - parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); - parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivots[axes[2]], axes[2]); - parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); - parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivots[axes[1]], axes[1]); - - parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); - parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivots[axes[2]], axes[2]); + uint end = parents[id].points_end; + parents[id].node_partitioning[3] = partition_run(start, end, pivot, axis); parents[id].node_partitioning[7] = end; } +void partition_pass_second(uint axis, double pivot, uint id, uint thread_id) { + uint start = 
parents[id].points_start; + uint end = parents[id].points_end; + switch(thread_id) { + case 0: parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivot, axis); + break; + case 1: parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivot, axis); + break; + } +} +void partition_pass_third(uint axis, double pivot, uint id, uint thread_id){ + uint start = parents[id].points_start; + uint end = parents[id].points_end; + switch(thread_id) { + case 0: parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivot, axis); + break; + case 1: parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivot, axis); + break; + case 2: parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivot, axis); + break; + case 3: parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivot, axis); + } +} bool[3] partitioned_to_right(uint[3] partition_order, uint index){ bool[3] on_right_side = bool[3](false, false, false); - if(index % 2 != 0){ - on_right_side[partition_order[2]] = true; - } - if(index >= 2 && index <= 3 || index >= 6){ - on_right_side[partition_order[1]] = true; - } - if(index >= 4){ - on_right_side[partition_order[0]] = true; - } + + on_right_side[partition_order[2]] = index % 2 != 0; + on_right_side[partition_order[1]] = index >= 2 && index <= 3 || index >= 6; + on_right_side[partition_order[0]] = index >= 4; return on_right_side; } @@ -104,20 +133,19 @@ double[2][3] get_childs_bounds(uint[3] partition_order, double[3] partition_axes if(child_index == 0) { bounds_min = parent.bounds_min; bounds_max = partition_axes; + return double[2][3](bounds_min, bounds_max); } - else { - bool[3] on_right_side = partitioned_to_right(partition_order, child_index); - for(uint k = 0; k < 3; ++k){ - bounds_min[k] = 
on_right_side[k] ? partition_axes[k] : parent.bounds_min[k]; - bounds_max[k] = on_right_side[k] ? parent.bounds_max[k] : partition_axes[k]; - - } + bool[3] on_right_side = partitioned_to_right(partition_order, child_index); + for(uint k = 0; k < 3; ++k){ + bounds_min[k] = on_right_side[k] ? partition_axes[k] : parent.bounds_min[k]; + bounds_max[k] = on_right_side[k] ? parent.bounds_max[k] : partition_axes[k]; } return double[2][3](bounds_min, bounds_max); } void main() { uint idx = gl_WorkGroupID.x; + uint thread_idx = gl_LocalInvocationID.x; Node parent = parents[idx]; double x_diff = parent.bounds_max[0] - parent.bounds_min[0]; @@ -129,31 +157,39 @@ void main() { double[3] partition_pivots = double[3](x_partition, y_partition, z_partition); uint[3] partition_order = partitioning_order(abs(x_diff), abs(y_diff), abs(z_diff)); - partitioning(partition_order, partition_pivots, idx); - - for(uint i = 0; i < 8; ++i){ - if(i == 0) { - parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i] - parents[idx].points_start; - } else { - parents[idx].points_per_partition[i] = parents[idx].node_partitioning[i] - parents[idx].node_partitioning[i-1]; - } + //partitioning(partition_order, partition_pivots, idx, thread_idx); + if(thread_idx == 0){ + partition_pass_first(partition_order[0], partition_pivots[0], idx, thread_idx); } - for(uint i = 0; i < 8; ++i){ - if(i == 0) { - children[idx * 8 + i].points_start = parents[idx].points_start; - } else { - children[idx * 8 + i].points_start = parents[idx].node_partitioning[i - 1]; - } - children[idx * 8 + i].points_per_partition[0] = parents[idx].points_per_partition[i]; - children[idx * 8 + i].points_end = parents[idx].node_partitioning[i]; - children[idx * 8 + i].node_partitioning[0] = children[idx * 8 + i].points_end; - double[2][3] child_bounds = get_childs_bounds( - partition_order, - double[3](x_partition, y_partition, z_partition), - i, - parent - ); - children[idx * 8 + i].bounds_min = child_bounds[0]; - 
children[idx* 8 + i].bounds_max = child_bounds[1]; + if(thread_idx < 2) { + partition_pass_second(partition_order[1], partition_pivots[1], idx, thread_idx); + } + if(thread_idx < 4) { + partition_pass_third(partition_order[2], partition_pivots[2], idx, thread_idx); + } + + if(thread_idx == 0) { + parents[idx].points_per_partition[thread_idx] = parents[idx].node_partitioning[thread_idx] - parents[idx].points_start; + } + else { + parents[idx].points_per_partition[thread_idx] = parents[idx].node_partitioning[thread_idx] - parents[idx].node_partitioning[thread_idx - 1]; + } + + if(thread_idx == 0) { + children[idx * 8 + thread_idx].points_start = parents[idx].points_start; + } else { + children[idx * 8 + thread_idx].points_start = parents[idx].node_partitioning[thread_idx - 1]; } + children[idx * 8 + thread_idx].points_per_partition[0] = parents[idx].points_per_partition[thread_idx]; + children[idx * 8 + thread_idx].points_end = parents[idx].node_partitioning[thread_idx]; + children[idx * 8 + thread_idx].node_partitioning[0] = children[idx * 8 + thread_idx].points_end; + double[2][3] child_bounds = get_childs_bounds( + partition_order, + double[3](x_partition, y_partition, z_partition), + thread_idx, + parent + ); + children[idx * 8 + thread_idx].bounds_min = child_bounds[0]; + children[idx* 8 + thread_idx].bounds_max = child_bounds[1]; + } From 6bbbb1fd3da170980d568d0e4f651f7351dd1f6d Mon Sep 17 00:00:00 2001 From: jneus Date: Wed, 2 Mar 2022 14:53:31 +0100 Subject: [PATCH 14/15] Moved octree to pasture-algorithms --- pasture-algorithms/Cargo.toml | 13 ++ .../src/acceleration_structures/gpu_octree.rs | 113 ++++++++++------ .../src/acceleration_structures/mod.rs | 0 .../shaders/generate_nodes.comp | 123 +++++++++--------- pasture-algorithms/src/lib.rs | 4 + pasture-tools/Cargo.toml | 16 +-- 6 files changed, 152 insertions(+), 117 deletions(-) rename {pasture-tools => pasture-algorithms}/src/acceleration_structures/gpu_octree.rs (89%) rename {pasture-tools => 
pasture-algorithms}/src/acceleration_structures/mod.rs (100%) rename {pasture-tools => pasture-algorithms}/src/acceleration_structures/shaders/generate_nodes.comp (55%) diff --git a/pasture-algorithms/Cargo.toml b/pasture-algorithms/Cargo.toml index cf240ce..de27f45 100644 --- a/pasture-algorithms/Cargo.toml +++ b/pasture-algorithms/Cargo.toml @@ -21,6 +21,19 @@ typenum = "1.13.0" proj-sys = "0.18.2" kd-tree = "0.3.0" num-traits = "0.2.14" +tokio = { version = "1.16.1", features = ["full"] } +priority-queue = "1.2.1" +ordered-float = "2.10.0" + +#gpu related +wgpu = { version = "0.12.0", features = ["spirv"], optional = true } +shaderc = { version = "0.7.2", optional = true } +futures = { version = "0.3", optional = true } +bytemuck = { version = "1.5.1", optional = true } + +[features] +gpu = ["wgpu", "shaderc", "futures", "bytemuck"] + [dev-dependencies] criterion = "0.3" diff --git a/pasture-tools/src/acceleration_structures/gpu_octree.rs b/pasture-algorithms/src/acceleration_structures/gpu_octree.rs similarity index 89% rename from pasture-tools/src/acceleration_structures/gpu_octree.rs rename to pasture-algorithms/src/acceleration_structures/gpu_octree.rs index 4c0be27..2e945f8 100644 --- a/pasture-tools/src/acceleration_structures/gpu_octree.rs +++ b/pasture-algorithms/src/acceleration_structures/gpu_octree.rs @@ -1,7 +1,7 @@ use pasture_core::{ containers::{attr1::AttributeIteratorByValue, PointBuffer, PointBufferExt, PerAttributePointBufferExt}, layout::attributes, - math::AABB, + math::{AABB, DynamicMortonIndex, Octant, MortonIndexNaming, MortonIndex64}, nalgebra::{Point3, Vector3}, }; use priority_queue::DoublePriorityQueue; @@ -9,8 +9,6 @@ use ordered_float::OrderedFloat; use std::convert::TryInto; use std::fmt; use std::mem; -use std::time; -use std::thread; use wgpu::util::DeviceExt; #[derive(Debug, Clone)] @@ -33,6 +31,7 @@ pub struct GpuOctree{ depth: u32, bounds: AABB, points_per_node: u32, + morton_code: DynamicMortonIndex, } enum OctreeRelation { 
@@ -143,6 +142,10 @@ impl OctreeNode { } } + /// Specifies the relation between the query point `pos` and the Bounding Box of the Node. + /// OctreeRelation::In ==> the whole node sits inside the radius of the query + /// OctreeRelation::Partial ==> node and query intersect or node contains whole query + /// OctreeRelation::Out ==> node and query are disjoint fn relation_to_point(&self, pos: &Vector3, radius: f64) -> OctreeRelation { let node_extent = self.bounds.extent(); let node_center = self.bounds.center().coords; @@ -236,12 +239,16 @@ impl GpuOctree { ) .await?; + + // Points are read from the Pasture PointBuffer to allow for faster access of individual points., + // Without this, onw would need to get each raw point individually and convert it let mut points: Vec> = Vec::new(); let point_iter = AttributeIteratorByValue::new(point_buffer, &attributes::POSITION_3D); for point in point_iter { points.push(point); } + // raw points read here, so that it must not be done while construction let point_count = point_buffer.len(); let mut raw_points = vec![0u8; 24 * point_count]; point_buffer.get_raw_attribute_range( @@ -249,6 +256,7 @@ impl GpuOctree { &attributes::POSITION_3D, raw_points.as_mut_slice(), ); + let morton_code = DynamicMortonIndex::from_octants(&[]); Ok(GpuOctree { gpu_device: device, @@ -260,6 +268,7 @@ impl GpuOctree { depth: 0, bounds: max_bounds, points_per_node, + morton_code }) } @@ -268,6 +277,20 @@ impl GpuOctree { println!("Tree Depth: {}", self.depth); println!("{}", self.root_node.as_ref().unwrap()); } + + /// Prints the morton index for given point inside the AABB of the octree + pub fn print_morton_code(&self, point: &Point3) { + if let Some(root) = self.root_node.as_ref() { + println!("{}", + MortonIndex64::from_point_in_bounds(&point, &root.bounds) + .to_string(MortonIndexNaming::AsOctantConcatenationWithRoot)); + } + else { + println!("Octree not constructed yet"); + } + } + + // } /// Run top-down construction of the octree. 
/// /// Starting from the root, on each level the children of all current leaves @@ -375,12 +398,13 @@ impl GpuOctree { let gpu_point_buffer = self.gpu_device - .create_buffer_init(&wgpu::util::BufferInitDescriptor { + .create_buffer(&wgpu::BufferDescriptor { label: Some("PointBuffer"), - contents: &raw_points.as_slice(), + size: (point_count * mem::size_of::>()) as u64, usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, }); - + self.gpu_queue.write_buffer(&gpu_point_buffer, 0, self.raw_points.as_slice()); let mut root_node = OctreeNode { bounds: self.bounds, children: None, @@ -402,21 +426,17 @@ impl GpuOctree { let raw_indeces: &[u8] = bytemuck::cast_slice(index_range.as_slice()); let point_index_buffer = self.gpu_device - .create_buffer_init(&wgpu::util::BufferInitDescriptor { + .create_buffer(&wgpu::BufferDescriptor { label: Some("IndexBuffer"), - contents: &raw_indeces, + size: (point_count * mem::size_of::()) as u64, usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, }); - - let index_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { - label: Some("CPU_IndexBuffer"), - size: raw_indeces.len() as u64, - usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, - mapped_at_creation: false, - }); - + self.gpu_queue.write_buffer(&point_index_buffer, 0, raw_indeces); + let points_bind_group = self .gpu_device .create_bind_group(&wgpu::BindGroupDescriptor { @@ -434,8 +454,10 @@ impl GpuOctree { ], }); while !current_nodes.is_empty() { - let num_threads = current_nodes.len(); + let num_blocks = current_nodes.len(); + // Nodes buffers are created inside the loop, as their size changes per iteration + // Staging Buffers do not reside on GPU and are used for reading Compure results let child_buffer_size = 8 * (OctreeNode::size() * current_nodes.len()) as u64; let 
child_nodes_buffer_staging = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { @@ -467,13 +489,16 @@ impl GpuOctree { }); let parent_nodes_buffer = self.gpu_device - .create_buffer_init(&wgpu::util::BufferInitDescriptor { + .create_buffer(&wgpu::BufferDescriptor { label: Some("ParentNodesBuffer"), - contents: parent_nodes_raw.as_slice(), + size: current_nodes.len() as u64 * OctreeNode::size() as u64, usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, }); + self.gpu_queue.write_buffer(&parent_nodes_buffer, 0, parent_nodes_raw.as_slice()); + let nodes_bind_group = self .gpu_device .create_bind_group(&wgpu::BindGroupDescriptor { @@ -505,8 +530,9 @@ impl GpuOctree { compute_pass.set_bind_group(1, &points_bind_group, &[]); compute_pass.insert_debug_marker("Pasture Compute Debug"); - compute_pass.dispatch(current_nodes.len() as u32, 1, 1); + compute_pass.dispatch(num_blocks as u32, 1, 1); } + // Copy computed Nodes into CPU staging buffers for mapped reading encoder.copy_buffer_to_buffer( &child_nodes_buffer, 0, @@ -521,16 +547,10 @@ impl GpuOctree { 0, parent_nodes_raw.len() as u64, ); - encoder.copy_buffer_to_buffer( - &point_index_buffer, - 0, - &index_buffer_staging, - 0, - raw_indeces.len() as u64, - ); + self.gpu_queue.submit(Some(encoder.finish())); - + let parents_slice = parent_nodes_buffer_staging.slice(..); let parents_future = parents_slice.map_async(wgpu::MapMode::Read); let children_slice = child_nodes_buffer_staging.slice(..); @@ -538,7 +558,6 @@ impl GpuOctree { self.gpu_device.poll(wgpu::Maintain::Wait); if let Ok(()) = parents_future.await { - let download_now = time::Instant::now(); let mapped_nodes_data = parents_slice.get_mapped_range(); let mapped_node_buffer = mapped_nodes_data.to_vec(); let nodes: Vec = mapped_node_buffer @@ -563,7 +582,7 @@ impl GpuOctree { let node_ref = current_nodes.remove(0); *node_ref = node; - + let children: &mut Box<[OctreeNode; 8]> = 
node_ref.children.as_mut().unwrap(); @@ -583,6 +602,15 @@ impl GpuOctree { num_nodes += 1; child_index += 1; } + let morton_octants: Vec = vec![ + 0, 1, 2, 3, 4, 5, 6, 7 + ] + .iter() + .map(|&raw_octant| (raw_octant as u8).try_into().unwrap()) + .collect(); + + morton_octants.iter() + .for_each(|octant| self.morton_code.add_octant(*octant)); } drop(mapped_nodes_data); parent_nodes_buffer_staging.unmap(); @@ -594,15 +622,11 @@ impl GpuOctree { child_nodes_buffer_staging.destroy(); } } - - let work_done = self.gpu_queue.on_submitted_work_done(); - work_done.await; tree_depth += 1; - } - - let index_slice = index_buffer_staging.slice(..); + // Point indices are read after compute loop to reduce data copying and runtime + let index_slice = point_index_buffer.slice(..); let mapped_future = index_slice.map_async(wgpu::MapMode::Read); self.gpu_device.poll(wgpu::Maintain::Wait); @@ -618,25 +642,30 @@ impl GpuOctree { self.point_partitioning = indices.clone(); drop(mapped_index_buffer); - index_buffer_staging.unmap(); + point_index_buffer.unmap(); } gpu_point_buffer.destroy(); point_index_buffer.destroy(); - index_buffer_staging.destroy(); self.root_node = Some(root_node); self.depth = tree_depth; } + /// Returns a Vec containing the indices of all points belonging to `point` fn get_points(&self, node: &OctreeNode) -> Vec { let indices = self.point_partitioning[node.point_start as usize..node.point_end as usize].to_vec(); return indices; } + /// Computes the `k` nearest neighbors of `point` that are within `radius`. 
+ /// Returns a Vec containing the indices to the neighbor points in increasing order of distance pub fn k_nearest_neighbors(&self, pos: Vector3, radius: f64, k: usize) -> Vec { if k < 1 { return vec![]; } + // We use a Vec<&Octree> as working queue for the nodes that we are visitting + // We use a priority queue so that the found indices are already + // being sorted by their distance to pos let node = self.root_node.as_ref().unwrap(); let mut worklist = vec![node]; let point_buffer = &self.point_buffer; @@ -652,8 +681,10 @@ impl GpuOctree { return vec![]; } let radius_squared = radius * radius; + while !worklist.is_empty() { let node = worklist.pop().unwrap(); + // When node is leaf we need to search it's points if node.is_leaf(self.points_per_node) { let point_indices = self.get_points(node); for i in point_indices.iter() { @@ -676,16 +707,16 @@ impl GpuOctree { let point_indices: Vec = self.get_points(node); } else { - + // When node is not leaf, we must check if the Bounding Box of the node is in range of the query + // If so, we check if the whole node is inside the radius to possibly reduce step downs + // If node and query only intersect, children of the node are inspected match node.relation_to_point(&pos, radius) { OctreeRelation::In => { - //let now = time::Instant::now(); let point_indices = self.get_points(node); for i in point_indices.iter() { let point = point_buffer[*i as usize]; let curr_dist = point - pos; let curr_dist = f64::powi(curr_dist.x, 2) + f64::powi(curr_dist.y, 2) + f64::powi(curr_dist.z, 2); - //let curr_dist = curr_dist.x * curr_dist.x + curr_dist.y * curr_dist.y + curr_dist.z * curr_dist.z; if curr_dist <= radius_squared { if points.len() >= k { let (_, dist) = points.peek_max().unwrap(); diff --git a/pasture-tools/src/acceleration_structures/mod.rs b/pasture-algorithms/src/acceleration_structures/mod.rs similarity index 100% rename from pasture-tools/src/acceleration_structures/mod.rs rename to 
pasture-algorithms/src/acceleration_structures/mod.rs diff --git a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-algorithms/src/acceleration_structures/shaders/generate_nodes.comp similarity index 55% rename from pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp rename to pasture-algorithms/src/acceleration_structures/shaders/generate_nodes.comp index c4a839d..0c05b3c 100644 --- a/pasture-tools/src/acceleration_structures/shaders/generate_nodes.comp +++ b/pasture-algorithms/src/acceleration_structures/shaders/generate_nodes.comp @@ -1,3 +1,18 @@ +/* +Compute shader for processing octree nodes. +This approach is based on the work shown in "GPU-based Adaptive Octree Construction Algorithms" by (Goradia, 2008), +Global buffer indeces[] maps an index to a point of the point cloud. +This is used, so that we do not need to swap any real point inside the Point buffer. +We use the indices to later adress the correct points of a Pasture PointBuffer + +Each node holds an array called node_partitioning, that specifies the range of indices a node contains. +node_partitioning: [4,9,12,12,20,22,26,30] points_per_partition: [4,5,3,0,8,2,4,4] + +Right now, each parent in parents[] node has its own WorkGroup consisting of 8 threads, each thread assigning the computed +values to the corresponding child node in children[]. That means each block of 8 Nodes in children[] belongs to o Node in parents[]. 
+ +*/ + #version 450 struct Node { @@ -24,6 +39,8 @@ layout(std430, set=1, binding=1) buffer Partitioning { layout (local_size_x=8, local_size_y=1, local_size_z=1) in; +/// Determines the order in which the node is divided on is axes +/// [0,1,2] means partition order is x,y,z uint[3] partitioning_order(double x, double y, double z){ uint[] order = uint[3](0,1,2); double[] axes = double[3](x,y,z); @@ -44,12 +61,16 @@ uint[3] partitioning_order(double x, double y, double z){ } return order; } - +/// Swaps the indices at place a and b void swap( uint a, uint b){ uint tmp = indeces[a]; indeces[a] = indeces[b]; indeces[b] = tmp; } + +/// Partitions the indeces buffer along given axis with given pivot +/// When finished all points in [start, j) are less than pivot and all +/// points in [j, end) are greater equals pivot uint partition_run(uint start, uint end, double pivot, uint axis) { uint i = start; uint j = start; @@ -67,57 +88,44 @@ uint partition_run(uint start, uint end, double pivot, uint axis) { return j; } - -// void partitioning(uint[3] axes, double[3] pivots, uint id, uint local_thread_id){ -// uint start = parents[id].points_start; -// uint end = parents[id].node_partitioning[0]; - -// parents[id].node_partitioning[3] = partition_run(start, end, pivots[axes[0]], axes[0]); - -// parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); - -// parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivots[axes[1]], axes[1]); - -// parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivots[axes[2]], axes[2]); - -// parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); - -// parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); - -// 
parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivots[axes[2]], axes[2]); - -// parents[id].node_partitioning[7] = end; -// } -void partition_pass_first(uint axis, double pivot, uint id, uint thread_id) { - uint start = parents[id].points_start; - uint end = parents[id].points_end; - parents[id].node_partitioning[3] = partition_run(start, end, pivot, axis); - parents[id].node_partitioning[7] = end; -} -void partition_pass_second(uint axis, double pivot, uint id, uint thread_id) { +/// Starts the different partitionig phases for the axes +/// At the end half of node_partitioning (0-3) are below pivots[0] on axes[0], +/// (0-1) are below pivots[1] on axes[1] and (2-3) are above pivots[1] o axes[1] +/// (0) and (2) are below pivots[2] on axis[2], (1) and (3) are above pivots[2] on axis[2]. +/// Same happens on subpartition (4-7) of node_partitioning +void partitioning(uint[3] axes, double[3] pivots, uint id, uint local_thread_id){ uint start = parents[id].points_start; - uint end = parents[id].points_end; - switch(thread_id) { - case 0: parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivot, axis); - break; - case 1: parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivot, axis); - break; + uint end = parents[id].node_partitioning[0]; + if(local_thread_id == 0){ + parents[id].node_partitioning[3] = partition_run(start, end, pivots[axes[0]], axes[0]); } -} -void partition_pass_third(uint axis, double pivot, uint id, uint thread_id){ - uint start = parents[id].points_start; - uint end = parents[id].points_end; - switch(thread_id) { - case 0: parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivot, axis); - break; - case 1: parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivot, axis); - break; - case 2: parents[id].node_partitioning[4] = 
partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivot, axis); - break; - case 3: parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivot, axis); + barrier(); + if(local_thread_id == 1) { + parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]); + } + else if(local_thread_id == 2){ + parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivots[axes[1]], axes[1]); + } + barrier(); + if(local_thread_id == 3){ + parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivots[axes[2]], axes[2]); + } + else if(local_thread_id == 4){ + parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]); + } + else if(local_thread_id == 5){ + parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]); + } + else if(local_thread_id == 6){ + parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivots[axes[2]], axes[2]); } + else if(local_thread_id == 7){ + parents[id].node_partitioning[7] = end; + } + barrier(); } +/// Returns if child at `index` in parent is partitioned on the right side of the pivot on any axis bool[3] partitioned_to_right(uint[3] partition_order, uint index){ bool[3] on_right_side = bool[3](false, false, false); @@ -126,7 +134,7 @@ bool[3] partitioned_to_right(uint[3] partition_order, uint index){ on_right_side[partition_order[0]] = index >= 4; return on_right_side; } - +/// Calculates the Bounding Box of given child double[2][3] get_childs_bounds(uint[3] partition_order, double[3] partition_axes, uint child_index, Node parent){ double[3] bounds_min; double[3] bounds_max; @@ -147,7 +155,8 @@ void main() { uint idx = gl_WorkGroupID.x; uint thread_idx = 
gl_LocalInvocationID.x; Node parent = parents[idx]; - + // Calculation of axis-length and center point coorddinates + // of parent's bounding box double x_diff = parent.bounds_max[0] - parent.bounds_min[0]; double y_diff = parent.bounds_max[1] - parent.bounds_min[1]; double z_diff = parent.bounds_max[2] - parent.bounds_min[2]; @@ -157,16 +166,7 @@ void main() { double[3] partition_pivots = double[3](x_partition, y_partition, z_partition); uint[3] partition_order = partitioning_order(abs(x_diff), abs(y_diff), abs(z_diff)); - //partitioning(partition_order, partition_pivots, idx, thread_idx); - if(thread_idx == 0){ - partition_pass_first(partition_order[0], partition_pivots[0], idx, thread_idx); - } - if(thread_idx < 2) { - partition_pass_second(partition_order[1], partition_pivots[1], idx, thread_idx); - } - if(thread_idx < 4) { - partition_pass_third(partition_order[2], partition_pivots[2], idx, thread_idx); - } + partitioning(partition_order, partition_pivots, idx, thread_idx); if(thread_idx == 0) { parents[idx].points_per_partition[thread_idx] = parents[idx].node_partitioning[thread_idx] - parents[idx].points_start; @@ -174,7 +174,9 @@ void main() { else { parents[idx].points_per_partition[thread_idx] = parents[idx].node_partitioning[thread_idx] - parents[idx].node_partitioning[thread_idx - 1]; } - + // writes computed data into children + // based on the number of points belonging to a child, we can + // decide if the child is a leaf or should be processed as parent in next compute pass if(thread_idx == 0) { children[idx * 8 + thread_idx].points_start = parents[idx].points_start; } else { @@ -191,5 +193,4 @@ void main() { ); children[idx * 8 + thread_idx].bounds_min = child_bounds[0]; children[idx* 8 + thread_idx].bounds_max = child_bounds[1]; - } diff --git a/pasture-algorithms/src/lib.rs b/pasture-algorithms/src/lib.rs index 00fa392..3e17ba4 100644 --- a/pasture-algorithms/src/lib.rs +++ b/pasture-algorithms/src/lib.rs @@ -4,6 +4,10 @@ //! 
Pasture contains algorithms that can manipulate the point cloud data or //! calculate results based on them. +// Data structures used for accelerating operations on point clouds +#[cfg(feature = "gpu")] +pub mod acceleration_structures; + // Algorithm to calculate the bounding box of a point cloud. pub mod bounds; // Get the minimum and maximum value of a specific attribute in a point cloud. diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index 8bdf0db..030cb60 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -12,30 +12,16 @@ categories = ["data-structures", "command-line-utilities"] readme = "README.md" [dependencies] -pasture-core = {version = "=0.2.0", path = "../pasture-core", features=["gpu"]} +pasture-core = {version = "=0.2.0", path = "../pasture-core"} pasture-io = {version = "=0.2.0", path = "../pasture-io" } pasture-algorithms = {version = "=0.2.0", path = "../pasture-algorithms" } pasture-derive = {version = "=0.2.0", path = "../pasture-derive" } anyhow = "1.0.34" clap = "2.33.3" log = "0.4" -env_logger = "0.9.0" pretty_env_logger = "0.4.0" plotters = "^0.3.0" rand = {version = "0.8.3", features = ["small_rng"] } -tokio = { version = "1.16.1", features = ["full"] } -priority-queue = "1.2.1" -ordered-float = "2.10.0" - -#gpu related -wgpu = { version = "0.12.0", features = ["spirv"], optional = true } -shaderc = { version = "0.7.2", optional = true } -futures = { version = "0.3", optional = true } -bytemuck = { version = "1.5.1", optional = true } - -[features] -gpu = ["wgpu", "shaderc", "futures", "bytemuck"] - [[bin]] name = "reorder_laz_chunks" From 7dd22abc6690a6601abf7870740c673944206076 Mon Sep 17 00:00:00 2001 From: jneus Date: Wed, 2 Mar 2022 15:18:04 +0100 Subject: [PATCH 15/15] Added a README for octree module --- .../src/acceleration_structures/README.md | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 pasture-algorithms/src/acceleration_structures/README.md diff --git 
a/pasture-algorithms/src/acceleration_structures/README.md b/pasture-algorithms/src/acceleration_structures/README.md new file mode 100644 index 0000000..61910c8 --- /dev/null +++ b/pasture-algorithms/src/acceleration_structures/README.md @@ -0,0 +1,30 @@ +# Pasture GPU Octree +## TODO +- Runtime of octree construction can be improved; the bottleneck is the first few iterations of the compute shader, + where the partition of the node indices runs on a large set of numbers (whole buffer at first iteration). + - A possible way of accelerating the partitioning is via a GPU sort (bitonic sort, radix sort, odd-even-merge sort) on the Morton codes of the points. + - As of now, there are no atomic operations supported by wgpu, but since 2021 they are officially part of the WebGPU spec. In the near future wgpu should allow for the use of atomic operations, which should make implementing sorting easier. +- Currently nearest_neighbor search runs recursively and single-threaded. When compiling for release this is not really a bottleneck as of now, but the runtime when building in dev mode could improve radically. + - It's possible to enhance the nearest neighbor search to run multi-threaded in the future, for example through the use of the rayon crate. + - For that to function it could be necessary to restructure the use of some resources in the algorithm to get the ownership right when working with multiple threads. +- As of now, the compute shader of the construction algorithm terminates prematurely on large input sizes (point clouds with over 3-4 million points). + - As I was able to reproduce a Vulkan Validation error that is not captured by wgpu, I submitted an issue in their repo (https://github.com/gfx-rs/wgpu/issues/2484) + - Possible sources for this can be: + - Illegal access of any resource by the compute shader on large files + - Termination of gpu process due to timing constraints + - An error in wgpu, as the error in the issue occurs when polling the gpu.
+ - To further investigate the issue I will continue participating in it + +If any questions arise, I can be contacted via email: jannis.neus@live.de + +## Building the octree + +To build the GpuOctree module, pasture needs to be built with the `gpu` feature enabled: + +``` +cargo build --features gpu +``` + +## Using octrees + +There is an example file in pasture-algorithms that shows how to use this module.