diff --git a/pasture-algorithms/Cargo.toml b/pasture-algorithms/Cargo.toml index 832c3a3..f6365b5 100644 --- a/pasture-algorithms/Cargo.toml +++ b/pasture-algorithms/Cargo.toml @@ -21,6 +21,19 @@ typenum = "1.13.0" proj-sys = "0.22.0" kd-tree = "0.3.0" num-traits = "0.2.14" +tokio = { version = "1.16.1", features = ["full"] } +priority-queue = "1.2.1" +ordered-float = "2.10.0" + +#gpu related +wgpu = { version = "0.12.0", features = ["spirv"], optional = true } +shaderc = { version = "0.7.2", optional = true } +futures = { version = "0.3", optional = true } +bytemuck = { version = "1.5.1", optional = true } + +[features] +gpu = ["wgpu", "shaderc", "futures", "bytemuck"] + [dev-dependencies] criterion = "0.3" diff --git a/pasture-algorithms/src/acceleration_structures/README.md b/pasture-algorithms/src/acceleration_structures/README.md new file mode 100644 index 0000000..61910c8 --- /dev/null +++ b/pasture-algorithms/src/acceleration_structures/README.md @@ -0,0 +1,30 @@ +# Pasture GPU Octree +## TODO +- Runtime of octree construction can be improved, bottleneck are the first few iterations of the compute shader, + where the partition of the node indices runs on a large set of numbers (whole buffer at first iteration). + - Possible way of accelerating the partitioning is via a GPU sort (bitonic sort, radix sort, odd-even-merge sort) on the morton codes of the points. + - As of now, there are no atomic operations supported by wgpu, but since 2021 they are officially part of the WebGPU spec. In the near future wgpu should allow for the use of atomic operations, which should make implementing sorting easier +- Currently nearest_neighbor search runs recursively and single threaded. When compiling for release this is not really a bottleneck as of now, but the runtime when building in dev mode could improve radically. + - It's possible to enhance the nearest neighbor search to run multi-threaded in the future, for example through the use of the rayon crate. 
+ - For that to function it could be possible to restructure the use of some resources in the algorithm to get the ownership right, when working with multiple threads. +- As of now, the compute shader of the construction algorithm terminates prematurely on large input sizes (point clouds with over 3-4 million points). + - As I was able to reproduce a Vulkan Validation error, that is not captured by wgpu, I submitted an issue in their repo (https://github.com/gfx-rs/wgpu/issues/2484) + - possible sources for this can be: + - Illegal access of any resource by the compute shader on large files + - Termination of gpu process due to timing constraints + - An error in wgpu, as the error in the issue occurs when polling the gpu. + - to further investigate the issue I will continue to participate in it + +If any questions arise, I can be contacted via email: jannis.neus@live.de + +## Building the octree + +To build the GpuOctree module, pasture needs to be built with the `gpu` feature enabled: + +``` +cargo build --features gpu +``` + +## Using octrees + +There is an example file in pasture-algorithms that shows how to use this module. 
diff --git a/pasture-algorithms/src/acceleration_structures/gpu_octree.rs b/pasture-algorithms/src/acceleration_structures/gpu_octree.rs new file mode 100644 index 0000000..2e945f8 --- /dev/null +++ b/pasture-algorithms/src/acceleration_structures/gpu_octree.rs @@ -0,0 +1,945 @@ +use pasture_core::{ + containers::{attr1::AttributeIteratorByValue, PointBuffer, PointBufferExt, PerAttributePointBufferExt}, + layout::attributes, + math::{AABB, DynamicMortonIndex, Octant, MortonIndexNaming, MortonIndex64}, + nalgebra::{Point3, Vector3}, +}; +use priority_queue::DoublePriorityQueue; +use ordered_float::OrderedFloat; +use std::convert::TryInto; +use std::fmt; +use std::mem; +use wgpu::util::DeviceExt; + +#[derive(Debug, Clone)] +pub struct OctreeNode { + bounds: AABB, + children: Option>, + node_partitioning: [u32; 8], + points_per_partition: [u32; 8], + point_start: u32, + point_end: u32, +} + +pub struct GpuOctree{ + gpu_device: wgpu::Device, + gpu_queue: wgpu::Queue, + point_buffer: Vec>, + raw_points: Vec, + point_partitioning: Vec, + root_node: Option, + depth: u32, + bounds: AABB, + points_per_node: u32, + morton_code: DynamicMortonIndex, +} + +enum OctreeRelation { + In, + Out, + Partial, +} + +impl OctreeNode { + /// Get the number of bytes, a node allocates on the gpu. + /// Because the `children` pointer is not required for GPU node creation, + /// it's size is neglected. + fn size() -> usize { + let mut size = mem::size_of::(); + size -= mem::size_of::>>(); + size + } + /// Checks if the given node has less than or equal to `points_per_node` points. + /// If yes, the node is a leaf. + fn is_leaf(&self, points_per_node: u32) -> bool { + let diff: i64 = self.point_end as i64 - self.point_start as i64; + return diff <= points_per_node as i64; + } + fn is_empty(&self) -> bool { + self.point_start == self.point_end && self.points_per_partition[0] == 0 + } + /// Returns a vector of the nodes raw data. 
As with `size(), the field + /// `children`is not included, as it is not necessary for GPU computation. + fn into_raw(&self) -> Vec { + let mut raw_node: Vec = Vec::new(); + for coord in self.bounds.min().iter() { + raw_node.append(&mut coord.to_le_bytes().to_vec()); + } + for coord in self.bounds.max().iter() { + raw_node.append(&mut coord.to_le_bytes().to_vec()); + } + raw_node.append( + &mut self + .node_partitioning + .map(|x| x.to_le_bytes()) + .to_vec() + .into_iter() + .flatten() + .collect(), + ); + raw_node.append( + &mut self + .points_per_partition + .map(|x| x.to_le_bytes()) + .to_vec() + .into_iter() + .flatten() + .collect(), + ); + raw_node.append(&mut self.point_start.to_le_bytes().to_vec()); + raw_node.append(&mut self.point_end.to_le_bytes().to_vec()); + + raw_node + } + /// Converts a vector of raw data back into `OctreeNode`. + /// Panics, if the vector has not enough data. + fn from_raw(mut data: Vec) -> Self { + let raw_bounds: Vec = data.drain(..24).collect(); + let bounds_iter = raw_bounds.chunks_exact(8); + let bounds_min: Point3 = Point3 { + coords: Vector3::from_vec( + bounds_iter + .take(3) + .map(|b| f64::from_le_bytes(b.try_into().unwrap())) + .collect(), + ), + }; + let raw_bounds: Vec = data.drain(..24).collect(); + let bounds_iter = raw_bounds.chunks_exact(8); + let bounds_max: Point3 = Point3 { + coords: Vector3::from_vec( + bounds_iter + .take(3) + .map(|b| f64::from_le_bytes(b.try_into().unwrap())) + .collect(), + ), + }; + let mut rest_data: Vec = data + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + .collect(); + let mut rest_iter = rest_data.iter_mut(); + let mut node_partitioning = [0u32; 8]; + use std::sync::{mpsc::channel, Arc, Mutex}; + for i in 0..8 { + node_partitioning[i] = *rest_iter.next().unwrap(); + } + let mut points_per_partition = [0u32; 8]; + for i in 0..8 { + points_per_partition[i] = *rest_iter.next().unwrap(); + } + let points_start = *rest_iter.next().unwrap(); + + let points_end = 
*rest_iter.next().unwrap(); + + OctreeNode { + bounds: AABB::from_min_max(bounds_min, bounds_max), + children: None, + node_partitioning, + points_per_partition, + point_start: points_start, + point_end: points_end, + } + } + + /// Specifies the relation between the query point `pos` and the Bounding Box of the Node. + /// OctreeRelation::In ==> the whole node sits inside the radius of the query + /// OctreeRelation::Partial ==> node and query intersect or node contains whole query + /// OctreeRelation::Out ==> node and query are disjoint + fn relation_to_point(&self, pos: &Vector3, radius: f64) -> OctreeRelation { + let node_extent = self.bounds.extent(); + let node_center = self.bounds.center().coords; + let x_diff = (pos.x - node_center.x).abs(); + let y_diff = (pos.y - node_center.y).abs(); + let z_diff = (pos.z - node_center.z).abs(); + + // Point and radius outside of node + let max_diff = Vector3::new( + node_extent.x / 2. + radius, + node_extent.y / 2. + radius, + node_extent.z / 2. 
+ radius + ); + if x_diff >= max_diff.x || y_diff >= max_diff.y || z_diff >= max_diff.z { + return OctreeRelation::Out; + } + let radius_squared = radius * radius; + if x_diff <= node_extent.x || y_diff <= node_extent.y || z_diff <= node_extent.z { + let radius_squared = radius * radius; + let distance_squared = f64::powi(x_diff + node_extent.x * 0.5, 2) + f64::powi(y_diff + node_extent.y * 0.5, 2) + f64::powi(z_diff + node_extent.z * 0.5, 2); + // Whole Node lies within radius + if radius_squared >= distance_squared { + return OctreeRelation::In; + } + return OctreeRelation::Partial; + } + let distance_squared = f64::powi(x_diff - node_extent.x * 0.5, 2) + f64::powi(y_diff - node_extent.y * 0.5, 2) + f64::powi(z_diff - node_extent.z * 0.5, 2); + if radius_squared >= distance_squared{ + return OctreeRelation::Partial; + } + return OctreeRelation::Out; + } + +} + +impl fmt::Display for OctreeNode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "####### Octree Node #######\n"); + write!(f, "Bounds: {:?}\n", self.bounds); + write!(f, "Start: {}, End: {}\n", self.point_start, self.point_end); + write!(f, "Node Partitioning: {:?}\n", self.node_partitioning); + write!(f, "Points per partition: {:?}\n", self.points_per_partition); + write!(f, "Chilren: "); + if let Some(c) = &self.children { + c.iter().for_each(|x| { + write!(f, " {}", x); + }); + } else { + write!(f, "None\n"); + } + write!(f, "##########\n") + } +} + +impl GpuOctree { + /// Creates an empty Octree accelerated by the GPU. + /// + /// `point_buffer`: pasture buffer containing the point cloud data + /// + /// `max_bounds`: boundary of the point cloud + /// + /// `points_per_node`: threshold for a node becoming a leaf + /// + /// The generated instance has no constructed octree. To get the octree, + /// run `construct()`. 
+ pub async fn new( + point_buffer: & dyn PointBuffer, + max_bounds: AABB, + points_per_node: u32, + ) -> Result { + if points_per_node < 1 { + panic!("Cannot build octree with less than 1 point per node!") + } + let instance = wgpu::Instance::new(wgpu::Backends::VULKAN); + let adapter = instance + .request_adapter(&wgpu::RequestAdapterOptions { + power_preference: wgpu::PowerPreference::HighPerformance, + compatible_surface: None, + force_fallback_adapter: false, + }) + .await + .unwrap(); + let (device, queue) = adapter + .request_device( + &wgpu::DeviceDescriptor { + features: adapter.features(), + limits: adapter.limits(), + label: Some("Octree_Device"), + }, + None, + ) + .await?; + + + // Points are read from the Pasture PointBuffer to allow for faster access of individual points., + // Without this, onw would need to get each raw point individually and convert it + let mut points: Vec> = Vec::new(); + let point_iter = AttributeIteratorByValue::new(point_buffer, &attributes::POSITION_3D); + for point in point_iter { + points.push(point); + } + + // raw points read here, so that it must not be done while construction + let point_count = point_buffer.len(); + let mut raw_points = vec![0u8; 24 * point_count]; + point_buffer.get_raw_attribute_range( + 0..point_count, + &attributes::POSITION_3D, + raw_points.as_mut_slice(), + ); + let morton_code = DynamicMortonIndex::from_octants(&[]); + + Ok(GpuOctree { + gpu_device: device, + gpu_queue: queue, + point_buffer: points, + raw_points, + point_partitioning: (0..point_buffer.len() as u32).collect(), + root_node: None, + depth: 0, + bounds: max_bounds, + points_per_node, + morton_code + }) + } + + pub fn print_tree(&self) { + println!("Num Points: {}", self.point_buffer.len()); + println!("Tree Depth: {}", self.depth); + println!("{}", self.root_node.as_ref().unwrap()); + } + + /// Prints the morton index for given point inside the AABB of the octree + pub fn print_morton_code(&self, point: &Point3) { + if let 
Some(root) = self.root_node.as_ref() { + println!("{}", + MortonIndex64::from_point_in_bounds(&point, &root.bounds) + .to_string(MortonIndexNaming::AsOctantConcatenationWithRoot)); + } + else { + println!("Octree not constructed yet"); + } + } + + // } + /// Run top-down construction of the octree. + /// + /// Starting from the root, on each level the children of all current leaves + /// are computed and put into the next compute stage, if these children are big enough. + pub async fn construct(&mut self) { + let point_count = self.point_buffer.len(); + + let mut raw_points = &self.raw_points; + + let mut compiler = shaderc::Compiler::new().unwrap(); + let comp_shader = include_str!("shaders/generate_nodes.comp"); + let comp_spirv = compiler + .compile_into_spirv( + comp_shader, + shaderc::ShaderKind::Compute, + "ComputeShader", + "main", + None, + ) + .unwrap(); + + let comp_data = wgpu::util::make_spirv(comp_spirv.as_binary_u8()); + let shader = self + .gpu_device + .create_shader_module(&wgpu::ShaderModuleDescriptor { + label: Some("NodeGenerationShader"), + source: comp_data, + }); + + // 2 Bind groups are used + // - points_bind_group for point cloud data and point indices + // - nodes_bind_group for parent nodes and children nodes computed by GPU + let points_bind_group_layout = + self.gpu_device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + label: Some("PointBufferBindGroupLayout"), + }); + + let 
nodes_bind_group_layout = + self.gpu_device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("NodesBindGroupLayout"), + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }); + + let compute_pipeline_layout = + self.gpu_device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("ConstructionPipelineLayout"), + bind_group_layouts: &[&nodes_bind_group_layout, &points_bind_group_layout], + push_constant_ranges: &[], + }); + + let compute_pipeline = + self.gpu_device + .create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("ConstructionPipeline"), + layout: Some(&compute_pipeline_layout), + module: &shader, + entry_point: "main", + }); + + let gpu_point_buffer = + self.gpu_device + .create_buffer(&wgpu::BufferDescriptor { + label: Some("PointBuffer"), + size: (point_count * mem::size_of::>()) as u64, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, + }); + self.gpu_queue.write_buffer(&gpu_point_buffer, 0, self.raw_points.as_slice()); + let mut root_node = OctreeNode { + bounds: self.bounds, + children: None, + node_partitioning: [0; 8], + points_per_partition: [0; 8], + point_start: 0, + point_end: point_count as u32, + }; + root_node.node_partitioning[0] = point_count as u32; + root_node.points_per_partition[0] = point_count as u32; + + let mut tree_depth = 1; + let mut num_leaves: u32 = 0; + let mut num_nodes: u32 = 1; + + let mut current_nodes = vec![&mut 
root_node]; + + let index_range: Vec = (0u32..point_count as u32).map(u32::from).collect::>(); + let raw_indeces: &[u8] = bytemuck::cast_slice(index_range.as_slice()); + let point_index_buffer = + self.gpu_device + .create_buffer(&wgpu::BufferDescriptor { + label: Some("IndexBuffer"), + size: (point_count * mem::size_of::()) as u64, + usage: wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::MAP_READ + | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, + }); + self.gpu_queue.write_buffer(&point_index_buffer, 0, raw_indeces); + + let points_bind_group = self + .gpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("PointBufferBindGroup"), + layout: &points_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: gpu_point_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: point_index_buffer.as_entire_binding(), + }, + ], + }); + while !current_nodes.is_empty() { + let num_blocks = current_nodes.len(); + + // Nodes buffers are created inside the loop, as their size changes per iteration + // Staging Buffers do not reside on GPU and are used for reading Compure results + let child_buffer_size = 8 * (OctreeNode::size() * current_nodes.len()) as u64; + let child_nodes_buffer_staging = + self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("CPU_NewNodesBuffer"), + size: child_buffer_size, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + let child_nodes_buffer = self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NewNodesBuffer"), + size: + child_buffer_size, + usage: wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, + }); + + let mut parent_nodes_raw = Vec::new(); + for node in ¤t_nodes { + parent_nodes_raw.append(&mut node.into_raw()); + } + let parent_nodes_buffer_staging = + 
self.gpu_device.create_buffer(&wgpu::BufferDescriptor { + label: Some("CPU_ParentNodesBuffer"), + size: parent_nodes_raw.len() as u64, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + let parent_nodes_buffer = + self.gpu_device + .create_buffer(&wgpu::BufferDescriptor { + label: Some("ParentNodesBuffer"), + size: current_nodes.len() as u64 * OctreeNode::size() as u64, + usage: wgpu::BufferUsages::COPY_SRC + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, + }); + self.gpu_queue.write_buffer(&parent_nodes_buffer, 0, parent_nodes_raw.as_slice()); + + let nodes_bind_group = self + .gpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("NodesBindGroup"), + layout: &nodes_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: parent_nodes_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: child_nodes_buffer.as_entire_binding(), + }, + ], + }); + let mut encoder = + self.gpu_device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("CommandEncoder"), + }); + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("ConstructionComputePass"), + }); + compute_pass.set_pipeline(&compute_pipeline); + + compute_pass.set_bind_group(0, &nodes_bind_group, &[]); + compute_pass.set_bind_group(1, &points_bind_group, &[]); + + compute_pass.insert_debug_marker("Pasture Compute Debug"); + compute_pass.dispatch(num_blocks as u32, 1, 1); + } + // Copy computed Nodes into CPU staging buffers for mapped reading + encoder.copy_buffer_to_buffer( + &child_nodes_buffer, + 0, + &child_nodes_buffer_staging, + 0, + child_buffer_size, + ); + encoder.copy_buffer_to_buffer( + &parent_nodes_buffer, + 0, + &parent_nodes_buffer_staging, + 0, + parent_nodes_raw.len() as u64, + ); + + + self.gpu_queue.submit(Some(encoder.finish())); + + let parents_slice = 
parent_nodes_buffer_staging.slice(..); + let parents_future = parents_slice.map_async(wgpu::MapMode::Read); + let children_slice = child_nodes_buffer_staging.slice(..); + let children_future = children_slice.map_async(wgpu::MapMode::Read); + + self.gpu_device.poll(wgpu::Maintain::Wait); + if let Ok(()) = parents_future.await { + let mapped_nodes_data = parents_slice.get_mapped_range(); + let mapped_node_buffer = mapped_nodes_data.to_vec(); + let nodes: Vec = mapped_node_buffer + .chunks_exact(OctreeNode::size()) + .map(|b| OctreeNode::from_raw(b.to_vec())) + .collect(); + + if let Ok(()) = children_future.await { + let mapped_children_data = children_slice.get_mapped_range(); + let mapped_children_buffer = mapped_children_data.to_vec(); + let mut children: Vec = mapped_children_buffer + .chunks_exact(OctreeNode::size()) + .map(|b| OctreeNode::from_raw(b.to_vec())) + .collect(); + for mut node in nodes { + let children_sizes = node.points_per_partition.clone(); + + let local_children: Vec = children.drain(..8).collect(); + + let child_array: [OctreeNode; 8] = local_children.try_into().unwrap(); + node.children = Some(Box::new(child_array)); + + let node_ref = current_nodes.remove(0); + *node_ref = node; + + let children: &mut Box<[OctreeNode; 8]> = + node_ref.children.as_mut().unwrap(); + + let iter = children.iter_mut(); + + let mut child_index = 0; + + for child in iter { + if children_sizes[child_index] != 0 + && !child.is_leaf(self.points_per_node) + { + current_nodes.push(child); + } else { + num_leaves += 1; + } + + num_nodes += 1; + child_index += 1; + } + let morton_octants: Vec = vec![ + 0, 1, 2, 3, 4, 5, 6, 7 + ] + .iter() + .map(|&raw_octant| (raw_octant as u8).try_into().unwrap()) + .collect(); + + morton_octants.iter() + .for_each(|octant| self.morton_code.add_octant(*octant)); + } + drop(mapped_nodes_data); + parent_nodes_buffer_staging.unmap(); + drop(mapped_children_data); + child_nodes_buffer_staging.unmap(); + parent_nodes_buffer.destroy(); + 
child_nodes_buffer.destroy(); + parent_nodes_buffer_staging.destroy(); + child_nodes_buffer_staging.destroy(); + } + } + + tree_depth += 1; + } + // Point indices are read after compute loop to reduce data copying and runtime + let index_slice = point_index_buffer.slice(..); + let mapped_future = index_slice.map_async(wgpu::MapMode::Read); + + self.gpu_device.poll(wgpu::Maintain::Wait); + // Read in the changes of the global point partitioning + if let Ok(()) = mapped_future.await { + let mapped_index_buffer = index_slice.get_mapped_range(); + let index_vec = mapped_index_buffer.to_vec(); + let indices: Vec = index_vec + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + .collect(); + + self.point_partitioning = indices.clone(); + + drop(mapped_index_buffer); + point_index_buffer.unmap(); + } + gpu_point_buffer.destroy(); + point_index_buffer.destroy(); + self.root_node = Some(root_node); + self.depth = tree_depth; + } + + /// Returns a Vec containing the indices of all points belonging to `point` + fn get_points(&self, node: &OctreeNode) -> Vec { + let indices = + self.point_partitioning[node.point_start as usize..node.point_end as usize].to_vec(); + return indices; + } + + /// Computes the `k` nearest neighbors of `point` that are within `radius`. 
+ /// Returns a Vec containing the indices to the neighbor points in increasing order of distance + pub fn k_nearest_neighbors(&self, pos: Vector3, radius: f64, k: usize) -> Vec { + if k < 1 { + return vec![]; + } + // We use a Vec<&Octree> as working queue for the nodes that we are visitting + // We use a priority queue so that the found indices are already + // being sorted by their distance to pos + let node = self.root_node.as_ref().unwrap(); + let mut worklist = vec![node]; + let point_buffer = &self.point_buffer; + let mut points = DoublePriorityQueue::new(); + + if pos.x - radius > node.bounds.max().x || + pos.x + radius < node.bounds.min().x || + pos.y - radius > node.bounds.max().y || + pos.y + radius < node.bounds.min().y || + pos.z - radius > node.bounds.max().z || + pos.z + radius < node.bounds.min().z + { + return vec![]; + } + let radius_squared = radius * radius; + + while !worklist.is_empty() { + let node = worklist.pop().unwrap(); + // When node is leaf we need to search it's points + if node.is_leaf(self.points_per_node) { + let point_indices = self.get_points(node); + for i in point_indices.iter() { + let point = point_buffer[*i as usize]; + let curr_dist = point - pos; + let curr_dist = f64::powi(curr_dist.x, 2) + f64::powi(curr_dist.y, 2) + f64::powi(curr_dist.z, 2); + if curr_dist <= radius_squared{ + if points.len() >= k { + let (_, dist) = points.peek_max().unwrap(); + if *dist > OrderedFloat(curr_dist) { + points.pop_max(); + points.push(i.clone(), OrderedFloat(curr_dist)); + } + } + else { + points.push(i.clone(), OrderedFloat(curr_dist)); + } + } + } + let point_indices: Vec = self.get_points(node); + } + else { + // When node is not leaf, we must check if the Bounding Box of the node is in range of the query + // If so, we check if the whole node is inside the radius to possibly reduce step downs + // If node and query only intersect, children of the node are inspected + match node.relation_to_point(&pos, radius) { + OctreeRelation::In 
=> { + let point_indices = self.get_points(node); + for i in point_indices.iter() { + let point = point_buffer[*i as usize]; + let curr_dist = point - pos; + let curr_dist = f64::powi(curr_dist.x, 2) + f64::powi(curr_dist.y, 2) + f64::powi(curr_dist.z, 2); + if curr_dist <= radius_squared { + if points.len() >= k { + let (_, dist) = points.peek_max().unwrap(); + if *dist > OrderedFloat(curr_dist) { + points.pop_max(); + points.push(i.clone(), OrderedFloat(curr_dist)); + } + } + else { + points.push(i.clone(), OrderedFloat(curr_dist)); + } + } + } + } + OctreeRelation::Partial => { + if let Some(children) = node.children.as_ref() { + children.iter().for_each(|c| { + if !c.is_empty(){ + worklist.push(c); + } + }); + } + } + OctreeRelation::Out => {} + }; + } + } + if points.is_empty() { + return vec![]; + } + let mut nearest = points.into_ascending_sorted_vec(); + nearest.truncate(k); + + return nearest; + + } + +} + +#[cfg(test)] +mod tests { + use crate::acceleration_structures::GpuOctree; + use crate::acceleration_structures::gpu_octree::OctreeNode; + use pasture_core::containers::InterleavedVecPointStorage; + use pasture_core::containers::PointBufferExt; + use pasture_core::layout::PointType; + use pasture_core::nalgebra::Vector3; + use pasture_io::base::PointReader; + use pasture_io::las::LASReader; + use pasture_io::las::LasPointFormat0; + + use tokio; + + static FILE: &'static str = //"/home/jnoice/Downloads/WSV_Pointcloud_Tile-3-1.laz" + //"/home/jnoice/Downloads/interesting.las" + //"/home/jnoice/Downloads/45123H3316.laz" + //"/home/jnoice/Downloads/OR_Camp_Creek_OLC_2008_000001.laz" + "/home/jnoice/Downloads/portland.laz" + ; + #[tokio::test] + async fn check_correct_bounds() { + let reader = LASReader::from_path(FILE); + let mut reader = match reader { + Ok(a) => a, + Err(_) => panic!("Could not create LAS Reader"), + }; + let count = reader.remaining_points(); + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, 
LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(_) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); + + let octree = GpuOctree::new(&buffer, bounds, 75).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let node = octree.root_node.as_ref().unwrap(); + let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; + while !nodes_to_visit.is_empty() { + let current_node = nodes_to_visit.remove(0); + println!("Partition {:?}", current_node.node_partitioning); + assert!((current_node.point_start == 0 && + current_node.point_end == 0 && + current_node.node_partitioning == [0; 8]) || + current_node.node_partitioning != [0; 8]); + let current_bounds = current_node.bounds; + let point_ids = octree.get_points(¤t_node).into_iter(); + for id in point_ids { + let point = buffer.get_point::(id as usize); + let pos: Vector3 = Vector3::from(point.position); + + assert!( + current_bounds.min().x <= pos.x + && current_bounds.max().x >= pos.x + && current_bounds.min().y <= pos.y + && current_bounds.max().y >= pos.y + && current_bounds.min().z <= pos.z + && current_bounds.max().z >= pos.z + ); + + } + if let Some(children) = current_node.children.as_ref() { + (*children).iter().for_each(|x| nodes_to_visit.push(x)); + } + } + } + + #[tokio::test] + async fn check_point_count() { + let reader = LASReader::from_path(FILE); + let mut reader = match reader { + Ok(a) => a, + Err(_) => panic!("Could not create LAS Reader"), + }; + let count = reader.remaining_points(); + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(_) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); 
+ + let octree = GpuOctree::new(&buffer, bounds, 50).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let node = octree.root_node.as_ref().unwrap(); + let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; + let mut point_count: usize = 0; + while !nodes_to_visit.is_empty() { + let current_node = nodes_to_visit.pop().unwrap(); + if let None = current_node.children { + println!("{}", current_node); + point_count += current_node.points_per_partition[0] as usize; + } else { + let children = current_node.children.as_ref().unwrap(); + (*children).iter().for_each(|x| nodes_to_visit.push(x)); + } + } + println!( + "Point count of octree: {}, Point Count of Buffer {}", + point_count, count + ); + assert!(point_count == count); + } + #[tokio::test] + async fn check_point_partitioning_duplicates() { + let reader = LASReader::from_path(FILE); + let mut reader = match reader { + Ok(a) => a, + Err(_) => panic!("Could not create LAS Reader"), + }; + let count = reader.remaining_points(); + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(_) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); + + let octree = GpuOctree::new(&buffer, bounds, 50).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let mut indices = octree.point_partitioning.clone(); + indices.sort(); + indices.dedup(); + assert!(indices.len() == octree.point_partitioning.len()); + } + #[tokio::test] + async fn check_node_overflows() { + let reader = LASReader::from_path(FILE); + let mut reader = match reader { + Ok(a) => a, + Err(_) => panic!("Could not create LAS Reader"), + 
}; + let count = reader.remaining_points(); + let mut buffer = + InterleavedVecPointStorage::with_capacity(count, LasPointFormat0::layout()); + let _data_read = match reader.read_into(&mut buffer, count) { + Ok(a) => a, + Err(_) => panic!("Could not write Point Buffer"), + }; + let bounds = reader.get_metadata().bounds().unwrap(); + + let octree = GpuOctree::new(&buffer, bounds, 50).await; + let mut octree = match octree { + Ok(a) => a, + Err(b) => { + println!("{:?}", b); + panic!("Could not create GPU Device for Octree") + } + }; + octree.construct().await; + let node = octree.root_node.as_ref().unwrap(); + let mut nodes_to_visit: Vec<&OctreeNode> = vec![node]; + while !nodes_to_visit.is_empty() { + let current_node = nodes_to_visit.pop().unwrap(); + assert!(current_node.point_start <= current_node.point_end); + if let Some(children) = ¤t_node.children { + (*children).iter().for_each(|x| nodes_to_visit.push(x)); + } + } + } + +} diff --git a/pasture-algorithms/src/acceleration_structures/mod.rs b/pasture-algorithms/src/acceleration_structures/mod.rs new file mode 100644 index 0000000..6181ef8 --- /dev/null +++ b/pasture-algorithms/src/acceleration_structures/mod.rs @@ -0,0 +1,2 @@ +mod gpu_octree; +pub use gpu_octree::GpuOctree; diff --git a/pasture-algorithms/src/acceleration_structures/shaders/generate_nodes.comp b/pasture-algorithms/src/acceleration_structures/shaders/generate_nodes.comp new file mode 100644 index 0000000..0c05b3c --- /dev/null +++ b/pasture-algorithms/src/acceleration_structures/shaders/generate_nodes.comp @@ -0,0 +1,196 @@ +/* +Compute shader for processing octree nodes. +This approach is based on the work shown in "GPU-based Adaptive Octree Construction Algorithms" by (Goradia, 2008), +Global buffer indeces[] maps an index to a point of the point cloud. +This is used, so that we do not need to swap any real point inside the Point buffer. 
+We use the indices to later address the correct points of a Pasture PointBuffer
+
+Each node holds an array called node_partitioning, that specifies the range of indices a node contains.
+node_partitioning: [4,9,12,12,20,22,26,30] points_per_partition: [4,5,3,0,8,2,4,4]
+
+Right now, each parent in parents[] node has its own WorkGroup consisting of 8 threads, each thread assigning the computed
+values to the corresponding child node in children[]. That means each block of 8 Nodes in children[] belongs to one Node in parents[].
+
+*/
+
+#version 450
+
+struct Node {
+    double bounds_min[3];
+    double bounds_max[3];
+    uint node_partitioning[8];
+    uint points_per_partition[8];
+    uint points_start;
+    uint points_end;
+};
+
+layout(std430, set=0, binding=0) buffer ParentNodes{
+    Node parents[];
+};
+layout(std430, set=0, binding=1) buffer ChildNodes{
+    Node children[];
+};
+layout(std430, set=1, binding=0) buffer PointBuffer {
+    double points[][3];
+};
+layout(std430, set=1, binding=1) buffer Partitioning {
+    uint indeces[];
+};
+
+layout (local_size_x=8, local_size_y=1, local_size_z=1) in;
+
+/// Determines the order in which the node is divided on its axes
+/// [0,1,2] means partition order is x,y,z
+uint[3] partitioning_order(double x, double y, double z){
+    uint[] order = uint[3](0,1,2);
+    double[] axes = double[3](x,y,z);
+    for(uint i = 0; i < 2; ++i){
+        if(axes[i] < axes[i+1]){
+            uint tmp_order = order[i];
+            double tmp_axis = axes[i];
+            order[i] = order[i+1];
+            axes[i] = axes[i+1];
+            order[i+1] = tmp_order;
+            axes[i+1] = tmp_axis;
+        }
+    }
+    if(axes[0] < axes[1]){
+        uint tmp = order[0];
+        order[0] = order[1];
+        order[1] = tmp;
+    }
+    return order;
+}
+/// Swaps the indices at place a and b
+void swap( uint a, uint b){
+    uint tmp = indeces[a];
+    indeces[a] = indeces[b];
+    indeces[b] = tmp;
+}
+
+/// Partitions the indeces buffer along given axis with given pivot
+/// When finished all points in [start, j) are less than pivot and all
+/// points in [j, end) are greater 
than or equal to pivot
+uint partition_run(uint start, uint end, double pivot, uint axis) {
+    uint i = start;
+    uint j = start;
+
+    while(i < end) {
+        if(points[indeces[i]][axis] > pivot){
+            ++i;
+        }
+        else if(points[indeces[i]][axis] <= pivot){
+            swap(i, j);
+            ++i;
+            ++j;
+        }
+    }
+
+    return j;
+}
+/// Starts the different partitioning phases for the axes
+/// At the end half of node_partitioning (0-3) are below pivots[0] on axes[0],
+/// (0-1) are below pivots[1] on axes[1] and (2-3) are above pivots[1] on axes[1]
+/// (0) and (2) are below pivots[2] on axis[2], (1) and (3) are above pivots[2] on axis[2].
+/// Same happens on subpartition (4-7) of node_partitioning
+void partitioning(uint[3] axes, double[3] pivots, uint id, uint local_thread_id){
+    uint start = parents[id].points_start;
+    uint end = parents[id].node_partitioning[0];
+    if(local_thread_id == 0){
+        parents[id].node_partitioning[3] = partition_run(start, end, pivots[axes[0]], axes[0]);
+    }
+    barrier();
+    if(local_thread_id == 1) {
+        parents[id].node_partitioning[1] = partition_run(start, parents[id].node_partitioning[3], pivots[axes[1]], axes[1]);
+    }
+    else if(local_thread_id == 2){
+        parents[id].node_partitioning[5] = partition_run(parents[id].node_partitioning[3], end, pivots[axes[1]], axes[1]);
+    }
+    barrier();
+    if(local_thread_id == 3){
+        parents[id].node_partitioning[0] = partition_run(start, parents[id].node_partitioning[1], pivots[axes[2]], axes[2]);
+    }
+    else if(local_thread_id == 4){
+        parents[id].node_partitioning[2] = partition_run(parents[id].node_partitioning[1], parents[id].node_partitioning[3], pivots[axes[2]], axes[2]);
+    }
+    else if(local_thread_id == 5){
+        parents[id].node_partitioning[4] = partition_run(parents[id].node_partitioning[3], parents[id].node_partitioning[5], pivots[axes[2]], axes[2]);
+    }
+    else if(local_thread_id == 6){
+        parents[id].node_partitioning[6] = partition_run(parents[id].node_partitioning[5], end, pivots[axes[2]], axes[2]);
+    }
+    else if(local_thread_id == 7){
+        
parents[id].node_partitioning[7] = end; + } + barrier(); +} + +/// Returns if child at `index` in parent is partitioned on the right side of the pivot on any axis +bool[3] partitioned_to_right(uint[3] partition_order, uint index){ + bool[3] on_right_side = bool[3](false, false, false); + + on_right_side[partition_order[2]] = index % 2 != 0; + on_right_side[partition_order[1]] = index >= 2 && index <= 3 || index >= 6; + on_right_side[partition_order[0]] = index >= 4; + return on_right_side; +} +/// Calculates the Bounding Box of given child +double[2][3] get_childs_bounds(uint[3] partition_order, double[3] partition_axes, uint child_index, Node parent){ + double[3] bounds_min; + double[3] bounds_max; + if(child_index == 0) { + bounds_min = parent.bounds_min; + bounds_max = partition_axes; + return double[2][3](bounds_min, bounds_max); + } + bool[3] on_right_side = partitioned_to_right(partition_order, child_index); + for(uint k = 0; k < 3; ++k){ + bounds_min[k] = on_right_side[k] ? partition_axes[k] : parent.bounds_min[k]; + bounds_max[k] = on_right_side[k] ? 
parent.bounds_max[k] : partition_axes[k]; + } + return double[2][3](bounds_min, bounds_max); +} + +void main() { + uint idx = gl_WorkGroupID.x; + uint thread_idx = gl_LocalInvocationID.x; + Node parent = parents[idx]; + // Calculation of axis-length and center point coorddinates + // of parent's bounding box + double x_diff = parent.bounds_max[0] - parent.bounds_min[0]; + double y_diff = parent.bounds_max[1] - parent.bounds_min[1]; + double z_diff = parent.bounds_max[2] - parent.bounds_min[2]; + double x_partition = parent.bounds_min[0] + 0.5 * abs(x_diff); + double y_partition = parent.bounds_min[1] + 0.5 * abs(y_diff); + double z_partition = parent.bounds_min[2] + 0.5 * abs(z_diff); + double[3] partition_pivots = double[3](x_partition, y_partition, z_partition); + uint[3] partition_order = partitioning_order(abs(x_diff), abs(y_diff), abs(z_diff)); + + partitioning(partition_order, partition_pivots, idx, thread_idx); + + if(thread_idx == 0) { + parents[idx].points_per_partition[thread_idx] = parents[idx].node_partitioning[thread_idx] - parents[idx].points_start; + } + else { + parents[idx].points_per_partition[thread_idx] = parents[idx].node_partitioning[thread_idx] - parents[idx].node_partitioning[thread_idx - 1]; + } + // writes computed data into children + // based on the number of points belonging to a child, we can + // decide if the child is a leaf or should be processed as parent in next compute pass + if(thread_idx == 0) { + children[idx * 8 + thread_idx].points_start = parents[idx].points_start; + } else { + children[idx * 8 + thread_idx].points_start = parents[idx].node_partitioning[thread_idx - 1]; + } + children[idx * 8 + thread_idx].points_per_partition[0] = parents[idx].points_per_partition[thread_idx]; + children[idx * 8 + thread_idx].points_end = parents[idx].node_partitioning[thread_idx]; + children[idx * 8 + thread_idx].node_partitioning[0] = children[idx * 8 + thread_idx].points_end; + double[2][3] child_bounds = get_childs_bounds( + 
partition_order, + double[3](x_partition, y_partition, z_partition), + thread_idx, + parent + ); + children[idx * 8 + thread_idx].bounds_min = child_bounds[0]; + children[idx* 8 + thread_idx].bounds_max = child_bounds[1]; +} diff --git a/pasture-algorithms/src/lib.rs b/pasture-algorithms/src/lib.rs index 00fa392..3e17ba4 100644 --- a/pasture-algorithms/src/lib.rs +++ b/pasture-algorithms/src/lib.rs @@ -4,6 +4,10 @@ //! Pasture contains algorithms that can manipulate the point cloud data or //! calculate results based on them. +// Data structures used for accelerating operations on point clouds +#[cfg(feature = "gpu")] +pub mod acceleration_structures; + // Algorithm to calculate the bounding box of a point cloud. pub mod bounds; // Get the minimum and maximum value of a specific attribute in a point cloud. diff --git a/pasture-core/Cargo.toml b/pasture-core/Cargo.toml index 2e56547..4ee10ab 100644 --- a/pasture-core/Cargo.toml +++ b/pasture-core/Cargo.toml @@ -24,7 +24,7 @@ itertools = "0.10.0" byteorder = "1.4.2" # GPU related -wgpu = { version = "0.11.0", features = ["spirv"], optional = true } +wgpu = { version = "0.12.0", features = ["spirv"], optional = true } shaderc = { version = "0.7.2", optional = true } futures = { version = "0.3", optional = true } bytemuck = { version = "1.5.1", optional = true } diff --git a/pasture-core/src/gpu/device.rs b/pasture-core/src/gpu/device.rs index 43ab74c..92b9ffd 100644 --- a/pasture-core/src/gpu/device.rs +++ b/pasture-core/src/gpu/device.rs @@ -1,7 +1,7 @@ use crate::layout; -use wgpu::util::DeviceExt; use std::collections::BTreeMap; use std::ops::BitOr; +use wgpu::util::DeviceExt; /// The base structure used to get access to the GPU. In addition it handles things like /// shader compilation and the actual dispatch of work to the GPU. 
@@ -66,18 +66,19 @@ impl<'a> Device<'a> {
     /// };
     /// });
     /// ```
-    pub async fn new(device_options: DeviceOptions) -> Result<Device<'a>, wgpu::RequestDeviceError> {
+    pub async fn new(
+        device_options: DeviceOptions,
+    ) -> Result<Device<'a>, wgpu::RequestDeviceError> {
         // == Create an instance from the desired backend =========================================
         let backend_bits = match device_options.device_backend {
             // DeviceBackend::Primary => { wgpu::Backends::PRIMARY }
             // DeviceBackend::Secondary => { wgpu::Backends::SECONDARY }
-            DeviceBackend::Vulkan => { wgpu::Backends::VULKAN }
-            // DeviceBackend::Metal => { wgpu::Backends::METAL }
-            // DeviceBackend::Dx12 => { wgpu::Backends::DX12 }
-            // DeviceBackend::Dx11 => { wgpu::Backends::DX11 }
-            // DeviceBackend::OpenGL => { wgpu::Backends::GL }
-            // DeviceBackend::Browser => { wgpu::Backends::BROWSER_WEBGPU }
+            DeviceBackend::Vulkan => wgpu::Backends::VULKAN, // DeviceBackend::Metal => { wgpu::Backends::METAL }
+            // DeviceBackend::Dx12 => { wgpu::Backends::DX12 }
+            // DeviceBackend::Dx11 => { wgpu::Backends::DX11 }
+            // DeviceBackend::OpenGL => { wgpu::Backends::GL }
+            // DeviceBackend::Browser => { wgpu::Backends::BROWSER_WEBGPU }
         };
 
         let instance = wgpu::Instance::new(backend_bits);
@@ -92,13 +93,13 @@ impl<'a> Device<'a> {
         // The adapter gives us a handle to the actual device.
         // We can query some GPU information, such as the device name, its type (discrete vs integrated)
         // or the backend that is being used. 
- let adapter = instance.request_adapter( - &wgpu::RequestAdapterOptions { + let adapter = instance + .request_adapter(&wgpu::RequestAdapterOptions { power_preference: power_pref, compatible_surface: None, force_fallback_adapter: false, - } - ).await; + }) + .await; let adapter = match adapter { Some(a) => a, @@ -107,12 +108,17 @@ impl<'a> Device<'a> { // == Create a device and a queue from the given adapter ================================== - if !adapter.features().contains(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS) { + if !adapter + .features() + .contains(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS) + { return Result::Err(wgpu::RequestDeviceError); } let features = match device_options.use_adapter_features { - true => adapter.features().bitor(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS), + true => adapter + .features() + .bitor(wgpu::Features::MAPPABLE_PRIMARY_BUFFERS), false => wgpu::Features::MAPPABLE_PRIMARY_BUFFERS, }; @@ -121,14 +127,16 @@ impl<'a> Device<'a> { false => wgpu::Limits::default(), }; - let (wgpu_device, wgpu_queue) = adapter.request_device( - &wgpu::DeviceDescriptor { - label: Some("wgpu_device_and_queue"), - features, - limits, - }, - None, - ).await?; + let (wgpu_device, wgpu_queue) = adapter + .request_device( + &wgpu::DeviceDescriptor { + label: Some("wgpu_device_and_queue"), + features, + limits, + }, + None, + ) + .await?; // == Other fields ========================================================================= @@ -161,12 +169,18 @@ impl<'a> Device<'a> { /// Displays the features that the physical GPU is able to support. pub fn print_adapter_features(&self) { - println!("Features supported by the adapter: {:?}", self.adapter.features()); + println!( + "Features supported by the adapter: {:?}", + self.adapter.features() + ); } /// Displays the features that are currently active. 
pub fn print_active_features(&self) { - println!("Currently active features: {:?}", self.wgpu_device.features()); + println!( + "Currently active features: {:?}", + self.wgpu_device.features() + ); } /// Displays the default limits that are likely supported by all devices. @@ -176,7 +190,10 @@ impl<'a> Device<'a> { /// Displays the best limits the physical GPU can support. pub fn print_adapter_limits(&self) { - println!("\"Best\" limits supported by the adapter: {:?}", self.adapter.limits()); + println!( + "\"Best\" limits supported by the adapter: {:?}", + self.adapter.limits() + ); } /// Displays the limits that are currently active. @@ -191,46 +208,46 @@ impl<'a> Device<'a> { /// * `uniform_as_bytes` - the uniform's content as bytes. Make sure it's correctly aligned /// according to the `std140` layout rules. /// * `binding` - the binding at which the uniform buffer object is set in the shader. - pub fn create_uniform_bind_group(&self, uniform_as_bytes: &[u8], binding: u32) -> (wgpu::BindGroupLayout, wgpu::BindGroup) { + pub fn create_uniform_bind_group( + &self, + uniform_as_bytes: &[u8], + binding: u32, + ) -> (wgpu::BindGroupLayout, wgpu::BindGroup) { // TODO: separate buffer from bind group -> should probably become part of Device state - let uniform_buffer = self.wgpu_device.create_buffer_init( - &wgpu::util::BufferInitDescriptor { - label: Some("uniform_buffer"), - contents: uniform_as_bytes, - usage: wgpu::BufferUsages::UNIFORM, - } - ); - - let uniform_bind_group_layout = self.wgpu_device.create_bind_group_layout( - &wgpu::BindGroupLayoutDescriptor { - label: Some("uniform_bind_group_layout"), - entries: &[ - wgpu::BindGroupLayoutEntry { + let uniform_buffer = + self.wgpu_device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("uniform_buffer"), + contents: uniform_as_bytes, + usage: wgpu::BufferUsages::UNIFORM, + }); + + let uniform_bind_group_layout = + self.wgpu_device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor 
{ + label: Some("uniform_bind_group_layout"), + entries: &[wgpu::BindGroupLayoutEntry { binding, visibility: wgpu::ShaderStages::COMPUTE, ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Uniform, has_dynamic_offset: false, - min_binding_size: None + min_binding_size: None, }, - count: None - } - ], - } - ); + count: None, + }], + }); - let uniform_bind_group = self.wgpu_device.create_bind_group( - &wgpu::BindGroupDescriptor { + let uniform_bind_group = self + .wgpu_device + .create_bind_group(&wgpu::BindGroupDescriptor { label: Some("uniform_bind_group"), layout: &uniform_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding, - resource: uniform_buffer.as_entire_binding(), - }, - ], - } - ); + entries: &[wgpu::BindGroupEntry { + binding, + resource: uniform_buffer.as_entire_binding(), + }], + }); (uniform_bind_group_layout, uniform_bind_group) } @@ -238,7 +255,12 @@ impl<'a> Device<'a> { /// Associate a bind group and its layout with a given set on the shader side. /// Eg. if on the shader we have a buffer with `layout(std430, set=2, binding=0)`, /// then the passed in `index` should equal 2. 
-    pub fn set_bind_group(&mut self, index: u32, bind_group_layout: &'a wgpu::BindGroupLayout, bind_group: &'a wgpu::BindGroup) {
+    pub fn set_bind_group(
+        &mut self,
+        index: u32,
+        bind_group_layout: &'a wgpu::BindGroupLayout,
+        bind_group: &'a wgpu::BindGroup,
+    ) {
         let bind_group_pair = BindGroupPair {
             bind_group_layout,
             bind_group,
@@ -253,7 +275,7 @@ impl<'a> Device<'a> {
             &wgpu::ShaderModuleDescriptor {
                 label: Some("wgsl_computer_shader_module"),
                 source: wgpu::ShaderSource::Wgsl(wgsl_compute_shader_src.into()),
-            }
+            },
         ));
 
         let pipeline = self.create_compute_pipeline(self.cs_module.as_ref().unwrap());
@@ -270,7 +292,10 @@ impl<'a> Device<'a> {
         self.compute_pipeline = Some(pipeline);
     }
 
-    fn compile_glsl_and_create_compute_module(&self, compute_shader_src: &str) -> Option<wgpu::ShaderModule> {
+    fn compile_glsl_and_create_compute_module(
+        &self,
+        compute_shader_src: &str,
+    ) -> Option<wgpu::ShaderModule> {
         // WebGPU wants its shaders pre-compiled in binary SPIR-V format.
         // So we'll take the source code of our compute shader and compile it
         // with the help of the shaderc crate.
@@ -289,35 +314,37 @@ impl<'a> Device<'a> {
         // Now with the binary data we can create and return our ShaderModule,
         // which will be executed on the GPU within our compute pipeline. 
Some( - self.wgpu_device.create_shader_module(&wgpu::ShaderModuleDescriptor { - label: Some("glsl_compute_shader_module"), - source: cs_data, - }) + self.wgpu_device + .create_shader_module(&wgpu::ShaderModuleDescriptor { + label: Some("glsl_compute_shader_module"), + source: cs_data, + }), ) } fn create_compute_pipeline(&self, cs_module: &wgpu::ShaderModule) -> wgpu::ComputePipeline { - let layouts = self.bind_group_data + let layouts = self + .bind_group_data .values() .map(|pair| pair.bind_group_layout) .collect::>(); - let compute_pipeline_layout = self.wgpu_device.create_pipeline_layout( - &wgpu::PipelineLayoutDescriptor { - label: Some("compute_pipeline_layout"), - bind_group_layouts: layouts.as_slice(), - push_constant_ranges: &[], - } - ); - - let compute_pipeline = self.wgpu_device.create_compute_pipeline( - &wgpu::ComputePipelineDescriptor { - label: Some("compute_pipeline"), - layout: Some(&compute_pipeline_layout), - module: &cs_module, - entry_point: "main", - } - ); + let compute_pipeline_layout = + self.wgpu_device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("compute_pipeline_layout"), + bind_group_layouts: layouts.as_slice(), + push_constant_ranges: &[], + }); + + let compute_pipeline = + self.wgpu_device + .create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("compute_pipeline"), + layout: Some(&compute_pipeline_layout), + module: &cs_module, + entry_point: "main", + }); compute_pipeline } @@ -333,15 +360,16 @@ impl<'a> Device<'a> { // The resulting CommandBuffer can then be submitted to the GPU via a Queue. // Signal the end of the batch with CommandEncoder#finish(). let mut encoder = - self.wgpu_device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: Some("command_encoder") }); + self.wgpu_device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("command_encoder"), + }); { // The compute pass will start ("dispatch") our compute shader. 
- let mut compute_pass = encoder.begin_compute_pass( - &wgpu::ComputePassDescriptor { - label: Some("compute_pass") - } - ); + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("compute_pass"), + }); compute_pass.set_pipeline(self.compute_pipeline.as_ref().unwrap()); for (i, bind_group_pair) in self.bind_group_data.values().enumerate() { @@ -390,7 +418,9 @@ pub enum DevicePower { impl Default for DevicePower { /// Default is [DevicePower::Low] - fn default() -> Self { Self::Low } + fn default() -> Self { + Self::Low + } } /// Currently only `Vulkan` is supported, because it is the only backend that allows 64-bit floats @@ -411,7 +441,9 @@ pub enum DeviceBackend { impl Default for DeviceBackend { /// Default is `Vulkan` - fn default() -> Self { Self::Vulkan } + fn default() -> Self { + Self::Vulkan + } } // TODO: consider usage (readonly vs read/write, shader stages, ...), size, mapped_at_creation, etc. diff --git a/pasture-tools/Cargo.toml b/pasture-tools/Cargo.toml index c399455..a524129 100644 --- a/pasture-tools/Cargo.toml +++ b/pasture-tools/Cargo.toml @@ -22,7 +22,6 @@ log = "0.4" pretty_env_logger = "0.4.0" plotters = "^0.3.0" rand = {version = "0.8.3", features = ["small_rng"] } - [[bin]] name = "reorder_laz_chunks" @@ -30,4 +29,4 @@ name = "reorder_laz_chunks" name = "plotting" [[bin]] -name = "info" \ No newline at end of file +name = "info" diff --git a/pasture-tools/src/lib.rs b/pasture-tools/src/lib.rs new file mode 100644 index 0000000..ae7ea54 --- /dev/null +++ b/pasture-tools/src/lib.rs @@ -0,0 +1,3 @@ +extern crate self as pasture_tools; +#[cfg(feature = "gpu")] +pub mod acceleration_structures;