From b26043b0e6b42ca23e6fbaad93f0368dfe0afa44 Mon Sep 17 00:00:00 2001
From: haslersn <sebastian.hasler@gmx.net>
Date: Fri, 6 Sep 2019 15:23:38 +0200
Subject: [PATCH] common: Rework `Chunk` and `Chonk` implementation

Previously, voxels in sparsely populated chunks were stored in a `HashMap`.
However, in practice a voxel access is often followed by accesses to nearby
voxels. A cache-friendly memory layout can exploit this locality, but a
`HashMap` cannot provide one.

The previous merge request [!469](https://gitlab.com/veloren/veloren/merge_requests/469)
proposed to store voxels in Morton order (see https://en.wikipedia.org/wiki/Z-order_curve ).
This provided excellent cache friendliness. However, benchmarks showed that
the required index calculations are quite expensive. The results measured
on my _Intel(R) Core(TM) i7-7500U CPU @ 2.70 GHz_ were:

| Benchmark                                | Before this commit @ d322384becac | Morton Order @ ec8a7caf42ba | This commit          |
| ---------------------------------------- | --------------------------------- | --------------------------- | -------------------- |
| `full read` (81920 voxels)               | 17.7ns per voxel                  | 8.9ns per voxel             | **3.6ns** per voxel  |
| `constrained read` (4913 voxels)         | 67.0ns per voxel                  | 40.1ns per voxel            | **14.1ns** per voxel |
| `local read` (125 voxels)                | 17.5ns per voxel                  | 14.7ns per voxel            | **3.8ns** per voxel  |
| `X-direction read` (17 voxels)           | 17.8ns per voxel                  | 25.9ns per voxel            | **4.2ns** per voxel  |
| `Y-direction read` (17 voxels)           | 18.4ns per voxel                  | 33.3ns per voxel            | **4.5ns** per voxel  |
| `Z-direction read` (17 voxels)           | 18.6ns per voxel                  | 38.2ns per voxel            | **5.4ns** per voxel  |
| `long Z-direction read` (65 voxels)      | 18.0ns per voxel                  | 37.7ns per voxel            | **5.1ns** per voxel  |
| `full write (dense)` (81920 voxels)      | 17.9ns per voxel                  | **10.3ns** per voxel        | 12.4ns per voxel     |

This commit (instead of utilizing morton order) replaces `HashMap` in the
`Chunk` implementation by the following data structure:

The volume is spatially subdivided into groups of voxels. For example, a
`Chunk` of total size `32*32*16` uses groups of `4*4*4` voxels, so there are
`8*8*4 = 256` groups. (These numbers are generic in the actual code: the
group size is derived from the total size of the `Chunk` such that there
are always `256` groups, unless the `Chunk` has fewer than `256` voxels.)

There's a single vector `self.vox` which consecutively stores these groups.
Each group might or might not be contained in `self.vox`. A group that is
not contained represents that the full group consists only of `self.default`
voxels. This saves a lot of memory because oftentimes a `Chunk` consists of
either a lot of air or a lot of stone.

To track whether a group is contained in `self.vox`, there's an index buffer
`self.indices : [u8; 256]`. It contains for each group

* (a) its insertion order, i.e. the index of the group's slot within
    `self.vox`, if the group is contained in `self.vox` or
* (b) 255, otherwise. That case represents that the whole group consists
    only of `self.default` voxels.

(Note that 255 can also occur in case (a), but only once all 256 groups are
stored in `self.vox`. Then no group falls under case (b) any more, so there
is no ambiguity.)

Rationale:

The index buffer should be small because:

* Small size increases the probability that it will always be in cache.
* The index buffer is allocated for every `Chunk` and an almost empty `Chunk`
    shall not consume too much memory.

Having exactly 256 groups is particularly convenient because the index
buffer can then consist of `u8`s. This keeps the index buffer at 256 bytes,
i.e. 4 cache lines (assuming 64-byte cache lines).
---
 client/src/lib.rs           |   6 +-
 common/src/terrain/chonk.rs | 372 +++++++++++++++++------------------
 common/src/terrain/mod.rs   |   2 +-
 common/src/volumes/chunk.rs | 378 +++++++++++++++++++++++++++++-------
 4 files changed, 500 insertions(+), 258 deletions(-)

diff --git a/client/src/lib.rs b/client/src/lib.rs
index 4b7550585c..fb1aeb03ed 100644
--- a/client/src/lib.rs
+++ b/client/src/lib.rs
@@ -12,12 +12,12 @@ use common::{
     msg::{ClientMsg, ClientState, RequestStateError, ServerError, ServerInfo, ServerMsg},
     net::PostBox,
     state::{State, Uid},
-    terrain::{block::Block, chonk::ChonkMetrics, TerrainChunk, TerrainChunkSize},
+    terrain::{block::Block, TerrainChunk, TerrainChunkSize},
     vol::RectVolSize,
     ChatType,
 };
 use hashbrown::HashMap;
-use log::{info, log_enabled, warn};
+use log::warn;
 use std::{
     net::SocketAddr,
     sync::Arc,
@@ -398,6 +398,7 @@ impl Client {
             }
         }
 
+        /*
         // Output debug metrics
         if log_enabled!(log::Level::Info) && self.tick % 600 == 0 {
             let metrics = self
@@ -407,6 +408,7 @@ impl Client {
                 .fold(ChonkMetrics::default(), |a, (_, c)| a + c.get_metrics());
             info!("{:?}", metrics);
         }
+        */
 
         // 7) Finish the tick, pass control back to the frontend.
         self.tick += 1;
diff --git a/common/src/terrain/chonk.rs b/common/src/terrain/chonk.rs
index cd7b63b61d..1b1dfa6496 100644
--- a/common/src/terrain/chonk.rs
+++ b/common/src/terrain/chonk.rs
@@ -1,59 +1,60 @@
-use super::{block::Block, TerrainChunkMeta, TerrainChunkSize};
 use crate::{
     vol::{
-        BaseVol, DefaultPosIterator, DefaultVolIterator, IntoPosIterator, IntoVolIterator, ReadVol,
-        RectRasterableVol, RectVolSize, VolSize, WriteVol,
+        BaseVol, IntoPosIterator, IntoVolIterator, ReadVol, RectRasterableVol, RectVolSize,
+        VolSize, Vox, WriteVol,
     },
-    volumes::chunk::{Chunk, ChunkError},
+    volumes::chunk::{Chunk, ChunkError, ChunkPosIter, ChunkVolIter},
 };
-use hashbrown::HashMap;
 use serde_derive::{Deserialize, Serialize};
-use std::ops::Add;
+use std::marker::PhantomData;
 use vek::*;
 
 #[derive(Debug)]
 pub enum ChonkError {
-    ChunkError(ChunkError),
+    SubChunkError(ChunkError),
     OutOfBounds,
 }
 
-const SUB_CHUNK_HEIGHT: u32 = 16;
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SubChunkSize;
+pub struct SubChunkSize<ChonkSize: RectVolSize> {
+    phantom: PhantomData<ChonkSize>,
+}
 
-impl VolSize for SubChunkSize {
+// TODO (haslersn): Assert ChonkSize::RECT_SIZE.x == ChonkSize::RECT_SIZE.y
+
+impl<ChonkSize: RectVolSize> VolSize for SubChunkSize<ChonkSize> {
     const SIZE: Vec3<u32> = Vec3 {
-        x: TerrainChunkSize::RECT_SIZE.x,
-        y: TerrainChunkSize::RECT_SIZE.y,
-        z: SUB_CHUNK_HEIGHT,
+        x: ChonkSize::RECT_SIZE.x,
+        y: ChonkSize::RECT_SIZE.x,
+        z: ChonkSize::RECT_SIZE.x / 2,
     };
 }
 
-const SUB_CHUNK_HASH_LIMIT: usize =
-    (SubChunkSize::SIZE.x * SubChunkSize::SIZE.y * SubChunkSize::SIZE.z) as usize / 4;
+type SubChunk<V, S, M> = Chunk<V, SubChunkSize<S>, M>;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Chonk {
+pub struct Chonk<V: Vox, S: RectVolSize, M: Clone> {
     z_offset: i32,
-    sub_chunks: Vec<SubChunk>,
-    below: Block,
-    above: Block,
-    meta: TerrainChunkMeta,
+    sub_chunks: Vec<SubChunk<V, S, M>>,
+    below: V,
+    above: V,
+    meta: M,
+    phantom: PhantomData<S>,
 }
 
-impl Chonk {
-    pub fn new(z_offset: i32, below: Block, above: Block, meta: TerrainChunkMeta) -> Self {
+impl<V: Vox, S: RectVolSize, M: Clone> Chonk<V, S, M> {
+    pub fn new(z_offset: i32, below: V, above: V, meta: M) -> Self {
         Self {
             z_offset,
             sub_chunks: Vec::new(),
             below,
             above,
             meta,
+            phantom: PhantomData,
         }
     }
 
-    pub fn meta(&self) -> &TerrainChunkMeta {
+    pub fn meta(&self) -> &M {
         &self.meta
     }
 
@@ -62,68 +63,40 @@ impl Chonk {
     }
 
     pub fn get_max_z(&self) -> i32 {
-        self.z_offset + (self.sub_chunks.len() as u32 * SUB_CHUNK_HEIGHT) as i32
-    }
-
-    pub fn get_metrics(&self) -> ChonkMetrics {
-        ChonkMetrics {
-            chonks: 1,
-            homogeneous: self
-                .sub_chunks
-                .iter()
-                .filter(|s| match s {
-                    SubChunk::Homogeneous(_) => true,
-                    _ => false,
-                })
-                .count(),
-            hash: self
-                .sub_chunks
-                .iter()
-                .filter(|s| match s {
-                    SubChunk::Hash(_, _) => true,
-                    _ => false,
-                })
-                .count(),
-            heterogeneous: self
-                .sub_chunks
-                .iter()
-                .filter(|s| match s {
-                    SubChunk::Heterogeneous(_) => true,
-                    _ => false,
-                })
-                .count(),
-        }
+        self.z_offset + (self.sub_chunks.len() as u32 * SubChunkSize::<S>::SIZE.z) as i32
     }
 
     // Returns the index (in self.sub_chunks) of the SubChunk that contains
     // layer z; note that this index changes when more SubChunks are prepended
-    fn sub_chunk_idx(&self, z: i32) -> usize {
-        ((z - self.z_offset) / SUB_CHUNK_HEIGHT as i32) as usize
+    fn sub_chunk_idx(&self, z: i32) -> i32 {
+        let diff = z - self.z_offset;
+        diff >> (SubChunkSize::<S>::SIZE.z - 1).count_ones()
     }
 
-    // Returns the z_offset of the sub_chunk that contains layer z
-    fn sub_chunk_z_offset(&self, z: i32) -> i32 {
-        let rem = (z - self.z_offset) % SUB_CHUNK_HEIGHT as i32;
-        if rem < 0 {
-            z - (rem + SUB_CHUNK_HEIGHT as i32)
-        } else {
-            z - rem
-        }
+    // Converts a z coordinate into a local z coordinate within a sub chunk
+    fn sub_chunk_z(&self, z: i32) -> i32 {
+        let diff = z - self.z_offset;
+        diff & (SubChunkSize::<S>::SIZE.z - 1) as i32
+    }
+
+    // Returns the z offset of the sub_chunk that contains layer z
+    fn sub_chunk_min_z(&self, z: i32) -> i32 {
+        z - self.sub_chunk_z(z)
     }
 }
 
-impl BaseVol for Chonk {
-    type Vox = Block;
+impl<V: Vox, S: RectVolSize, M: Clone> BaseVol for Chonk<V, S, M> {
+    type Vox = V;
     type Error = ChonkError;
 }
 
-impl RectRasterableVol for Chonk {
-    const RECT_SIZE: Vec2<u32> = TerrainChunkSize::RECT_SIZE;
+impl<V: Vox, S: RectVolSize, M: Clone> RectRasterableVol for Chonk<V, S, M> {
+    const RECT_SIZE: Vec2<u32> = S::RECT_SIZE;
 }
 
-impl ReadVol for Chonk {
+impl<V: Vox, S: RectVolSize, M: Clone> ReadVol for Chonk<V, S, M> {
     #[inline(always)]
-    fn get(&self, pos: Vec3<i32>) -> Result<&Block, ChonkError> {
+    fn get(&self, pos: Vec3<i32>) -> Result<&V, Self::Error> {
         if pos.z < self.get_min_z() {
             // Below the terrain
             Ok(&self.below)
@@ -132,162 +105,181 @@ impl ReadVol for Chonk {
             Ok(&self.above)
         } else {
             // Within the terrain
-
             let sub_chunk_idx = self.sub_chunk_idx(pos.z);
-
-            match &self.sub_chunks[sub_chunk_idx] {
-                // Can't fail
-                SubChunk::Homogeneous(block) => Ok(block),
-                SubChunk::Hash(cblock, map) => {
-                    let rpos = pos
-                        - Vec3::unit_z()
-                            * (self.z_offset + sub_chunk_idx as i32 * SUB_CHUNK_HEIGHT as i32);
-
-                    Ok(map.get(&rpos.map(|e| e as u8)).unwrap_or(cblock))
-                }
-                SubChunk::Heterogeneous(chunk) => {
-                    let rpos = pos
-                        - Vec3::unit_z()
-                            * (self.z_offset + sub_chunk_idx as i32 * SUB_CHUNK_HEIGHT as i32);
-
-                    chunk.get(rpos).map_err(ChonkError::ChunkError)
-                }
-            }
+            let rpos = pos
+                - Vec3::unit_z()
+                    * (self.z_offset + sub_chunk_idx * SubChunkSize::<S>::SIZE.z as i32);
+            self.sub_chunks[sub_chunk_idx as usize]
+                .get(rpos)
+                .map_err(Self::Error::SubChunkError)
         }
     }
 }
 
-impl WriteVol for Chonk {
+impl<V: Vox, S: RectVolSize, M: Clone> WriteVol for Chonk<V, S, M> {
     #[inline(always)]
-    fn set(&mut self, pos: Vec3<i32>, block: Block) -> Result<(), ChonkError> {
+    fn set(&mut self, pos: Vec3<i32>, block: Self::Vox) -> Result<(), Self::Error> {
+        let mut sub_chunk_idx = self.sub_chunk_idx(pos.z);
+
         if pos.z < self.get_min_z() {
             // Prepend exactly sufficiently many SubChunks via Vec::splice
-            let target_z_offset = self.sub_chunk_z_offset(pos.z);
-            let c = SubChunk::Homogeneous(self.below);
-            let n = (self.get_min_z() - target_z_offset) / SUB_CHUNK_HEIGHT as i32;
-            self.sub_chunks
-                .splice(0..0, std::iter::repeat(c).take(n as usize));
-            self.z_offset = target_z_offset;
+            let c = Chunk::<V, SubChunkSize<S>, M>::filled(self.below.clone(), self.meta.clone());
+            let n = (-sub_chunk_idx) as usize;
+            self.sub_chunks.splice(0..0, std::iter::repeat(c).take(n));
+            self.z_offset += sub_chunk_idx * SubChunkSize::<S>::SIZE.z as i32;
+            sub_chunk_idx = 0;
         } else if pos.z >= self.get_max_z() {
             // Append exactly sufficiently many SubChunks via Vec::extend
-            let target_z_offset = self.sub_chunk_z_offset(pos.z);
-            let c = SubChunk::Homogeneous(self.above);
-            let n = (target_z_offset - self.get_max_z()) / SUB_CHUNK_HEIGHT as i32 + 1;
-            self.sub_chunks
-                .extend(std::iter::repeat(c).take(n as usize));
+            let c = Chunk::<V, SubChunkSize<S>, M>::filled(self.above.clone(), self.meta.clone());
+            let n = 1 + sub_chunk_idx as usize - self.sub_chunks.len();
+            self.sub_chunks.extend(std::iter::repeat(c).take(n));
         }
 
-        let sub_chunk_idx = self.sub_chunk_idx(pos.z);
+        let rpos = pos
+            - Vec3::unit_z() * (self.z_offset + sub_chunk_idx * SubChunkSize::<S>::SIZE.z as i32);
+        self.sub_chunks[sub_chunk_idx as usize] // TODO (haslersn): self.sub_chunks.get(...).and_then(...)
+            .set(rpos, block)
+            .map_err(Self::Error::SubChunkError)
+    }
+}
 
-        let rpos =
-            pos - Vec3::unit_z() * (self.z_offset + sub_chunk_idx as i32 * SUB_CHUNK_HEIGHT as i32);
+struct ChonkIterHelper<V: Vox, S: RectVolSize, M: Clone> {
+    sub_chunk_min_z: i32,
+    lower_bound: Vec3<i32>,
+    upper_bound: Vec3<i32>,
+    phantom: PhantomData<Chonk<V, S, M>>,
+}
 
-        match &mut self.sub_chunks[sub_chunk_idx] {
-            // Can't fail
-            SubChunk::Homogeneous(cblock) if block == *cblock => Ok(()),
-            SubChunk::Homogeneous(cblock) => {
-                let mut map = HashMap::default();
-                map.insert(rpos.map(|e| e as u8), block);
+impl<V: Vox, S: RectVolSize, M: Clone> Iterator for ChonkIterHelper<V, S, M> {
+    type Item = (i32, Vec3<i32>, Vec3<i32>);
 
-                self.sub_chunks[sub_chunk_idx] = SubChunk::Hash(*cblock, map);
-                Ok(())
-            }
-            SubChunk::Hash(cblock, map) if block == *cblock => {
-                map.remove(&rpos.map(|e| e as u8));
-                Ok(())
-            }
-            SubChunk::Hash(_cblock, map) if map.len() < SUB_CHUNK_HASH_LIMIT => {
-                map.insert(rpos.map(|e| e as u8), block);
-                Ok(())
-            }
-            SubChunk::Hash(cblock, map) => {
-                let mut new_chunk = Chunk::filled(*cblock, ());
-                for (map_pos, map_block) in map {
-                    new_chunk
-                        .set(map_pos.map(|e| i32::from(e)), *map_block)
-                        .unwrap(); // Can't fail (I hope!)
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.lower_bound.z >= self.upper_bound.z {
+            return None;
+        }
+        let mut lb = self.lower_bound;
+        let mut ub = self.upper_bound;
+        let current_min_z = self.sub_chunk_min_z;
+        lb.z -= current_min_z;
+        ub.z -= current_min_z;
+        ub.z = std::cmp::min(ub.z, SubChunkSize::<S>::SIZE.z as i32);
+        self.sub_chunk_min_z += SubChunkSize::<S>::SIZE.z as i32;
+        self.lower_bound.z = self.sub_chunk_min_z;
+        Some((current_min_z, lb, ub))
+    }
+}
+
+pub struct ChonkPosIter<V: Vox, S: RectVolSize, M: Clone> {
+    outer: ChonkIterHelper<V, S, M>,
+    opt_inner: Option<(i32, ChunkPosIter<V, SubChunkSize<S>, M>)>,
+}
+
+impl<V: Vox, S: RectVolSize, M: Clone> Iterator for ChonkPosIter<V, S, M> {
+    type Item = Vec3<i32>;
+
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some((sub_chunk_min_z, ref mut inner)) = self.opt_inner {
+                if let Some(mut pos) = inner.next() {
+                    pos.z += sub_chunk_min_z;
+                    return Some(pos);
                 }
-
-                new_chunk.set(rpos, block).unwrap(); // Can't fail (I hope)
-
-                self.sub_chunks[sub_chunk_idx] = SubChunk::Heterogeneous(new_chunk);
-                Ok(())
             }
-
-            /*
-            SubChunk::Homogeneous(cblock) => {
-                let mut new_chunk = Chunk::filled(*cblock, ());
-
-                new_chunk.set(rpos, block).unwrap(); // Can't fail (I hope!)
-
-                self.sub_chunks[sub_chunk_idx] = SubChunk::Heterogeneous(new_chunk);
-                Ok(())
+            match self.outer.next() {
+                None => return None,
+                Some((sub_chunk_min_z, lb, ub)) => {
+                    self.opt_inner = Some((sub_chunk_min_z, SubChunk::<V, S, M>::pos_iter(lb, ub)))
+                }
             }
-            */
-            SubChunk::Heterogeneous(chunk) => {
-                chunk.set(rpos, block).map_err(ChonkError::ChunkError)
-            } //_ => unimplemented!(),
         }
     }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum SubChunk {
-    Homogeneous(Block),
-    Hash(Block, HashMap<Vec3<u8>, Block>),
-    Heterogeneous(Chunk<Block, SubChunkSize, ()>),
+enum InnerChonkVolIter<'a, V: Vox, S: RectVolSize, M: Clone> {
+    Vol(ChunkVolIter<'a, V, SubChunkSize<S>, M>),
+    Pos(ChunkPosIter<V, SubChunkSize<S>, M>),
 }
 
-impl SubChunk {
-    pub fn filled(block: Block) -> Self {
-        SubChunk::Homogeneous(block)
-    }
+pub struct ChonkVolIter<'a, V: Vox, S: RectVolSize, M: Clone> {
+    chonk: &'a Chonk<V, S, M>,
+    outer: ChonkIterHelper<V, S, M>,
+    opt_inner: Option<(i32, InnerChonkVolIter<'a, V, S, M>)>,
 }
 
-#[derive(Debug)]
-pub struct ChonkMetrics {
-    chonks: usize,
-    homogeneous: usize,
-    hash: usize,
-    heterogeneous: usize,
-}
+impl<'a, V: Vox, S: RectVolSize, M: Clone> Iterator for ChonkVolIter<'a, V, S, M> {
+    type Item = (Vec3<i32>, &'a V);
 
-impl Default for ChonkMetrics {
-    fn default() -> Self {
-        ChonkMetrics {
-            chonks: 0,
-            homogeneous: 0,
-            hash: 0,
-            heterogeneous: 0,
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some((sub_chunk_min_z, ref mut inner)) = self.opt_inner {
+                let got = match inner {
+                    InnerChonkVolIter::<'a, V, S, M>::Vol(iter) => iter.next(),
+                    InnerChonkVolIter::<'a, V, S, M>::Pos(iter) => iter.next().map(|pos| {
+                        if sub_chunk_min_z < self.chonk.get_min_z() {
+                            (pos, &self.chonk.below)
+                        } else {
+                            (pos, &self.chonk.above)
+                        }
+                    }),
+                };
+                if let Some((mut pos, vox)) = got {
+                    pos.z += sub_chunk_min_z;
+                    return Some((pos, vox));
+                }
+            }
+            match self.outer.next() {
+                None => return None,
+                Some((sub_chunk_min_z, lb, ub)) => {
+                    let inner = if sub_chunk_min_z < self.chonk.get_min_z()
+                        || sub_chunk_min_z >= self.chonk.get_max_z()
+                    {
+                        InnerChonkVolIter::<'a, V, S, M>::Pos(SubChunk::<V, S, M>::pos_iter(lb, ub))
+                    } else {
+                        InnerChonkVolIter::<'a, V, S, M>::Vol(
+                            self.chonk.sub_chunks
+                                [self.chonk.sub_chunk_idx(sub_chunk_min_z) as usize]
+                                .vol_iter(lb, ub),
+                        )
+                    };
+                    self.opt_inner = Some((sub_chunk_min_z, inner));
+                }
+            }
         }
     }
 }
 
-impl Add for ChonkMetrics {
-    type Output = Self;
-
-    fn add(self, other: Self::Output) -> Self {
-        Self::Output {
-            chonks: self.chonks + other.chonks,
-            homogeneous: self.homogeneous + other.homogeneous,
-            hash: self.hash + other.hash,
-            heterogeneous: self.heterogeneous + other.heterogeneous,
-        }
-    }
-}
-
-impl<'a> IntoPosIterator for &'a Chonk {
-    type IntoIter = DefaultPosIterator;
+impl<'a, V: Vox, S: RectVolSize, M: Clone> IntoPosIterator for &'a Chonk<V, S, M> {
+    type IntoIter = ChonkPosIter<V, S, M>;
 
     fn pos_iter(self, lower_bound: Vec3<i32>, upper_bound: Vec3<i32>) -> Self::IntoIter {
-        DefaultPosIterator::new(lower_bound, upper_bound)
+        Self::IntoIter {
+            outer: ChonkIterHelper::<V, S, M> {
+                sub_chunk_min_z: self.sub_chunk_min_z(lower_bound.z),
+                lower_bound,
+                upper_bound,
+                phantom: PhantomData,
+            },
+            opt_inner: None,
+        }
     }
 }
 
-impl<'a> IntoVolIterator<'a> for &'a Chonk {
-    type IntoIter = DefaultVolIterator<'a, Chonk>;
+impl<'a, V: Vox, S: RectVolSize, M: Clone> IntoVolIterator<'a> for &'a Chonk<V, S, M> {
+    type IntoIter = ChonkVolIter<'a, V, S, M>;
 
     fn vol_iter(self, lower_bound: Vec3<i32>, upper_bound: Vec3<i32>) -> Self::IntoIter {
-        DefaultVolIterator::new(self, lower_bound, upper_bound)
+        Self::IntoIter {
+            chonk: self,
+            outer: ChonkIterHelper::<V, S, M> {
+                sub_chunk_min_z: self.sub_chunk_min_z(lower_bound.z),
+                lower_bound,
+                upper_bound,
+                phantom: PhantomData,
+            },
+            opt_inner: None,
+        }
     }
 }
diff --git a/common/src/terrain/mod.rs b/common/src/terrain/mod.rs
index 43bb15ebf3..d8e6e0e62e 100644
--- a/common/src/terrain/mod.rs
+++ b/common/src/terrain/mod.rs
@@ -57,5 +57,5 @@ impl TerrainChunkMeta {
 
 // Terrain type aliases
 
-pub type TerrainChunk = chonk::Chonk;
+pub type TerrainChunk = chonk::Chonk<Block, TerrainChunkSize, TerrainChunkMeta>;
 pub type TerrainGrid = VolGrid2d<TerrainChunk>;
diff --git a/common/src/volumes/chunk.rs b/common/src/volumes/chunk.rs
index dbe0f069b9..0c8a1c4094 100644
--- a/common/src/volumes/chunk.rs
+++ b/common/src/volumes/chunk.rs
@@ -1,5 +1,8 @@
-use crate::vol::{BaseVol, ReadVol, SizedVol, VolSize, Vox, WriteVol};
+use crate::vol::{
+    BaseVol, IntoPosIterator, IntoVolIterator, RasterableVol, ReadVol, VolSize, Vox, WriteVol,
+};
 use serde_derive::{Deserialize, Serialize};
+use std::iter::Iterator;
 use std::marker::PhantomData;
 use vek::*;
 
@@ -8,81 +11,105 @@ pub enum ChunkError {
     OutOfBounds,
 }
 
-/// A volume with dimensions known at compile-time.
-// V = Voxel
-// S = Size (replace when const generics are a thing)
-// M = Metadata
+/// The volume is spatially subdivided into groups of voxels. For example, a
+/// `Chunk` of total size `32*32*16` uses groups of `4*4*4` voxels, so there are
+/// `8*8*4 = 256` groups. (These numbers are generic in the actual code: the
+/// group size is derived from the total size of the `Chunk` such that there
+/// are always `256` groups, unless the `Chunk` has fewer than `256` voxels.)
+///
+/// There's a single vector `self.vox` which consecutively stores these groups.
+/// Each group might or might not be contained in `self.vox`. A group that is
+/// not contained represents that the full group consists only of `self.default`
+/// voxels. This saves a lot of memory because oftentimes a `Chunk` consists of
+/// either a lot of air or a lot of stone.
+///
+/// To track whether a group is contained in `self.vox`, there's an index buffer
+/// `self.indices : [u8; 256]`. It contains for each group
+///
+/// * (a) its insertion order, i.e. the index of the group's slot within
+///     `self.vox`, if the group is contained in `self.vox` or
+/// * (b) 255, otherwise. That case represents that the whole group consists
+///     only of `self.default` voxels.
+///
+/// (Note that 255 can also occur in case (a), but only once all 256 groups are
+/// stored in `self.vox`. Then no group falls under case (b) any more, so there
+/// is no ambiguity.)
+///
+/// ## Rationale:
+///
+/// The index buffer should be small because:
+///
+/// * Small size increases the probability that it will always be in cache.
+/// * The index buffer is allocated for every `Chunk` and an almost empty `Chunk`
+///     shall not consume too much memory.
+///
+/// Having exactly 256 groups is particularly convenient because the index
+/// buffer can then consist of `u8`s. This keeps the index buffer at 256 bytes,
+/// i.e. 4 cache lines (assuming 64-byte cache lines).
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Chunk<V: Vox, S: VolSize, M> {
+    indices: Vec<u8>, // TODO (haslersn): Box<[u8; S::SIZE.x * S::SIZE.y * S::SIZE.z]>, this is however not possible in Rust yet
     vox: Vec<V>,
+    default: V,
     meta: M,
     phantom: PhantomData<S>,
 }
 
 impl<V: Vox, S: VolSize, M> Chunk<V, S, M> {
-    /// Used to transform a voxel position in the volume into its corresponding index
-    /// in the voxel array.
-    #[inline(always)]
-    fn idx_for(pos: Vec3<i32>) -> Option<usize> {
-        if pos.map(|e| e >= 0).reduce_and()
-            && pos.map2(S::SIZE, |e, lim| e < lim as i32).reduce_and()
-        {
-            Some(Self::idx_for_unchecked(pos))
-        } else {
-            None
-        }
-    }
+    const VOLUME: u32 = (S::SIZE.x * S::SIZE.y * S::SIZE.z) as u32;
+    const GROUP_VOLUME: u32 = [Self::VOLUME / 256, 1][(Self::VOLUME < 256) as usize];
+    /// `GROUP_COUNT_TOTAL` is always `256`, except if `VOLUME < 256`
+    const GROUP_COUNT_TOTAL: u32 = Self::VOLUME / Self::GROUP_VOLUME;
+    const GROUP_LONG_SIDE_LEN: u32 = 1 << ((Self::GROUP_VOLUME * 4 - 1).count_ones() / 3);
+    const GROUP_SIZE: Vec3<u32> = Vec3::new(
+        Self::GROUP_LONG_SIDE_LEN,
+        Self::GROUP_LONG_SIDE_LEN,
+        Self::GROUP_VOLUME / (Self::GROUP_LONG_SIDE_LEN * Self::GROUP_LONG_SIDE_LEN),
+    );
+    const GROUP_COUNT: Vec3<u32> = Vec3::new(
+        S::SIZE.x / Self::GROUP_SIZE.x,
+        S::SIZE.y / Self::GROUP_SIZE.y,
+        S::SIZE.z / Self::GROUP_SIZE.z,
+    );
 
-    /// Used to transform a voxel position in the volume into its corresponding index
-    /// in the voxel array.
-    #[inline(always)]
-    fn idx_for_unchecked(pos: Vec3<i32>) -> usize {
-        (pos.x * S::SIZE.y as i32 * S::SIZE.z as i32 + pos.y * S::SIZE.z as i32 + pos.z) as usize
-    }
-}
+    /// Creates a new `Chunk` with the provided dimensions and all voxels filled
+    /// with duplicates of the provided voxel.
+    pub fn filled(default: V, meta: M) -> Self {
+        // TODO (haslersn): Alter into compile time assertions
+        //
+        // An extent is valid if it fulfils the following conditions.
+        //
+        // 1. In each direction, the extent is a power of two.
+        // 2. In each direction, the group size is in [1, 256].
+        // 3. In each direction, the group count is in [1, 256].
+        //
+        // Rationales:
+        //
+        // 1. We have code in the implementation that assumes it. In particular,
+        //    code using `.count_ones()`.
+        // 2. The maximum group size is `256x256x256`, because there's code that
+        //    stores group relative indices as `u8`.
+        // 3. There's code that stores group indices as `u8`.
+        debug_assert!(S::SIZE.x.is_power_of_two());
+        debug_assert!(S::SIZE.y.is_power_of_two());
+        debug_assert!(S::SIZE.z.is_power_of_two());
+        debug_assert!(0 < Self::GROUP_SIZE.x);
+        debug_assert!(0 < Self::GROUP_SIZE.y);
+        debug_assert!(0 < Self::GROUP_SIZE.z);
+        debug_assert!(Self::GROUP_SIZE.x <= 256);
+        debug_assert!(Self::GROUP_SIZE.y <= 256);
+        debug_assert!(Self::GROUP_SIZE.z <= 256);
+        debug_assert!(0 < Self::GROUP_COUNT.x);
+        debug_assert!(0 < Self::GROUP_COUNT.y);
+        debug_assert!(0 < Self::GROUP_COUNT.z);
+        debug_assert!(Self::GROUP_COUNT.x <= 256);
+        debug_assert!(Self::GROUP_COUNT.y <= 256);
+        debug_assert!(Self::GROUP_COUNT.z <= 256);
 
-impl<V: Vox, S: VolSize, M> BaseVol for Chunk<V, S, M> {
-    type Vox = V;
-    type Error = ChunkError;
-}
-
-impl<V: Vox, S: VolSize, M> SizedVol for Chunk<V, S, M> {
-    #[inline(always)]
-    fn lower_bound(&self) -> Vec3<i32> {
-        Vec3::zero()
-    }
-
-    #[inline(always)]
-    fn upper_bound(&self) -> Vec3<i32> {
-        S::SIZE.map(|e| e as i32)
-    }
-}
-
-impl<V: Vox, S: VolSize, M> ReadVol for Chunk<V, S, M> {
-    #[inline(always)]
-    fn get(&self, pos: Vec3<i32>) -> Result<&V, ChunkError> {
-        Self::idx_for(pos)
-            .and_then(|idx| self.vox.get(idx))
-            .ok_or(ChunkError::OutOfBounds)
-    }
-}
-
-impl<V: Vox, S: VolSize, M> WriteVol for Chunk<V, S, M> {
-    #[inline(always)]
-    fn set(&mut self, pos: Vec3<i32>, vox: Self::Vox) -> Result<(), ChunkError> {
-        Self::idx_for(pos)
-            .and_then(|idx| self.vox.get_mut(idx))
-            .map(|old_vox| *old_vox = vox)
-            .ok_or(ChunkError::OutOfBounds)
-    }
-}
-
-impl<V: Vox + Clone, S: VolSize, M> Chunk<V, S, M> {
-    /// Create a new `Chunk` with the provided dimensions and all voxels filled with duplicates of
-    /// the provided voxel.
-    pub fn filled(vox: V, meta: M) -> Self {
         Self {
-            vox: vec![vox; S::SIZE.product() as usize],
+            indices: vec![255; Self::GROUP_COUNT_TOTAL as usize],
+            vox: Vec::new(),
+            default,
             meta,
             phantom: PhantomData,
         }
@@ -97,4 +124,225 @@ impl<V: Vox + Clone, S: VolSize, M> Chunk<V, S, M> {
     pub fn metadata_mut(&mut self) -> &mut M {
         &mut self.meta
     }
+
+    /// Flat index of the group containing `pos` (x varies fastest, then y, then z).
+    #[inline(always)]
+    fn grp_idx(pos: Vec3<i32>) -> u32 {
+        let g = pos.map2(Self::GROUP_SIZE, |coord, size| coord as u32 / size);
+        let (row, layer) = (Self::GROUP_COUNT.x, Self::GROUP_COUNT.y * Self::GROUP_COUNT.x);
+        (g.z * layer) + (g.y * row) + (g.x)
+    }
+
+    /// Flat offset of `pos` inside its group (x varies fastest, then y, then z).
+    #[inline(always)]
+    fn rel_idx(pos: Vec3<i32>) -> u32 {
+        let r = pos.map2(Self::GROUP_SIZE, |coord, size| coord as u32 % size);
+        let (row, layer) = (Self::GROUP_SIZE.x, Self::GROUP_SIZE.y * Self::GROUP_SIZE.x);
+        (r.z * layer) + (r.y * row) + (r.x)
+    }
+
+    /// Maps `pos` to its index in `self.vox`; `None` if its group has no storage yet.
+    #[inline(always)]
+    fn idx_unchecked(&self, pos: Vec3<i32>) -> Option<usize> {
+        let group = Self::grp_idx(pos) as usize;
+        let offset = Self::rel_idx(pos);
+        let slot = u32::from(self.indices[group]);
+        if slot < self.vox.len() as u32 / Self::GROUP_VOLUME {
+            Some((slot * Self::GROUP_VOLUME + offset) as usize)
+        } else {
+            None
+        }
+    }
+
+    #[inline(always)] // Index into `vox` for `pos`; gives the group storage first if it has none.
+    fn force_idx_unchecked(&mut self, pos: Vec3<i32>) -> usize {
+        let grp_idx = Self::grp_idx(pos);
+        let rel_idx = Self::rel_idx(pos);
+        let base = &mut self.indices[grp_idx as usize]; // slot of this group within `vox`
+        let num_groups = self.vox.len() as u32 / Self::GROUP_VOLUME; // groups stored so far
+        if *base as u32 >= num_groups { // slot past the end: nothing stored for this group
+            *base = num_groups as u8; // appended last; NOTE(review): assumes count fits in u8
+            self.vox
+                .extend(std::iter::repeat(self.default.clone()).take(Self::GROUP_VOLUME as usize));
+        }
+        (*base as u32 * Self::GROUP_VOLUME + rel_idx) as usize
+    }
+
+    /// Voxel stored at `pos`, or the default voxel if its group has no storage.
+    /// `pos` is not bounds-checked here.
+    #[inline(always)]
+    fn get_unchecked(&self, pos: Vec3<i32>) -> &V {
+        self.idx_unchecked(pos)
+            .map_or(&self.default, |idx| &self.vox[idx])
+    }
+
+    #[inline(always)] // Stores `vox` at `pos`; `pos` is not bounds-checked here.
+    fn set_unchecked(&mut self, pos: Vec3<i32>, vox: V) {
+        if vox != self.default {
+            let idx = self.force_idx_unchecked(pos); // allocates the group if needed
+            self.vox[idx] = vox;
+        } else if let Some(idx) = self.idx_unchecked(pos) {
+            self.vox[idx] = vox; // group already has storage: overwrite in place
+        } // default voxel into a group without storage: nothing to do
+    }
+}
+
+impl<V: Vox, S: VolSize, M> BaseVol for Chunk<V, S, M> {
+    type Vox = V; // voxel type held by the chunk
+    type Error = ChunkError; // error type returned by `get` / `set`
+}
+
+impl<V: Vox, S: VolSize, M> RasterableVol for Chunk<V, S, M> {
+    const SIZE: Vec3<u32> = S::SIZE; // extent is taken from the `VolSize` parameter `S`
+}
+
+impl<V: Vox, S: VolSize, M> ReadVol for Chunk<V, S, M> {
+    /// Returns the voxel at `pos`, or `OutOfBounds` if `pos` leaves `0..S::SIZE` on any axis.
+    #[inline(always)]
+    fn get(&self, pos: Vec3<i32>) -> Result<&Self::Vox, Self::Error> {
+        let in_bounds = pos
+            .map2(S::SIZE, |e, s| 0 <= e && e < s as i32)
+            .reduce_and();
+        if !in_bounds {
+            return Err(Self::Error::OutOfBounds);
+        }
+        Ok(self.get_unchecked(pos))
+    }
+}
+
+impl<V: Vox, S: VolSize, M> WriteVol for Chunk<V, S, M> {
+    /// Stores `vox` at `pos`, or fails with `OutOfBounds` if `pos` leaves `0..S::SIZE`.
+    #[inline(always)]
+    fn set(&mut self, pos: Vec3<i32>, vox: Self::Vox) -> Result<(), Self::Error> {
+        let in_bounds = pos
+            .map2(S::SIZE, |e, s| 0 <= e && e < s as i32)
+            .reduce_and();
+        if !in_bounds {
+            return Err(Self::Error::OutOfBounds);
+        }
+        Ok(self.set_unchecked(pos, vox))
+    }
+}
+
+pub struct ChunkPosIter<V: Vox, S: VolSize, M> {
+    // Half-open iteration range `[lb, ub)` plus the next position to yield.
+    lb: Vec3<i32>,
+    ub: Vec3<i32>,
+    pos: Vec3<i32>,
+    phantom: PhantomData<Chunk<V, S, M>>, // names the chunk type whose `GROUP_SIZE` is used
+}
+
+impl<V: Vox, S: VolSize, M> ChunkPosIter<V, S, M> {
+    /// Creates an iterator over the half-open box `[lower_bound, upper_bound)`.
+    ///
+    /// The box is treated as empty unless `lower_bound < upper_bound` on every axis.
+    fn new(lower_bound: Vec3<i32>, upper_bound: Vec3<i32>) -> Self {
+        let non_empty = lower_bound.map2(upper_bound, |lo, hi| lo < hi).reduce_and();
+        Self {
+            lb: lower_bound,
+            // An empty range collapses to `ub == lb`, so that `next` returns
+            // `None` on its very first call.
+            ub: if non_empty { upper_bound } else { lower_bound },
+            pos: lower_bound,
+            phantom: PhantomData,
+        }
+    }
+}
+
+impl<V: Vox, S: VolSize, M> Iterator for ChunkPosIter<V, S, M> {
+    type Item = Vec3<i32>;
+    /// Walks the range group by group: x, then y, then z inside a group, then the next group.
+    #[inline(always)] // NOTE(review): the masks below assume power-of-two `GROUP_SIZE` - confirm.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.pos.z >= self.ub.z {
+            return None;
+        }
+        let res = Some(self.pos);
+        // Step along x while staying inside the current group and below `ub.x`.
+        self.pos.x += 1;
+        if self.pos.x != self.ub.x && self.pos.x % Chunk::<V, S, M>::GROUP_SIZE.x as i32 != 0 {
+            return res;
+        }
+        self.pos.x = std::cmp::max(
+            self.lb.x,
+            (self.pos.x - 1) & !(Chunk::<V, S, M>::GROUP_SIZE.x as i32 - 1),
+        );
+        // x was rewound to the start of the group (or `lb.x`); now step along y.
+        self.pos.y += 1;
+        if self.pos.y != self.ub.y && self.pos.y % Chunk::<V, S, M>::GROUP_SIZE.y as i32 != 0 {
+            return res;
+        }
+        self.pos.y = std::cmp::max(
+            self.lb.y,
+            (self.pos.y - 1) & !(Chunk::<V, S, M>::GROUP_SIZE.y as i32 - 1),
+        );
+        // y was rewound to the start of the group (or `lb.y`); now step along z.
+        self.pos.z += 1;
+        if self.pos.z != self.ub.z && self.pos.z % Chunk::<V, S, M>::GROUP_SIZE.z as i32 != 0 {
+            return res;
+        }
+        self.pos.z = std::cmp::max(
+            self.lb.z,
+            (self.pos.z - 1) & !(Chunk::<V, S, M>::GROUP_SIZE.z as i32 - 1),
+        );
+        // The current group is done: jump to the first x of the next group along x.
+        self.pos.x = (self.pos.x | (Chunk::<V, S, M>::GROUP_SIZE.x as i32 - 1)) + 1;
+        if self.pos.x < self.ub.x {
+            return res;
+        }
+        self.pos.x = self.lb.x;
+        // No further group along x: restart x and jump to the next group along y.
+        self.pos.y = (self.pos.y | (Chunk::<V, S, M>::GROUP_SIZE.y as i32 - 1)) + 1;
+        if self.pos.y < self.ub.y {
+            return res;
+        }
+        self.pos.y = self.lb.y;
+        // No further group along y: restart y and jump to the next group along z.
+        self.pos.z = (self.pos.z | (Chunk::<V, S, M>::GROUP_SIZE.z as i32 - 1)) + 1;
+        // If z is now at or past `ub.z`, the check at the top ends iteration on the next call.
+        res
+    }
+}
+
+pub struct ChunkVolIter<'a, V: Vox, S: VolSize, M> {
+    chunk: &'a Chunk<V, S, M>, // chunk the voxels are read from
+    iter_impl: ChunkPosIter<V, S, M>, // supplies the positions to visit
+}
+
+impl<'a, V: Vox, S: VolSize, M> Iterator for ChunkVolIter<'a, V, S, M> {
+    type Item = (Vec3<i32>, &'a V);
+
+    /// Yields the next position together with the voxel the chunk holds there.
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        let pos = self.iter_impl.next()?;
+        Some((pos, self.chunk.get_unchecked(pos)))
+    }
+}
+
+impl<V: Vox, S: VolSize, M> Chunk<V, S, M> {
+    /// Builds a position iterator over `[lower_bound, upper_bound)`.
+    /// This is an associated function, so no `Chunk` instance is needed.
+    pub fn pos_iter(lower_bound: Vec3<i32>, upper_bound: Vec3<i32>) -> ChunkPosIter<V, S, M> {
+        ChunkPosIter::new(lower_bound, upper_bound)
+    }
+}
+
+impl<'a, V: Vox, S: VolSize, M> IntoPosIterator for &'a Chunk<V, S, M> {
+    type IntoIter = ChunkPosIter<V, S, M>;
+    /// Position iterator over `[lower_bound, upper_bound)`; `self` itself is not read.
+    fn pos_iter(self, lower_bound: Vec3<i32>, upper_bound: Vec3<i32>) -> Self::IntoIter {
+        ChunkPosIter::new(lower_bound, upper_bound)
+    }
+}
+
+impl<'a, V: Vox, S: VolSize, M> IntoVolIterator<'a> for &'a Chunk<V, S, M> {
+    type IntoIter = ChunkVolIter<'a, V, S, M>;
+
+    /// Iterates `(position, voxel)` pairs over `[lower_bound, upper_bound)`.
+    fn vol_iter(self, lower_bound: Vec3<i32>, upper_bound: Vec3<i32>) -> Self::IntoIter {
+        // Positions come from `ChunkPosIter`; each voxel is looked up when it is yielded.
+        let iter_impl = ChunkPosIter::new(lower_bound, upper_bound);
+        ChunkVolIter { chunk: self, iter_impl }
+    }
 }