From 6adfa6680fe29f1818444e186216091a41d5c23d Mon Sep 17 00:00:00 2001
From: Joshua Yanovski <pythonesque@gmail.com>
Date: Tue, 16 Aug 2022 21:32:03 -0700
Subject: [PATCH] All unmaps on the main thread, sprites consolidated into a
 buffer.

---
 voxygen/src/menu/main/scene.rs         |   6 +-
 voxygen/src/mesh/greedy.rs             |  42 +++++++---
 voxygen/src/mesh/terrain.rs            |  13 ++-
 voxygen/src/render/buffer.rs           |  84 ++++++++++---------
 voxygen/src/render/consts.rs           |  14 ++--
 voxygen/src/render/instances.rs        |  32 ++++----
 voxygen/src/render/mod.rs              |   3 +-
 voxygen/src/render/model.rs            |  31 ++++++-
 voxygen/src/render/pipelines/sprite.rs |   3 +
 voxygen/src/render/renderer.rs         | 109 +++++++++++++++----------
 voxygen/src/render/renderer/binding.rs |  23 ++----
 voxygen/src/render/renderer/drawer.rs  |   4 +-
 voxygen/src/render/texture.rs          |   2 +-
 voxygen/src/scene/figure/cache.rs      |   7 +-
 voxygen/src/scene/mod.rs               |   6 +-
 voxygen/src/scene/simple.rs            |  14 +++-
 voxygen/src/scene/terrain.rs           | 103 +++++++++++++++--------
 17 files changed, 308 insertions(+), 188 deletions(-)
diff --git a/voxygen/src/menu/main/scene.rs b/voxygen/src/menu/main/scene.rs
index 68d25b8f47..6612242a0e 100644
--- a/voxygen/src/menu/main/scene.rs
+++ b/voxygen/src/menu/main/scene.rs
@@ -10,9 +10,9 @@ pub struct Scene {
 impl Scene {
     pub fn new(renderer: &mut Renderer) -> Self {
         let global_data = GlobalModel {
-            globals: renderer.create_consts(&[Globals::default()]),
-            lights: renderer.create_consts(&[Light::default(); 32]),
-            shadows: renderer.create_consts(&[Shadow::default(); 32]),
+            globals: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Globals::default()]),
+            lights: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Light::default(); 32]),
+            shadows: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Shadow::default(); 32]),
             shadow_mats: renderer.create_shadow_bound_locals(&[ShadowLocals::default()]),
             rain_occlusion_mats: renderer
                 .create_rain_occlusion_bound_locals(&[RainOcclusionLocals::default()]),
diff --git a/voxygen/src/mesh/greedy.rs b/voxygen/src/mesh/greedy.rs
index cc90ce34f0..b5fc32f87c 100644
--- a/voxygen/src/mesh/greedy.rs
+++ b/voxygen/src/mesh/greedy.rs
@@ -81,7 +81,7 @@ pub struct GreedyConfig<D, FV, FA, FL, FG, FO, FS, FP, FT> {
 /// coloring part as a continuation.  When called with a final tile size and
 /// vector, the continuation will consume the color data and write it to the
 /// vector.
-pub type SuspendedMesh<'a> = dyn for<'r> FnOnce(&'r mut ColLightInfo) + 'a;
+pub type SuspendedMesh<'a> = dyn for<'r> FnOnce(/*&'r mut ColLightInfo*/(&'r mut [[u8; 4]], Vec2<u16>)) + 'a;
 
 /// Abstraction over different atlas allocators. Useful to swap out the
 /// allocator implementation for specific cases (e.g. sprites).
@@ -418,24 +418,41 @@ impl<'a, Allocator: AtlasAllocator> GreedyMesh<'a, Allocator> {
     /// are known, we can perform just a single allocation to construct a
     /// precisely fitting atlas.  This will also let us (in the future)
     /// suspend meshing partway through in order to meet frame budget, and
-    /// potentially use a single staged upload to the GPU.
+    /// allows us to use a single staged upload to the GPU.
     ///
-    /// Returns the ColLightsInfo corresponding to the constructed atlas.
-    pub fn finalize(self, alignment: Vec2<u16>) -> ColLightInfo {
-        span!(_guard, "finalize", "GreedyMesh::finalize");
+    /// The caller supplies the buffer to which we draw (which may be either a mapped staging
+    /// buffer for upload to the GPU, or any other mutable slice of texels of the right size).
+    ///
+    /// Returns a tuple containing the size of the required buffer, and a function that, when
+    /// applied to a buffer allocated with that size, will produce the correct bounds for the
+    /// texture (which can then be bundled up into a ColLightsInfo, if need be).  The reason
+    /// for this awkward API is to allow consumers to create a mapped buffer with the correct
+    /// size, then write to it directly, rather than introducing a second staging copy.
+    pub fn finalize(
+        self,
+        alignment: Vec2<u16>,
+    ) -> (usize, impl for<'b> FnOnce(&'b mut [[u8; 4]]) -> Vec2<u16> + 'a)
+    {
         let mut cur_size = self.col_lights_size;
         // Round to nearest alignment (assuming power of 2)
         cur_size.x = (cur_size.x + alignment.x - 1) / alignment.x * alignment.x;
         cur_size.y = (cur_size.y + alignment.y - 1) / alignment.y * alignment.y;
+        /* let col_lights = make_buffer(cur_size.x as usize * cur_size.y as usize);
         let col_lights = vec![
             TerrainVertex::make_col_light(254, 0, Rgb::broadcast(254), true);
             cur_size.x as usize * cur_size.y as usize
-        ];
-        let mut col_lights_info = (col_lights, cur_size);
-        self.suspended.into_iter().for_each(|cont| {
-            cont(&mut col_lights_info);
-        });
-        col_lights_info
+        ]; */
+        let alloc_size = cur_size.x as usize * cur_size.y as usize;
+        (alloc_size, move |col_lights| {
+            span!(_guard, "finalize", "GreedyMesh::finalize");
+            assert!(col_lights.len() == alloc_size);
+            self.suspended.into_iter().for_each(move |cont| {
+                let col_lights_info = (&mut *col_lights, cur_size);
+                cont(/*&mut */col_lights_info);
+            });
+            /* col_lights_info */
+            cur_size
+        })
     }
 
     pub fn max_size(&self) -> Vec2<u16> { self.max_size }
@@ -783,7 +800,7 @@ fn add_to_atlas<Allocator: AtlasAllocator>(
 //
 // TODO: See if we can speed this up using SIMD.
 fn draw_col_lights<D>(
-    (col_lights, cur_size): &mut ColLightInfo,
+    (col_lights, cur_size): /*&mut ColLightInfo*/(&mut [[u8; 4]], Vec2<u16>),
     data: &mut D,
     todo_rects: Vec<TodoRect>,
     draw_delta: Vec3<i32>,
@@ -793,6 +810,7 @@ fn draw_col_lights<D>(
     mut get_opacity: impl FnMut(&mut D, Vec3<i32>) -> bool,
     mut make_face_texel: impl FnMut(&mut D, Vec3<i32>, u8, u8, bool) -> [u8; 4],
 ) {
+    let col_lights = &mut col_lights[0..cur_size.y as usize * cur_size.x as usize];
     todo_rects.into_iter().for_each(|(pos, uv, rect, delta)| {
         // NOTE: Conversions are safe because width, height, and offset must be
         // non-negative, and because every allocated coordinate in the atlas must be in
diff --git a/voxygen/src/mesh/terrain.rs b/voxygen/src/mesh/terrain.rs
index 918ba14ba0..eb84c54aa1 100644
--- a/voxygen/src/mesh/terrain.rs
+++ b/voxygen/src/mesh/terrain.rs
@@ -5,7 +5,7 @@ use crate::{
         greedy::{self, GreedyConfig, GreedyMesh},
         MeshGen,
     },
-    render::{ColLightInfo, FluidVertex, Mesh, TerrainVertex},
+    render::{ColLightInfo, FluidVertex, Mesh, Model, TerrainVertex},
     scene::terrain::BlocksOfInterest,
 };
 use common::{
@@ -332,6 +332,7 @@ type V = TerrainChunk;
 #[inline(always)]
 pub fn generate_mesh<'a/*, V: RectRasterableVol<Vox = Block> + ReadVol + Debug + 'static*/>(
     vol: &'a VolGrid2d<V>,
+    create_texture: impl Fn(usize) -> Option<Model<[u8; 4]>>,
     (range, max_texture_size, boi): (Aabb<i32>, Vec2<u16>, &'a BlocksOfInterest),
 ) -> MeshGen<
     TerrainVertex,
@@ -339,7 +340,7 @@ pub fn generate_mesh<'a/*, V: RectRasterableVol<Vox = Block> + ReadVol + Debug +
     TerrainVertex,
     (
         Aabb<f32>,
-        ColLightInfo,
+        /*ColLightInfo*/(Option<Model<[u8; 4]>>, Vec2<u16>),
         Arc<dyn Fn(Vec3<i32>) -> f32 + Send + Sync>,
         Arc<dyn Fn(Vec3<i32>) -> f32 + Send + Sync>,
     ),
@@ -997,10 +998,14 @@ pub fn generate_mesh<'a/*, V: RectRasterableVol<Vox = Block> + ReadVol + Debug +
         max: max_bounds + min_bounds,
     };
     // WGPU requires this alignment.
-    let (col_lights, col_lights_size) = greedy.finalize(
+    let /*(col_lights, col_lights_size)*/(col_lights_alloc_size, finalize) = greedy.finalize(
         Vec2::new((wgpu::COPY_BYTES_PER_ROW_ALIGNMENT / 4) as u16, 1),
     );
-
+    // Allocate the fresh mesh.
+    let mut col_lights = create_texture(col_lights_alloc_size);
+    let col_lights_size = col_lights.as_mut().map(|col_lights| {
+        finalize(bytemuck::cast_slice_mut(&mut col_lights.get_mapped_mut(0, col_lights.len())))
+    }).unwrap_or(Vec2::broadcast(0));
     (
         opaque_mesh,
         fluid_mesh,
diff --git a/voxygen/src/render/buffer.rs b/voxygen/src/render/buffer.rs
index 902c53a0aa..142bead1b5 100644
--- a/voxygen/src/render/buffer.rs
+++ b/voxygen/src/render/buffer.rs
@@ -16,14 +16,18 @@ impl<T: Copy + Pod> Buffer<T> {
                 label: None,
                 mapped_at_creation: true,
                 size: len as u64 * std::mem::size_of::<T>() as u64,
-                usage: usage | wgpu::BufferUsage::COPY_DST,
+                usage: usage,
             }),
             len,
             phantom_data: std::marker::PhantomData,
         }
     }
 
-    pub fn new(device: &wgpu::Device, usage: wgpu::BufferUsage, data: &[T]) -> Self {
+    /// NOTE: Queue is not *explicitly* used here, but create_buffer_init maps the buffer at
+    /// creation and then unmaps it, and that unmap implicitly uses the queue (within wgpu
+    /// internals) and requires acquiring a lock on it, so it's left in the API to deter people
+    /// from using it when the queue isn't available.
+    pub fn new(device: &wgpu::Device, _queue: &wgpu::Queue, usage: wgpu::BufferUsage, data: &[T]) -> Self {
         let contents = bytemuck::cast_slice(data);
 
         Self {
@@ -39,42 +43,6 @@ impl<T: Copy + Pod> Buffer<T> {
 
     #[allow(clippy::len_without_is_empty)]
     pub fn len(&self) -> usize { self.len }
-}
-
-pub struct DynamicBuffer<T: Copy + Pod>(Buffer<T>);
-
-impl<T: Copy + Pod> DynamicBuffer<T> {
-    pub fn new(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
-        let buffer = Buffer {
-            buf: device.create_buffer(&wgpu::BufferDescriptor {
-                label: None,
-                mapped_at_creation: false,
-                size: len as u64 * std::mem::size_of::<T>() as u64,
-                usage: usage | wgpu::BufferUsage::COPY_DST,
-            }),
-            len,
-            phantom_data: std::marker::PhantomData,
-        };
-        Self(buffer)
-    }
-
-    pub fn new_with_data(device: &wgpu::Device, usage: wgpu::BufferUsage, data: &[T]) -> Self {
-        Self(Buffer::new(device, usage | wgpu::BufferUsage::COPY_DST, data))
-    }
-
-    pub fn new_mapped(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
-        Self(Buffer::new_mapped(device, len, usage | wgpu::BufferUsage::COPY_DST))
-    }
-
-    pub fn update(&self, queue: &wgpu::Queue, vals: &[T], offset: usize) {
-        if !vals.is_empty() {
-            queue.write_buffer(
-                &self.buf,
-                offset as u64 * std::mem::size_of::<T>() as u64,
-                bytemuck::cast_slice(vals),
-            )
-        }
-    }
 
     /// Get the GPU-side mapped slice represented by this buffer handle, if it was previously
     /// memory mapped.
@@ -100,8 +68,8 @@ impl<T: Copy + Pod> DynamicBuffer<T> {
     /// unmapped), either directly or via [Buffer::new_mapped].
     ///
     /// NOTE: Queue is not *explicitly* used here, but it is implicitly used during the unmap
-    /// (within wgpu internals) and requires acquiring a lock on it, so it's left in the API to
-    /// deter people from using it when the queue isn't available.
+    /// (within wgpu internals) when mapped at creation, and requires acquiring a lock on it,
+    /// so it's left in the API to deter people from using it when the queue isn't available.
     pub fn unmap(&self, _queue: &wgpu::Queue/* , vals: &[T], offset: usize */) {
         /* if !vals.is_empty() {
             let contents = bytemuck::cast_slice(vals);
@@ -117,6 +85,42 @@ impl<T: Copy + Pod> DynamicBuffer<T> {
     }
 }
 
+pub struct DynamicBuffer<T: Copy + Pod>(Buffer<T>);
+
+impl<T: Copy + Pod> DynamicBuffer<T> {
+    pub fn new(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
+        let buffer = Buffer {
+            buf: device.create_buffer(&wgpu::BufferDescriptor {
+                label: None,
+                mapped_at_creation: false,
+                size: len as u64 * std::mem::size_of::<T>() as u64,
+                usage: usage,
+            }),
+            len,
+            phantom_data: std::marker::PhantomData,
+        };
+        Self(buffer)
+    }
+
+    pub fn new_with_data(device: &wgpu::Device, queue: &wgpu::Queue, usage: wgpu::BufferUsage, data: &[T]) -> Self {
+        Self(Buffer::new(device, queue, usage, data))
+    }
+
+    pub fn new_mapped(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
+        Self(Buffer::new_mapped(device, len, usage))
+    }
+
+    pub fn update(&self, queue: &wgpu::Queue, vals: &[T], offset: usize) {
+        if !vals.is_empty() {
+            queue.write_buffer(
+                &self.buf,
+                offset as u64 * std::mem::size_of::<T>() as u64,
+                bytemuck::cast_slice(vals),
+            )
+        }
+    }
+}
+
 impl<T: Copy + Pod> std::ops::Deref for DynamicBuffer<T> {
     type Target = Buffer<T>;
 
diff --git a/voxygen/src/render/consts.rs b/voxygen/src/render/consts.rs
index 2b7d0f826f..b0e1c1f737 100644
--- a/voxygen/src/render/consts.rs
+++ b/voxygen/src/render/consts.rs
@@ -10,27 +10,25 @@ pub struct Consts<T: Copy + Pod> {
 
 impl<T: Copy + Pod> Consts<T> {
     /// Create a new `Const<T>`.
-    pub fn new(device: &wgpu::Device, len: usize) -> Self {
+    pub fn new(device: &wgpu::Device, usage: wgpu::BufferUsage, len: usize) -> Self {
         Self {
             // TODO: examine if all our consts need to be updatable
-            buf: DynamicBuffer::new(device, len, wgpu::BufferUsage::UNIFORM),
+            buf: DynamicBuffer::new(device, len, wgpu::BufferUsage::COPY_DST | wgpu::BufferUsage::UNIFORM),
         }
     }
 
-    pub fn new_with_data(device: &wgpu::Device, data: &[T]) -> Self {
+    pub fn new_with_data(device: &wgpu::Device, queue: &wgpu::Queue, usage: wgpu::BufferUsage, data: &[T]) -> Self {
         Self {
-            // TODO: examine if all our consts need to be updatable
-            buf: DynamicBuffer::new_with_data(device, wgpu::BufferUsage::UNIFORM, data),
+            buf: DynamicBuffer::new_with_data(device, queue, usage | wgpu::BufferUsage::UNIFORM, data),
         }
     }
 
     /// Create a new `Const<T>` that is mapped at creation.
     ///
     /// Warning: buffer must be unmapped before attempting to use this buffer on the GPU!
-    pub fn new_mapped(device: &wgpu::Device, len: usize) -> Self {
+    pub fn new_mapped(device: &wgpu::Device, usage: wgpu::BufferUsage, len: usize) -> Self {
         Self {
-            // TODO: examine if all our consts need to be updatable
-            buf: DynamicBuffer::new_mapped(device, len, wgpu::BufferUsage::UNIFORM),
+            buf: DynamicBuffer::new_mapped(device, len, usage | wgpu::BufferUsage::UNIFORM),
         }
     }
 
diff --git a/voxygen/src/render/instances.rs b/voxygen/src/render/instances.rs
index 2b1607f4e1..afb02beec7 100644
--- a/voxygen/src/render/instances.rs
+++ b/voxygen/src/render/instances.rs
@@ -1,34 +1,38 @@
-use super::buffer::DynamicBuffer;
+use super::buffer::Buffer;
 use bytemuck::Pod;
 
 /// Represents a mesh that has been sent to the GPU.
 pub struct Instances<T: Copy + Pod> {
-    buf: DynamicBuffer<T>,
+    buf: Buffer<T>,
 }
 
 impl<T: Copy + Pod> Instances<T> {
-    pub fn new(device: &wgpu::Device, len: usize) -> Self {
+    pub fn new_mapped(device: &wgpu::Device, len: usize) -> Self {
         Self {
-            // TODO: examine if we have Instances that are not updated (e.g. sprites) and if there
-            // would be any gains from separating those out
-            buf: DynamicBuffer::new(device, len, wgpu::BufferUsage::VERTEX),
+            buf: Buffer::new_mapped(device, len, wgpu::BufferUsage::VERTEX),
         }
     }
 
-    pub fn new_with_data(device: &wgpu::Device, data: &[T]) -> Self {
+    pub fn new_with_data(device: &wgpu::Device, queue: &wgpu::Queue, data: &[T]) -> Self {
         Self {
-            // TODO: examine if we have Instances that are not updated (e.g. sprites) and if there
-            // would be any gains from separating those out
-            buf: DynamicBuffer::new_with_data(device, wgpu::BufferUsage::VERTEX, data),
+            buf: Buffer::new(device, queue, wgpu::BufferUsage::VERTEX, data),
         }
     }
 
+    /// Get the GPU-side mapped slice represented by this instances buffer, if it was previously
+    /// memory mapped.
+    pub fn get_mapped_mut(&self, offset: usize, len: usize) -> /* &mut [T] */wgpu::BufferViewMut<'_> {
+        self.buf.get_mapped_mut(offset, len)
+    }
+
+    /// Unmaps the GPU-side handle represented by this instances buffer, if it was previously
+    /// memory-mapped.
+    pub fn unmap(&self, queue: &wgpu::Queue) {
+        self.buf.unmap(queue);
+    }
+
     // TODO: count vs len naming scheme??
     pub fn count(&self) -> usize { self.buf.len() }
 
-    pub fn update(&mut self, queue: &wgpu::Queue, vals: &[T], offset: usize) {
-        self.buf.update(queue, vals, offset)
-    }
-
     pub fn buf(&self) -> &wgpu::Buffer { &self.buf.buf }
 }
diff --git a/voxygen/src/render/mod.rs b/voxygen/src/render/mod.rs
index 43cc32a836..2bcaacf2d5 100644
--- a/voxygen/src/render/mod.rs
+++ b/voxygen/src/render/mod.rs
@@ -35,7 +35,8 @@ pub use self::{
         skybox::{create_mesh as create_skybox_mesh, Vertex as SkyboxVertex},
         sprite::{
             Instance as SpriteInstance, SpriteGlobalsBindGroup, SpriteVerts,
-            Vertex as SpriteVertex, VERT_PAGE_SIZE as SPRITE_VERT_PAGE_SIZE,
+            Vertex as SpriteVertex, LOD_LEVELS as SPRITE_LOD_LEVELS,
+            VERT_PAGE_SIZE as SPRITE_VERT_PAGE_SIZE,
         },
         terrain::{Locals as TerrainLocals, TerrainLayout, Vertex as TerrainVertex},
         trail::Vertex as TrailVertex,
diff --git a/voxygen/src/render/model.rs b/voxygen/src/render/model.rs
index 7ae8d83f69..ace622e87b 100644
--- a/voxygen/src/render/model.rs
+++ b/voxygen/src/render/model.rs
@@ -30,16 +30,41 @@ pub struct Model<V: Vertex> {
 
 impl<V: Vertex> Model<V> {
     /// Returns None if the provided mesh is empty
-    pub fn new(device: &wgpu::Device, usage: wgpu::BufferUsage, mesh: &Mesh<V>) -> Option<Self> {
+    pub fn new(device: &wgpu::Device, queue: &wgpu::Queue, usage: wgpu::BufferUsage, mesh: &Mesh<V>) -> Option<Self> {
         if mesh.vertices().is_empty() {
             return None;
         }
 
         Some(Self {
-            vbuf: Buffer::new(device, /*wgpu::BufferUsage::VERTEX*/usage, mesh.vertices()),
+            vbuf: Buffer::new(device, queue, /*wgpu::BufferUsage::VERTEX*/usage, mesh.vertices()),
         })
     }
 
+    /// Create a new `Model<V>` that is mapped at creation.  Returns None if `len` is zero.
+    ///
+    /// Warning: buffer must be unmapped before attempting to use this buffer on the GPU!
+    pub fn new_mapped(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Option<Self> {
+        if len == 0 {
+            return None;
+        }
+
+        Some(Self {
+            vbuf: Buffer::new_mapped(device, len, /*wgpu::BufferUsage::VERTEX*/usage/*, mesh.vertices()*/),
+        })
+    }
+
+    /// Get the GPU-side mapped slice represented by this model handle, if it was previously
+    /// memory mapped.
+    pub fn get_mapped_mut(&self, offset: usize, len: usize) -> /* &mut [T] */wgpu::BufferViewMut<'_> {
+        self.vbuf.get_mapped_mut(offset, len)
+    }
+
+    /// Unmaps the GPU-side handle represented by this model handle, if it was previously
+    /// memory-mapped.
+    pub fn unmap(&self, queue: &wgpu::Queue) {
+        self.vbuf.unmap(queue);
+    }
+
     /// Create a model with a slice of a portion of this model to send to the
     /// renderer.
     pub fn submodel(&self, vertex_range: Range<u32>) -> SubModel<V> {
@@ -64,7 +89,7 @@ pub struct DynamicModel<V: Vertex> {
 impl<V: Vertex> DynamicModel<V> {
     pub fn new(device: &wgpu::Device, size: usize) -> Self {
         Self {
-            vbuf: DynamicBuffer::new(device, size, wgpu::BufferUsage::VERTEX),
+            vbuf: DynamicBuffer::new(device, size, wgpu::BufferUsage::VERTEX | wgpu::BufferUsage::COPY_DST),
         }
     }
 
diff --git a/voxygen/src/render/pipelines/sprite.rs b/voxygen/src/render/pipelines/sprite.rs
index c309a50bfb..8ea6aec0da 100644
--- a/voxygen/src/render/pipelines/sprite.rs
+++ b/voxygen/src/render/pipelines/sprite.rs
@@ -9,6 +9,7 @@ use std::mem;
 use vek::*;
 
 pub const VERT_PAGE_SIZE: u32 = 256;
+pub const LOD_LEVELS: usize = 5;
 
 #[repr(C)]
 #[derive(Copy, Clone, Debug, Zeroable, Pod)]
@@ -81,11 +82,13 @@ pub struct SpriteVerts(Buffer<Vertex>);
 
 pub(in super::super) fn create_verts_buffer(
     device: &wgpu::Device,
+    queue: &wgpu::Queue,
     mesh: Mesh<Vertex>,
 ) -> SpriteVerts {
     // TODO: type Buffer by wgpu::BufferUsage
     SpriteVerts(Buffer::new(
         device,
+        queue,
         wgpu::BufferUsage::STORAGE,
         mesh.vertices(),
     ))
diff --git a/voxygen/src/render/renderer.rs b/voxygen/src/render/renderer.rs
index 9a803fa18b..c3eef719bd 100644
--- a/voxygen/src/render/renderer.rs
+++ b/voxygen/src/render/renderer.rs
@@ -475,9 +475,9 @@ impl Renderer {
         )?;
 
         let clouds_locals =
-            Self::create_consts_inner(&device, &[clouds::Locals::default()]);
+            Self::create_consts_inner(&device, &queue, wgpu::BufferUsage::COPY_DST, &[clouds::Locals::default()]);
         let postprocess_locals =
-            Self::create_consts_inner(&device, &[postprocess::Locals::default()]);
+            Self::create_consts_inner(&device, &queue, wgpu::BufferUsage::COPY_DST, &[postprocess::Locals::default()]);
 
         let locals = Locals::new(
             &device,
@@ -488,7 +488,7 @@ impl Renderer {
             &views.tgt_depth,
             views.bloom_tgts.as_ref().map(|tgts| locals::BloomParams {
                 locals: bloom_sizes.map(|size| {
-                    Self::create_consts_inner(&device, &[bloom::Locals::new(size)])
+                    Self::create_consts_inner(&device, &queue, wgpu::BufferUsage::empty(), &[bloom::Locals::new(size)])
                 }),
                 src_views: [&views.tgt_color_pp, &tgts[1], &tgts[2], &tgts[3], &tgts[4]],
                 final_tgt_view: &tgts[0],
@@ -499,9 +499,9 @@ impl Renderer {
         );
 
         let quad_index_buffer_u16 =
-            create_quad_index_buffer_u16(&device, QUAD_INDEX_BUFFER_U16_VERT_LEN.into());
+            create_quad_index_buffer_u16(&device, &queue, QUAD_INDEX_BUFFER_U16_VERT_LEN.into());
         let quad_index_buffer_u32 =
-            create_quad_index_buffer_u32(&device, QUAD_INDEX_BUFFER_U32_START_VERT_LEN as usize);
+            create_quad_index_buffer_u32(&device, &queue, QUAD_INDEX_BUFFER_U32_START_VERT_LEN as usize);
         let mut profiler = wgpu_profiler::GpuProfiler::new(4, queue.get_timestamp_period());
         other_modes.profiler_enabled &= profiler_features_enabled;
         profiler.enable_timer = other_modes.profiler_enabled;
@@ -513,12 +513,15 @@ impl Renderer {
         let (maintain_tx, maintain_rx) = channel::bounded(0);
 
         let device_ = Arc::clone(&device);
-        std::thread::spawn(move || {
+        /* std::thread::spawn(move || {
             // Maintain each time we are requested to do so, until the renderer dies.
+            // Additionally, accepts CPU->GPU tasks containing updates to perform that need to lock
+            // the device (but not necessarily the queue?).  This is a hopefully temporary measure
+            // required because wgpu as currently written cannot help itself.
             while let Ok(()) = maintain_rx.recv() {
                 device_.poll(wgpu::Maintain::Poll);
             }
-        });
+        }); */
 
         #[cfg(feature = "egui-ui")]
         let egui_renderpass =
@@ -690,7 +693,7 @@ impl Renderer {
                 .as_ref()
                 .map(|tgts| locals::BloomParams {
                     locals: bloom_sizes.map(|size| {
-                        Self::create_consts_inner(&self.device, &[bloom::Locals::new(
+                        Self::create_consts_inner(&self.device, &self.queue, wgpu::BufferUsage::empty(), &[bloom::Locals::new(
                             size,
                         )])
                     }),
@@ -813,7 +816,8 @@ impl Renderer {
         // Since if the channel is out of capacity, it means a maintain is already being processed
         // (in which case we can just catch up next frame), this is a long-winded way of saying we
         // can ignore the result of try_send.
-        let _ = self.maintain_tx.try_send(());
+        // let _ = self.maintain_tx.try_send(());
+        self.device.poll(wgpu::Maintain::Poll);
     }
 
     /// Create render target views
@@ -1255,22 +1259,25 @@ impl Renderer {
     }
 
     /// Create a new set of constants with the provided values.
-    pub fn create_consts<T: Copy + bytemuck::Pod>(&mut self, vals: &[T]) -> Consts<T> {
-        Self::create_consts_inner(&self.device, vals)
+    pub fn create_consts<T: Copy + bytemuck::Pod>(&mut self, usage: wgpu::BufferUsage, vals: &[T]) -> Consts<T> {
+        Self::create_consts_inner(&self.device, &self.queue, usage, vals)
     }
 
     pub fn create_consts_inner<T: Copy + bytemuck::Pod>(
         device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        usage: wgpu::BufferUsage,
         vals: &[T],
     ) -> Consts<T> {
-        Consts::new_with_data(device, vals)
+        Consts::new_with_data(device, queue, usage, vals)
     }
 
     pub fn create_consts_mapped<T: Copy + bytemuck::Pod>(
         &mut self,
+        usage: wgpu::BufferUsage,
         len: usize,
     ) -> Consts<T> {
-        Consts::new_mapped(&self.device, len)
+        Consts::new_mapped(&self.device, usage, len)
     }
 
     /// Update a set of constants with the provided values.
@@ -1278,16 +1285,12 @@ impl Renderer {
         consts.update(&self.queue, vals, 0)
     }
 
-    /// Gets a memory mapped buffer of a set of constants.
-    pub fn get_consts_mapped<'a, T: Copy + bytemuck::Pod>(&self, consts: &'a Consts<T>) -> /* &'a mut [T] */wgpu::BufferViewMut<'a> {
-        consts.get_mapped_mut(0, consts.len())
-    }
-
-    /// Unmaps a set of memory mapped constants.
+    /// Unmaps a set of memory mapped consts.
     pub fn unmap_consts<T: Copy + bytemuck::Pod>(&self, consts: &Consts<T>) {
         consts.unmap(&self.queue)
     }
 
+
     pub fn update_clouds_locals(&mut self, new_val: clouds::Locals) {
         self.locals.clouds.update(&self.queue, &[new_val], 0)
     }
@@ -1301,16 +1304,21 @@ impl Renderer {
         &mut self,
         vals: &[T],
     ) -> Result<Instances<T>, RenderError> {
-        Ok(Instances::new_with_data(&self.device, vals))
+        Ok(Instances::new_with_data(&self.device, &self.queue, vals))
     }
 
-    /// Create a new set of instances with the provided values lazily (for use off the main
+    /// Create a new set of instances with the provided size lazily (for use off the main
     /// thread).
     pub fn create_instances_lazy<T: Copy + bytemuck::Pod>(
         &mut self,
-    ) -> impl for<'a> Fn(&'a [T]) -> Instances<T> + Send + Sync {
+    ) -> impl /*for<'a> */Fn(/* &'a [T]*/usize) -> Instances<T> + Send + Sync {
         let device = Arc::clone(&self.device);
-        move |vals| Instances::new_with_data(&device, &vals)
+        move |/*vals*/len| Instances::new_mapped(&device, len)/*Instances::new_with_data(&device, &vals)*/
+    }
+
+    /// Unmaps a set of memory mapped instances.
+    pub fn unmap_instances<T: Copy + bytemuck::Pod>(&self, instances: &Instances<T>) {
+        instances.unmap(&self.queue)
     }
 
     /// Update the expected index length to be large enough for a quad vertex bfufer with this many
@@ -1351,7 +1359,7 @@ impl Renderer {
                 if self.quad_index_buffer_u32.len() < quad_index_length {
                     // Make sure we aren't over the max
                     self.quad_index_buffer_u32 =
-                        create_quad_index_buffer_u32(&self.device, vert_length);
+                        create_quad_index_buffer_u32(&self.device, &self.queue, vert_length);
                 } */
             },
             None => {},
@@ -1369,33 +1377,53 @@ impl Renderer {
         let vert_length = self.quad_index_buffer_u32_len.load(Ordering::Relaxed);
         if self.quad_index_buffer_u32.len() < vert_length {
             self.quad_index_buffer_u32 =
-                create_quad_index_buffer_u32(&self.device, vert_length);
+                create_quad_index_buffer_u32(&self.device, &self.queue, vert_length);
         }
     }
 
     pub fn create_sprite_verts(&mut self, mesh: Mesh<sprite::Vertex>) -> sprite::SpriteVerts {
         Self::update_index_length::<sprite::Vertex>(&self.quad_index_buffer_u32_len, sprite::VERT_PAGE_SIZE as usize);
-        sprite::create_verts_buffer(&self.device, mesh)
+        sprite::create_verts_buffer(&self.device, &self.queue, mesh)
     }
 
     /// Create a new model from the provided mesh.
     /// If the provided mesh is empty this returns None
     pub fn create_model<V: Vertex>(&mut self, mesh: &Mesh<V>) -> Option<Model<V>> {
         Self::update_index_length::<V>(&self.quad_index_buffer_u32_len, mesh.vertices().len());
-        Model::new(&self.device, wgpu::BufferUsage::VERTEX, mesh)
+        Model::new(&self.device, &self.queue, wgpu::BufferUsage::VERTEX, mesh)
     }
 
-    /// Create a new model from the provided mesh, lazily (for use off the main thread).
-    /// If the provided mesh is empty this returns None
-    pub fn create_model_lazy<V: Vertex>(&mut self, usage: wgpu::BufferUsage) -> impl for<'a> Fn(&'a Mesh<V>) -> Option<Model<V>> + Send + Sync {
+    /// Create a new model for a mesh with the provided length, lazily (for use off the main
+    /// thread).  If the provided length is zero this returns None.  The model is memory mapped,
+    /// and still needs to be unmapped before use.
+    pub fn create_model_lazy_base<V: Vertex>(&mut self, usage: wgpu::BufferUsage) -> impl Fn(usize) -> Option<Model<V>> + Send + Sync {
         let device = Arc::clone(&self.device);
         let quad_index_buffer_u32_len = Arc::clone(&self.quad_index_buffer_u32_len);
-        move |mesh| {
-            Self::update_index_length::<V>(&quad_index_buffer_u32_len, mesh.vertices().len());
-            Model::new(&device, usage, mesh)
+        move |len| {
+            Self::update_index_length::<V>(&quad_index_buffer_u32_len, len);
+            Model::new_mapped(&device, len, usage/*, mesh.vertices()*/)
         }
     }
 
+    /// Create a new model from the provided mesh, lazily (for use off the main thread).  If the
+    /// provided mesh is empty this returns None.  The vertices are copied into the memory mapped
+    /// model, which still needs to be unmapped before use.
+    pub fn create_model_lazy<V: Vertex>(&mut self, usage: wgpu::BufferUsage) -> impl for<'a> Fn(&'a Mesh<V>) -> Option<Model<V>> + Send + Sync {
+        let create_model = self.create_model_lazy_base(usage);
+        move |mesh| {
+            let len = mesh.vertices().len();
+            let model = create_model(len)?;
+            model.get_mapped_mut(0, len)
+                .copy_from_slice(bytemuck::cast_slice(mesh.vertices()));
+            Some(model)
+        }
+    }
+
+    /// Unmaps a memory mapped model.
+    pub fn unmap_model<V: Vertex>(&self, model: &Model<V>) {
+        model.unmap(&self.queue);
+    }
+
     /// Create a new dynamic model with the specified size.
     pub fn create_dynamic_model<V: Vertex>(&mut self, size: usize) -> DynamicModel<V> {
         Self::update_index_length::<V>(&self.quad_index_buffer_u32_len, size);
@@ -1515,13 +1543,6 @@ impl Renderer {
         texture.clear(&self.queue)
     }
 
-    /// Replaces the destination texture with the contents of the source texture.
-    ///
-    /// The source size should at least fit within the destination texture's size.
-    pub fn replace_texture(&mut self, encoder: &mut wgpu::CommandEncoder, dest: &Texture, source: &Texture) {
-        dest.replace(&self.device, encoder, source);
-    }
-
     /// Queue to obtain a screenshot on the next frame render
     pub fn create_screenshot(
         &mut self,
@@ -1613,7 +1634,7 @@ impl Renderer {
     // }
 }
 
-fn create_quad_index_buffer_u16(device: &wgpu::Device, vert_length: usize) -> Buffer<u16> {
+fn create_quad_index_buffer_u16(device: &wgpu::Device, queue: &wgpu::Queue, vert_length: usize) -> Buffer<u16> {
     assert!(vert_length <= u16::MAX as usize);
     let indices = [0, 1, 2, 2, 1, 3]
         .iter()
@@ -1624,10 +1645,10 @@ fn create_quad_index_buffer_u16(device: &wgpu::Device, vert_length: usize) -> Bu
         .map(|(i, b)| (i / 6 * 4 + b) as u16)
         .collect::<Vec<_>>();
 
-    Buffer::new(device, wgpu::BufferUsage::INDEX, &indices)
+    Buffer::new(device, queue, wgpu::BufferUsage::INDEX, &indices)
 }
 
-fn create_quad_index_buffer_u32(device: &wgpu::Device, vert_length: usize) -> Buffer<u32> {
+fn create_quad_index_buffer_u32(device: &wgpu::Device, queue: &wgpu::Queue, vert_length: usize) -> Buffer<u32> {
     assert!(vert_length <= u32::MAX as usize);
     let indices = [0, 1, 2, 2, 1, 3]
         .iter()
@@ -1638,5 +1659,5 @@ fn create_quad_index_buffer_u32(device: &wgpu::Device, vert_length: usize) -> Bu
         .map(|(i, b)| (i / 6 * 4 + b) as u32)
         .collect::<Vec<_>>();
 
-    Buffer::new(device, wgpu::BufferUsage::INDEX, &indices)
+    Buffer::new(device, queue, wgpu::BufferUsage::INDEX, &indices)
 }
diff --git a/voxygen/src/render/renderer/binding.rs b/voxygen/src/render/renderer/binding.rs
index 7f34914bed..d041abe343 100644
--- a/voxygen/src/render/renderer/binding.rs
+++ b/voxygen/src/render/renderer/binding.rs
@@ -40,12 +40,12 @@ impl Renderer {
     }
 
     pub fn create_debug_bound_locals(&mut self, vals: &[debug::Locals]) -> debug::BoundLocals {
-        let locals = self.create_consts(vals);
+        let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, vals);
         self.layouts.debug.bind_locals(&self.device, locals)
     }
 
     pub fn create_ui_bound_locals(&mut self, vals: &[ui::Locals]) -> ui::BoundLocals {
-        let locals = self.create_consts(vals);
+        let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, vals);
         self.layouts.ui.bind_locals(&self.device, locals)
     }
 
@@ -58,22 +58,13 @@ impl Renderer {
         locals: &[figure::Locals],
         bone_data: &[figure::BoneData],
     ) -> figure::BoundLocals {
-        let locals = self.create_consts(locals);
-        let bone_data = self.create_consts(bone_data);
+        let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, locals);
+        let bone_data = self.create_consts(wgpu::BufferUsage::COPY_DST, bone_data);
         self.layouts
             .figure
             .bind_locals(&self.device, locals, bone_data)
     }
 
-    /* /// Create a new set of constants with the provided values, lazily (so this can be instantiated
-    /// from another thread).
-    pub fn create_consts_lazy<T: Copy + bytemuck::Pod>(&mut self) ->
-        impl for<'a> Fn(&'a [T]) -> Consts<T> + Send + Sync
-    {
-        let device = Arc::clone(&self.device);
-        move |vals| Self::create_consts_inner(&device, vals)
-    } */
-
     /// NOTE: Locals are mapped at creation, so you still have to memory map and bind them in order
     /// before use.
     pub fn create_terrain_bound_locals(
@@ -84,14 +75,14 @@ impl Renderer {
         /* let device = Arc::clone(&self.device);
         let immutable = Arc::clone(&self.layouts.immutable);
         move || {
-            let locals = Consts::new_mapped(&device, 1);
+            let locals = Consts::new_mapped(&device, wgpu::BufferUsage::empty(), 1);
             immutable.terrain.bind_locals(&device, locals)
         } */
         self.layouts.immutable.terrain.bind_locals(&self.device, locals/* , offset */)
     }
 
     pub fn create_shadow_bound_locals(&mut self, locals: &[shadow::Locals]) -> shadow::BoundLocals {
-        let locals = self.create_consts(locals);
+        let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, locals);
         self.layouts.shadow.bind_locals(&self.device, locals)
     }
 
@@ -99,7 +90,7 @@ impl Renderer {
         &mut self,
         locals: &[rain_occlusion::Locals],
     ) -> rain_occlusion::BoundLocals {
-        let locals = self.create_consts(locals);
+        let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, locals);
         self.layouts
             .rain_occlusion
             .bind_locals(&self.device, locals)
diff --git a/voxygen/src/render/renderer/drawer.rs b/voxygen/src/render/renderer/drawer.rs
index 743c99bb29..3d173536e7 100644
--- a/voxygen/src/render/renderer/drawer.rs
+++ b/voxygen/src/render/renderer/drawer.rs
@@ -976,7 +976,7 @@ impl<'pass_ref, 'pass: 'pass_ref> SpriteDrawer<'pass_ref, 'pass> {
     pub fn draw<'data: 'pass>(
         &mut self,
         &(terrain_locals_offset, ref terrain_locals): &'data (wgpu::DynamicOffset, terrain::BoundLocals),
-        instances: &'data Instances<sprite::Instance>,
+        (range, instances): (Range<u32>, &'data Instances<sprite::Instance>),
     ) {
         self.render_pass
             .set_bind_group(3, &terrain_locals.bind_group, &[terrain_locals_offset]);
@@ -986,7 +986,7 @@ impl<'pass_ref, 'pass: 'pass_ref> SpriteDrawer<'pass_ref, 'pass> {
         self.render_pass.draw_indexed(
             0..sprite::VERT_PAGE_SIZE / 4 * 6,
             0,
-            0..instances.count() as u32,
+            range,
         );
     }
 }
diff --git a/voxygen/src/render/texture.rs b/voxygen/src/render/texture.rs
index 65f174bb7d..85b7dfc6d8 100644
--- a/voxygen/src/render/texture.rs
+++ b/voxygen/src/render/texture.rs
@@ -193,7 +193,7 @@ impl Texture {
     /// Replaces this texture with the contents of another texture.
     ///
     /// The source size should at least fit within this texture's size.
-    pub fn replace<'a>(&self, device: &wgpu::Device, encoder: &mut wgpu::CommandEncoder, texture: &Self) {
+    pub fn replace<'a>(&self, encoder: &mut wgpu::CommandEncoder, texture: &Self) {
         // Copy image
         encoder.copy_texture_to_texture(
             wgpu::ImageCopyTexture {
diff --git a/voxygen/src/scene/figure/cache.rs b/voxygen/src/scene/figure/cache.rs
index d8962b8f54..5d6b67bfc4 100644
--- a/voxygen/src/scene/figure/cache.rs
+++ b/voxygen/src/scene/figure/cache.rs
@@ -463,8 +463,13 @@ where
                         make_model(generate_mesh_lod_low),
                     ];
 
+                    let (col_lights_alloc_size, finalize) = greedy.finalize(Vec2::broadcast(1));
+                    let mut col_lights = vec![[0; 4]; col_lights_alloc_size];
+                    let col_lights_size = finalize(&mut col_lights);
+                    let col_light = (col_lights, col_lights_size);
+
                     slot_.store(Some(MeshWorkerResponse {
-                        col_light: greedy.finalize(Vec2::broadcast(1)),
+                        col_light,
                         opaque,
                         bounds: figure_bounds,
                         vertex_range: models,
diff --git a/voxygen/src/scene/mod.rs b/voxygen/src/scene/mod.rs
index 5d7d66dab5..be37f451c5 100644
--- a/voxygen/src/scene/mod.rs
+++ b/voxygen/src/scene/mod.rs
@@ -282,9 +282,9 @@ impl Scene {
         let sprite_render_context = lazy_init(renderer);
 
         let data = GlobalModel {
-            globals: renderer.create_consts(&[Globals::default()]),
-            lights: renderer.create_consts(&[Light::default(); MAX_LIGHT_COUNT]),
-            shadows: renderer.create_consts(&[Shadow::default(); MAX_SHADOW_COUNT]),
+            globals: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Globals::default()]),
+            lights: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Light::default(); MAX_LIGHT_COUNT]),
+            shadows: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Shadow::default(); MAX_SHADOW_COUNT]),
             shadow_mats: renderer.create_shadow_bound_locals(&[ShadowLocals::default()]),
             rain_occlusion_mats: renderer
                 .create_rain_occlusion_bound_locals(&[RainOcclusionLocals::default()]),
diff --git a/voxygen/src/scene/simple.rs b/voxygen/src/scene/simple.rs
index c89cb35bd9..e27fff81e5 100644
--- a/voxygen/src/scene/simple.rs
+++ b/voxygen/src/scene/simple.rs
@@ -109,9 +109,9 @@ impl Scene {
         let mut col_lights = FigureColLights::new(renderer);
 
         let data = GlobalModel {
-            globals: renderer.create_consts(&[Globals::default()]),
-            lights: renderer.create_consts(&[Light::default(); 20]),
-            shadows: renderer.create_consts(&[Shadow::default(); 24]),
+            globals: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Globals::default()]),
+            lights: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Light::default(); 20]),
+            shadows: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Shadow::default(); 24]),
             shadow_mats: renderer.create_shadow_bound_locals(&[ShadowLocals::default()]),
             rain_occlusion_mats: renderer
                 .create_rain_occlusion_bound_locals(&[RainOcclusionLocals::default()]),
@@ -145,9 +145,15 @@ impl Scene {
                 // total size is bounded by 2^24 * 3 * 1.5 which is bounded by
                 // 2^27, which fits in a u32.
                 let range = 0..opaque_mesh.vertices().len() as u32;
+
+                let (col_lights_alloc_size, finalize) = greedy.finalize(Vec2::broadcast(1));
+                let mut col_light = vec![[0; 4]; col_lights_alloc_size];
+                let col_lights_size = finalize(&mut col_light);
+                let col_light = (col_light, col_lights_size);
+
                 let model =
                     col_lights
-                        .create_figure(renderer, greedy.finalize(Vec2::broadcast(1)), (opaque_mesh, bounds), [range]);
+                        .create_figure(renderer, col_light, (opaque_mesh, bounds), [range]);
                 let mut buf = [Default::default(); anim::MAX_BONE_COUNT];
                 let common_params = FigureUpdateCommonParameters {
                     entity: None,
diff --git a/voxygen/src/scene/terrain.rs b/voxygen/src/scene/terrain.rs
index 3115db0140..b126195db2 100644
--- a/voxygen/src/scene/terrain.rs
+++ b/voxygen/src/scene/terrain.rs
@@ -12,7 +12,7 @@ use crate::{
         pipelines::{self, ColLights},
         ColLightInfo, Consts, FirstPassDrawer, FluidVertex, GlobalModel, Instances, LodData, Mesh, Model,
         RenderError, Renderer, SpriteGlobalsBindGroup, SpriteInstance, SpriteVertex, SpriteVerts,
-        TerrainLocals, TerrainShadowDrawer, TerrainVertex, Texture, SPRITE_VERT_PAGE_SIZE,
+        TerrainLocals, TerrainShadowDrawer, TerrainVertex, Texture, SPRITE_LOD_LEVELS, SPRITE_VERT_PAGE_SIZE,
     },
 };
 
@@ -46,7 +46,6 @@ use treeculler::{BVol, Frustum, AABB};
 use vek::*;
 
 const SPRITE_SCALE: Vec3<f32> = Vec3::new(1.0 / 11.0, 1.0 / 11.0, 1.0 / 11.0);
-const SPRITE_LOD_LEVELS: usize = 5;
 
 // For rain occlusion we only need to render the closest chunks.
 /// How many chunks are maximally rendered for rain occlusion.
@@ -91,7 +90,7 @@ pub struct TerrainChunkData {
     col_lights: Arc<ColLights<pipelines::terrain::Locals>>,
     light_map: LightMapFn,
     glow_map: LightMapFn,
-    sprite_instances: [Instances<SpriteInstance>; SPRITE_LOD_LEVELS],
+    sprite_instances: ([core::ops::Range<u32>; SPRITE_LOD_LEVELS], Instances<SpriteInstance>),
     locals: (wgpu::DynamicOffset, pipelines::terrain::BoundLocals),
     pub blocks_of_interest: BlocksOfInterest,
 
@@ -143,7 +142,7 @@ pub struct MeshWorkerResponseMesh {
 /// mesh of a chunk.
 struct MeshWorkerResponse {
     pos: Vec2<i32>,
-    sprite_instances: [Instances<SpriteInstance>; SPRITE_LOD_LEVELS],
+    sprite_instances: ([core::ops::Range<u32>; SPRITE_LOD_LEVELS], Instances<SpriteInstance>),
     /// If None, this update was requested without meshing.
     mesh: Option<MeshWorkerResponseMesh>,
     started_tick: u64,
@@ -259,9 +258,9 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
     sprite_config: &SpriteSpec,
     create_opaque: impl for<'a> Fn(&'a Mesh<TerrainVertex>) -> Option<Model<TerrainVertex>>,
     create_fluid: impl for<'a> Fn(&'a Mesh<FluidVertex>) -> Option<Model<FluidVertex>>,
-    create_instances: impl for<'a> Fn(&'a [SpriteInstance]) -> Instances<SpriteInstance>,
+    create_instances: impl for<'a> Fn(/* &'a [SpriteInstance] */usize) -> Instances<SpriteInstance>,
     /* create_locals: impl Fn() -> pipelines::terrain::BoundLocals, */
-    create_texture: impl for<'a> Fn(/* wgpu::TextureDescriptor<'a>, wgpu::TextureViewDescriptor<'a>, wgpu::SamplerDescriptor<'a>*/&'a Mesh<[u8; 4]>) -> /*Texture + Send + Sync*/Option<Model<[u8; 4]>>,
+    create_texture: impl for<'a> Fn(/* wgpu::TextureDescriptor<'a>, wgpu::TextureViewDescriptor<'a>, wgpu::SamplerDescriptor<'a>*//*&'a Mesh<[u8; 4]>*/usize) -> /*Texture + Send + Sync*/Option<Model<[u8; 4]>>,
 ) -> MeshWorkerResponse {
     span!(_guard, "mesh_worker");
     let (blocks_of_interest, sprite_kinds) = BlocksOfInterest::from_chunk(&chunk)/*default()*/;
@@ -281,15 +280,16 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
         let (opaque_mesh, fluid_mesh, _shadow_mesh, (bounds, col_lights_info, light_map, glow_map)) =
             generate_mesh(
                 &volume,
+                create_texture,
                 (
                     range,
                     Vec2::new(max_texture_size, max_texture_size),
                     &blocks_of_interest,
                 ),
             );
-        let mut tex_ = Mesh::new();
+        /* let mut tex_ = Mesh::new();
         *tex_.vertices_mut_vec() = col_lights_info.0;
-        let tex = create_texture(&tex_);
+        let tex = create_texture(&tex_); */
         mesh = Some(MeshWorkerResponseMesh {
             // TODO: Take sprite bounds into account somehow?
             z_bounds: (bounds.min.z, bounds.max.z),
@@ -301,7 +301,7 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
             opaque_model: create_opaque(&opaque_mesh),
             fluid_model: create_fluid(&fluid_mesh),
             /* locals: create_locals(), */
-            col_lights_info: (tex, col_lights_info.1),
+            col_lights_info/*: (tex, col_lights_info.1)*/,
             light_map,
             glow_map,
         });
@@ -383,7 +383,25 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
             } */
             }
 
-            instances.map(|instances| create_instances(&instances))
+            let mut start = 0;
+            let instance_ranges = instances.each_ref().map(|instances| {
+                let range = start..start + instances.len() as u32;
+                start = range.end;
+                range
+            });
+            let sprite_instances = create_instances(instance_ranges.iter().map(|range| range.len()).sum());
+            if start > 0 {
+                sprite_instances
+                    .get_mapped_mut(0, sprite_instances.count())
+                    .array_chunks_mut::<{ core::mem::size_of::<SpriteInstance>() }>()
+                    .zip(instances.into_iter().flatten()).for_each(|(dst, src)| {
+                    // FIXME: cast doesn't work because bytemuck::cast isn't const generic-ified
+                    // yet, so it fails on some array lengths.
+                    // *dst = bytemuck::cast(src);
+                    dst.copy_from_slice(bytemuck::cast_slice(&[src]));
+                });
+            }
+            (instance_ranges, sprite_instances)
         },
         mesh,
         blocks_of_interest,
@@ -601,7 +619,10 @@ impl SpriteRenderContext {
 
             let sprite_col_lights = {
                 prof_span!("finalize");
-                greedy.finalize(Vec2::broadcast(1))
+                let (col_lights_alloc_size, finalize) = greedy.finalize(Vec2::broadcast(1));
+                let mut col_lights = vec![[0; 4]; col_lights_alloc_size];
+                let col_lights_size = finalize(&mut col_lights);
+                (col_lights, col_lights_size)
             };
 
             SpriteWorkerResponse {
@@ -799,7 +820,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
         // a copy from the previous atlas, skipping the CPU->GPU upload.
         if let Some((old_texture, encoder)) = old_texture {
             // TODO: Delay submission, don't just submit immediately out of convenience!
-            renderer.replace_texture(encoder, &texture, old_texture);
+            texture.replace(encoder, old_texture);
         } else {
             renderer.clear_texture(&texture);
         }
@@ -1286,7 +1307,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
             let create_fluid = renderer.create_model_lazy(wgpu::BufferUsage::VERTEX);
             let create_instances = renderer.create_instances_lazy();
             /* let create_locals = renderer.create_terrain_bound_locals(); */
-            let create_texture = renderer./*create_texture_raw*/create_model_lazy(wgpu::BufferUsage::COPY_SRC);
+            let create_texture = renderer./*create_texture_raw*/create_model_lazy_base(wgpu::BufferUsage::COPY_SRC);
             /* cnt.fetch_add(1, Ordering::Relaxed); */
             let job = move || {
                 // Since this loads when the task actually *runs*, rather than when it's
@@ -1347,8 +1368,8 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
         if max_recv_count > 0 {
         // Construct a buffer for all the chunks we're going to process in this frame.  There might
         // be some unused slots, which is fine.
-        let locals = /*Arc::new(*/renderer.create_consts_mapped(max_recv_count as usize)/*)*/;
-        let mut locals_buffer = renderer.get_consts_mapped(&locals);
+        let locals = /*Arc::new(*/renderer.create_consts_mapped(wgpu::BufferUsage::empty(), max_recv_count as usize)/*)*/;
+        let mut locals_buffer = locals.get_mapped_mut(0, locals.len());
         let mut locals_bound = renderer.create_terrain_bound_locals(&locals/*, locals_offset */);
         let mut encoder = renderer.device
             .create_command_encoder(&wgpu::CommandEncoderDescriptor {
@@ -1356,25 +1377,26 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
             });
 
         for (locals_offset, (response, locals_buffer)) in incoming_chunks.zip(locals_buffer.array_chunks_mut::<{ core::mem::size_of::<TerrainLocals>() }>()).enumerate() {
-            match self.mesh_todo.get(&response.pos) {
+            let pos = response.pos;
+            let response_started_tick = response.started_tick;
+            match self.mesh_todo.get(&pos) {
                 // It's the mesh we want, insert the newly finished model into the terrain model
                 // data structure (convert the mesh to a model first of course).
                 Some(todo) => {
                     let started_tick = todo.started_tick.load(Ordering::Relaxed);
-                    if response.started_tick > started_tick {
+                    if response_started_tick > started_tick {
                         // Chunk must have been removed, or it was spawned on an old tick. Drop
-                        // the mesh since it's either out of date or no longer needed.
+                        // the mesh in the background since it's either out of date or no longer
+                        // needed.
+                        slowjob.spawn(&"TERRAIN_DROP", move || { drop(response); });
                         continue;
                     }
 
-                    let sprite_instances = response.sprite_instances;
-
                     if let Some(mut mesh) = response.mesh {
                         // Full update, insert the whole chunk.
-
                         let load_time = self
                             .chunks
-                            .get(&response.pos)
+                            .get(&pos)
                             .map(|chunk| chunk.load_time)
                             .unwrap_or(current_time as f32);
                         // TODO: Allocate new atlas on allocation failure.
@@ -1422,6 +1444,16 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
                                 .expect("Chunk data does not fit in a texture of maximum size.")
                         });
 
+                        // Unmap buffers mapped on other threads (we do this here to avoid
+                        // contention with queue submission, as both of these take the device write
+                        // lock as of wgpu 0.8.1).
+                        //
+                        // FIXME: When we upgrade wgpu, reconsider all this.
+                        renderer.unmap_instances(&response.sprite_instances.1);
+                        mesh.opaque_model.as_ref().map(|model| renderer.unmap_model(model));
+                        mesh.fluid_model.as_ref().map(|model| renderer.unmap_model(model));
+                        renderer.unmap_model(&tex);
+
                         // NOTE: Cast is safe since the origin was a u16.
                         let atlas_offs = Vec2::new(
                             allocation.rectangle.min.x as u32,
@@ -1467,7 +1499,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
                         let locals_buffer_ =
                         /* renderer.update_mapped(&mut mesh.locals, &[*/TerrainLocals::new(
                             Vec3::from(
-                                response.pos.map2(VolGrid2d::<V>::chunk_size(), |e, sz| {
+                                pos.map2(VolGrid2d::<V>::chunk_size(), |e, sz| {
                                     e as f32 * sz as f32
                                 }),
                             ),
@@ -1477,7 +1509,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
                         *locals_buffer = bytemuck::cast(locals_buffer_);
 
                         /* let locals = Arc::clone(&locals); */
-                        Self::insert_chunk(&slowjob, &mut self.chunks, &mut self.atlas, response.pos, TerrainChunkData {
+                        Self::insert_chunk(&slowjob, &mut self.chunks, &mut self.atlas, pos, TerrainChunkData {
                             load_time,
                             opaque_model: mesh.opaque_model,
                             fluid_model: mesh.fluid_model,
@@ -1485,7 +1517,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
                             col_lights: Arc::clone(&self.col_lights),
                             light_map: mesh.light_map,
                             glow_map: mesh.glow_map,
-                            sprite_instances,
+                            sprite_instances: response.sprite_instances,
                             locals: /* mesh.locals *//*renderer.create_terrain_bound_locals(&locals/*, locals_offset */)*/
                                 ((locals_offset * core::mem::size_of::<TerrainLocals>()) as wgpu::DynamicOffset, Arc::clone(&locals_bound)),
                             visible: Visibility {
@@ -1499,20 +1531,27 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
                             shadow_z_bounds: mesh.shadow_z_bounds,
                             frustum_last_plane_index: 0,
                         });
-                    } else if let Some(chunk) = self.chunks.get_mut(&response.pos) {
+                    } else if let Some(chunk) = self.chunks.get_mut(&pos) {
                         // There was an update that didn't require a remesh (probably related to
                         // non-glowing sprites) so we just update those.
-                        chunk.sprite_instances = sprite_instances;
+                        chunk.sprite_instances = response.sprite_instances;
                         chunk.blocks_of_interest = response.blocks_of_interest;
+                    } else {
+                        // No remesh was requested and the chunk is not in `self.chunks`, so there
+                        // is nothing to update; drop the response in the background.
+                        slowjob.spawn(&"TERRAIN_DROP", move || { drop(response); });
                     }
 
-                    if response.started_tick == started_tick {
+                    if response_started_tick == started_tick {
                         // This was the latest worker for this chunk, so we don't need to worry
                         // about canceling any later tasks.
-                        self.mesh_todo.remove(&response.pos);
+                        self.mesh_todo.remove(&pos);
                     }
                 },
-                None => {},
+                // Old task, drop the response in the background.
+                None => {
+                    slowjob.spawn(&"TERRAIN_DROP", move || { drop(response); });
+                },
             }
         }
         // Drop the memory mapping and unmap the locals.
@@ -1923,7 +1962,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
             .filter(|(_, c)| c.visible.is_visible())
             .for_each(|(pos, chunk)| {
                 // Skip chunk if it has no sprites
-                if chunk.sprite_instances[0].count() == 0 {
+                if chunk.sprite_instances.1.count() == 0 {
                     return;
                 }
 
@@ -1949,7 +1988,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
                         4
                     };
 
-                    sprite_drawer.draw(&chunk.locals, &chunk.sprite_instances[lod_level]);
+                    sprite_drawer.draw(&chunk.locals, (chunk.sprite_instances.0[lod_level].clone(), &chunk.sprite_instances.1));
                 }
             });
         drop(sprite_drawer);