All unmaps on the main thread, sprites consolidated into a buffer.

Joshua Yanovski 2022-08-16 21:32:03 -07:00
parent 2301b7f47a
commit 6adfa6680f
17 changed files with 308 additions and 188 deletions


@ -10,9 +10,9 @@ pub struct Scene {
impl Scene {
pub fn new(renderer: &mut Renderer) -> Self {
let global_data = GlobalModel {
globals: renderer.create_consts(&[Globals::default()]),
lights: renderer.create_consts(&[Light::default(); 32]),
shadows: renderer.create_consts(&[Shadow::default(); 32]),
globals: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Globals::default()]),
lights: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Light::default(); 32]),
shadows: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Shadow::default(); 32]),
shadow_mats: renderer.create_shadow_bound_locals(&[ShadowLocals::default()]),
rain_occlusion_mats: renderer
.create_rain_occlusion_bound_locals(&[RainOcclusionLocals::default()]),


@ -81,7 +81,7 @@ pub struct GreedyConfig<D, FV, FA, FL, FG, FO, FS, FP, FT> {
/// coloring part as a continuation. When called with a final tile size and
/// vector, the continuation will consume the color data and write it to the
/// vector.
pub type SuspendedMesh<'a> = dyn for<'r> FnOnce(&'r mut ColLightInfo) + 'a;
pub type SuspendedMesh<'a> = dyn for<'r> FnOnce(/*&'r mut ColLightInfo*/(&'r mut [[u8; 4]], Vec2<u16>)) + 'a;
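// Illustrative sketch only (the real continuations are built inside GreedyMesh):
// a continuation of this shape receives the final atlas slice and its size, and
// writes its color/light texels into the slice, e.g.:
//
// let cont: Box<SuspendedMesh<'_>> = Box::new(move |(col_lights, cur_size)| {
//     draw_col_lights((col_lights, cur_size), /* ...captured meshing state... */);
// });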
/// Abstraction over different atlas allocators. Useful to swap out the
/// allocator implementation for specific cases (e.g. sprites).
@ -418,24 +418,41 @@ impl<'a, Allocator: AtlasAllocator> GreedyMesh<'a, Allocator> {
/// are known, we can perform just a single allocation to construct a
/// precisely fitting atlas. This will also let us (in the future)
/// suspend meshing partway through in order to meet frame budget, and
/// potentially use a single staged upload to the GPU.
/// allows us to use a single staged upload to the GPU.
///
/// Returns the ColLightsInfo corresponding to the constructed atlas.
pub fn finalize(self, alignment: Vec2<u16>) -> ColLightInfo {
span!(_guard, "finalize", "GreedyMesh::finalize");
/// The buffer drawn into by the returned closure may be either a staging buffer for
/// upload to the GPU, or any other suitably sized buffer.
///
/// Returns a tuple containing the size of the required buffer, and a function that, when
/// applied to a buffer allocated with that size, will produce the correct bounds for the
/// texture (which can then be bundled up into a ColLightInfo, if need be). The reason
/// for this awkward API is to allow consumers to create a mapped buffer with the correct
/// size, then write to it directly, rather than introducing a second staging copy.
pub fn finalize(
self,
alignment: Vec2<u16>,
) -> (usize, impl for<'b> FnOnce(&'b mut [[u8; 4]]) -> Vec2<u16> + 'a)
{
let mut cur_size = self.col_lights_size;
// Round to nearest alignment (assuming power of 2)
cur_size.x = (cur_size.x + alignment.x - 1) / alignment.x * alignment.x;
cur_size.y = (cur_size.y + alignment.y - 1) / alignment.y * alignment.y;
/* let col_lights = make_buffer(cur_size.x as usize * cur_size.y as usize);
let col_lights = vec![
TerrainVertex::make_col_light(254, 0, Rgb::broadcast(254), true);
cur_size.x as usize * cur_size.y as usize
];
let mut col_lights_info = (col_lights, cur_size);
self.suspended.into_iter().for_each(|cont| {
cont(&mut col_lights_info);
});
col_lights_info
]; */
let alloc_size = cur_size.x as usize * cur_size.y as usize;
(alloc_size, move |col_lights| {
span!(_guard, "finalize", "GreedyMesh::finalize");
assert!(col_lights.len() == alloc_size);
self.suspended.into_iter().for_each(move |cont| {
let col_lights_info = (&mut *col_lights, cur_size);
cont(/*&mut */col_lights_info);
});
/* col_lights_info */
cur_size
})
}
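// How the two-phase API above is consumed (mirroring the figure and lod call
// sites later in this commit): ask for the required size, allocate any suitably
// sized buffer, then run the closure to draw the texels and learn the final size.
//
// let (col_lights_alloc_size, finalize) = greedy.finalize(Vec2::broadcast(1));
// let mut col_lights = vec![[0u8; 4]; col_lights_alloc_size];
// let col_lights_size = finalize(&mut col_lights);
// let col_light_info = (col_lights, col_lights_size);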
pub fn max_size(&self) -> Vec2<u16> { self.max_size }
@ -783,7 +800,7 @@ fn add_to_atlas<Allocator: AtlasAllocator>(
//
// TODO: See if we can speed this up using SIMD.
fn draw_col_lights<D>(
(col_lights, cur_size): &mut ColLightInfo,
(col_lights, cur_size): /*&mut ColLightInfo*/(&mut [[u8; 4]], Vec2<u16>),
data: &mut D,
todo_rects: Vec<TodoRect>,
draw_delta: Vec3<i32>,
@ -793,6 +810,7 @@ fn draw_col_lights<D>(
mut get_opacity: impl FnMut(&mut D, Vec3<i32>) -> bool,
mut make_face_texel: impl FnMut(&mut D, Vec3<i32>, u8, u8, bool) -> [u8; 4],
) {
let col_lights = &mut col_lights[0..cur_size.y as usize * cur_size.x as usize];
todo_rects.into_iter().for_each(|(pos, uv, rect, delta)| {
// NOTE: Conversions are safe because width, height, and offset must be
// non-negative, and because every allocated coordinate in the atlas must be in


@ -5,7 +5,7 @@ use crate::{
greedy::{self, GreedyConfig, GreedyMesh},
MeshGen,
},
render::{ColLightInfo, FluidVertex, Mesh, TerrainVertex},
render::{ColLightInfo, FluidVertex, Mesh, Model, TerrainVertex},
scene::terrain::BlocksOfInterest,
};
use common::{
@ -332,6 +332,7 @@ type V = TerrainChunk;
#[inline(always)]
pub fn generate_mesh<'a/*, V: RectRasterableVol<Vox = Block> + ReadVol + Debug + 'static*/>(
vol: &'a VolGrid2d<V>,
create_texture: impl Fn(usize) -> Option<Model<[u8; 4]>>,
(range, max_texture_size, boi): (Aabb<i32>, Vec2<u16>, &'a BlocksOfInterest),
) -> MeshGen<
TerrainVertex,
@ -339,7 +340,7 @@ pub fn generate_mesh<'a/*, V: RectRasterableVol<Vox = Block> + ReadVol + Debug +
TerrainVertex,
(
Aabb<f32>,
ColLightInfo,
/*ColLightInfo*/(Option<Model<[u8; 4]>>, Vec2<u16>),
Arc<dyn Fn(Vec3<i32>) -> f32 + Send + Sync>,
Arc<dyn Fn(Vec3<i32>) -> f32 + Send + Sync>,
),
@ -997,10 +998,14 @@ pub fn generate_mesh<'a/*, V: RectRasterableVol<Vox = Block> + ReadVol + Debug +
max: max_bounds + min_bounds,
};
// WGPU requires this alignment.
let (col_lights, col_lights_size) = greedy.finalize(
let /*(col_lights, col_lights_size)*/(col_lights_alloc_size, finalize) = greedy.finalize(
Vec2::new((wgpu::COPY_BYTES_PER_ROW_ALIGNMENT / 4) as u16, 1),
);
// Allocate the fresh mesh.
let mut col_lights = create_texture(col_lights_alloc_size);
let col_lights_size = col_lights.as_mut().map(|col_lights| {
finalize(bytemuck::cast_slice_mut(&mut col_lights.get_mapped_mut(0, col_lights.len())))
}).unwrap_or(Vec2::broadcast(0));
(
opaque_mesh,
fluid_mesh,


@ -16,14 +16,18 @@ impl<T: Copy + Pod> Buffer<T> {
label: None,
mapped_at_creation: true,
size: len as u64 * std::mem::size_of::<T>() as u64,
usage: usage | wgpu::BufferUsage::COPY_DST,
usage,
}),
len,
phantom_data: std::marker::PhantomData,
}
}
pub fn new(device: &wgpu::Device, usage: wgpu::BufferUsage, data: &[T]) -> Self {
/// NOTE: Queue is not *explicitly* used here, but it is implicitly used during the unmap
/// (within wgpu internals) that create_buffer_init performs for buffers mapped at creation,
/// and that unmap requires acquiring a lock on the queue, so it's left in the API to deter
/// people from using it when the queue isn't available.
pub fn new(device: &wgpu::Device, _queue: &wgpu::Queue, usage: wgpu::BufferUsage, data: &[T]) -> Self {
let contents = bytemuck::cast_slice(data);
Self {
@ -39,42 +43,6 @@ impl<T: Copy + Pod> Buffer<T> {
#[allow(clippy::len_without_is_empty)]
pub fn len(&self) -> usize { self.len }
}
pub struct DynamicBuffer<T: Copy + Pod>(Buffer<T>);
impl<T: Copy + Pod> DynamicBuffer<T> {
pub fn new(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
let buffer = Buffer {
buf: device.create_buffer(&wgpu::BufferDescriptor {
label: None,
mapped_at_creation: false,
size: len as u64 * std::mem::size_of::<T>() as u64,
usage: usage | wgpu::BufferUsage::COPY_DST,
}),
len,
phantom_data: std::marker::PhantomData,
};
Self(buffer)
}
pub fn new_with_data(device: &wgpu::Device, usage: wgpu::BufferUsage, data: &[T]) -> Self {
Self(Buffer::new(device, usage | wgpu::BufferUsage::COPY_DST, data))
}
pub fn new_mapped(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
Self(Buffer::new_mapped(device, len, usage | wgpu::BufferUsage::COPY_DST))
}
pub fn update(&self, queue: &wgpu::Queue, vals: &[T], offset: usize) {
if !vals.is_empty() {
queue.write_buffer(
&self.buf,
offset as u64 * std::mem::size_of::<T>() as u64,
bytemuck::cast_slice(vals),
)
}
}
/// Get the GPU-side mapped slice represented by this buffer handle, if it was previously
/// memory mapped.
@ -100,8 +68,8 @@ impl<T: Copy + Pod> DynamicBuffer<T> {
/// unmapped), either directly or via [Buffer::new_mapped].
///
/// NOTE: Queue is not *explicitly* used here, but it is implicitly used during the unmap
/// (within wgpu internals) and requires acquiring a lock on it, so it's left in the API to
/// deter people from using it when the queue isn't available.
/// (within wgpu internals) when mapped at creation, and requires acquiring a lock on it,
/// so it's left in the API to deter people from using it when the queue isn't available.
pub fn unmap(&self, _queue: &wgpu::Queue/* , vals: &[T], offset: usize */) {
/* if !vals.is_empty() {
let contents = bytemuck::cast_slice(vals);
@ -117,6 +85,42 @@ impl<T: Copy + Pod> DynamicBuffer<T> {
}
}
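// Intended lifecycle for a buffer mapped at creation, using the API above
// (`device` and `queue` are assumed in scope): fill it through the mapped view,
// possibly off-thread, then unmap it where the queue lock is available.
//
// let buf: Buffer<u32> = Buffer::new_mapped(&device, 4, wgpu::BufferUsage::VERTEX);
// buf.get_mapped_mut(0, buf.len())
//     .copy_from_slice(bytemuck::cast_slice(&[1u32, 2, 3, 4]));
// buf.unmap(&queue);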
pub struct DynamicBuffer<T: Copy + Pod>(Buffer<T>);
impl<T: Copy + Pod> DynamicBuffer<T> {
pub fn new(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
let buffer = Buffer {
buf: device.create_buffer(&wgpu::BufferDescriptor {
label: None,
mapped_at_creation: false,
size: len as u64 * std::mem::size_of::<T>() as u64,
usage,
}),
len,
phantom_data: std::marker::PhantomData,
};
Self(buffer)
}
pub fn new_with_data(device: &wgpu::Device, queue: &wgpu::Queue, usage: wgpu::BufferUsage, data: &[T]) -> Self {
Self(Buffer::new(device, queue, usage, data))
}
pub fn new_mapped(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Self {
Self(Buffer::new_mapped(device, len, usage))
}
pub fn update(&self, queue: &wgpu::Queue, vals: &[T], offset: usize) {
if !vals.is_empty() {
queue.write_buffer(
&self.buf,
offset as u64 * std::mem::size_of::<T>() as u64,
bytemuck::cast_slice(vals),
)
}
}
}
impl<T: Copy + Pod> std::ops::Deref for DynamicBuffer<T> {
type Target = Buffer<T>;


@ -10,27 +10,25 @@ pub struct Consts<T: Copy + Pod> {
impl<T: Copy + Pod> Consts<T> {
/// Create a new `Const<T>`.
pub fn new(device: &wgpu::Device, len: usize) -> Self {
pub fn new(device: &wgpu::Device, usage: wgpu::BufferUsage, len: usize) -> Self {
Self {
// TODO: examine if all our consts need to be updatable
buf: DynamicBuffer::new(device, len, wgpu::BufferUsage::UNIFORM),
buf: DynamicBuffer::new(device, len, wgpu::BufferUsage::COPY_DST | wgpu::BufferUsage::UNIFORM),
}
}
pub fn new_with_data(device: &wgpu::Device, data: &[T]) -> Self {
pub fn new_with_data(device: &wgpu::Device, queue: &wgpu::Queue, usage: wgpu::BufferUsage, data: &[T]) -> Self {
Self {
// TODO: examine if all our consts need to be updatable
buf: DynamicBuffer::new_with_data(device, wgpu::BufferUsage::UNIFORM, data),
buf: DynamicBuffer::new_with_data(device, queue, usage | wgpu::BufferUsage::UNIFORM, data),
}
}
/// Create a new `Const<T>` that is mapped at creation.
///
/// Warning: buffer must be unmapped before attempting to use this buffer on the GPU!
pub fn new_mapped(device: &wgpu::Device, len: usize) -> Self {
pub fn new_mapped(device: &wgpu::Device, usage: wgpu::BufferUsage, len: usize) -> Self {
Self {
// TODO: examine if all our consts need to be updatable
buf: DynamicBuffer::new_mapped(device, len, wgpu::BufferUsage::UNIFORM),
buf: DynamicBuffer::new_mapped(device, len, usage | wgpu::BufferUsage::UNIFORM),
}
}
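// The mapped-consts path as terrain meshing uses it later in this commit
// (`len` and `locals_data` are placeholders): allocate mapped, write the locals
// directly through the mapping, then unmap before use.
//
// let locals = renderer.create_consts_mapped::<TerrainLocals>(wgpu::BufferUsage::empty(), len);
// locals.get_mapped_mut(0, locals.len())
//     .copy_from_slice(bytemuck::cast_slice(&locals_data));
// renderer.unmap_consts(&locals);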


@ -1,34 +1,38 @@
use super::buffer::DynamicBuffer;
use super::buffer::Buffer;
use bytemuck::Pod;
/// Represents a mesh that has been sent to the GPU.
pub struct Instances<T: Copy + Pod> {
buf: DynamicBuffer<T>,
buf: Buffer<T>,
}
impl<T: Copy + Pod> Instances<T> {
pub fn new(device: &wgpu::Device, len: usize) -> Self {
pub fn new_mapped(device: &wgpu::Device, len: usize) -> Self {
Self {
// TODO: examine if we have Instances that are not updated (e.g. sprites) and if there
// would be any gains from separating those out
buf: DynamicBuffer::new(device, len, wgpu::BufferUsage::VERTEX),
buf: Buffer::new_mapped(device, len, wgpu::BufferUsage::VERTEX),
}
}
pub fn new_with_data(device: &wgpu::Device, data: &[T]) -> Self {
pub fn new_with_data(device: &wgpu::Device, queue: &wgpu::Queue, data: &[T]) -> Self {
Self {
// TODO: examine if we have Instances that are not updated (e.g. sprites) and if there
// would be any gains from separating those out
buf: DynamicBuffer::new_with_data(device, wgpu::BufferUsage::VERTEX, data),
buf: Buffer::new(device, queue, wgpu::BufferUsage::VERTEX, data),
}
}
/// Get the GPU-side mapped slice represented by this instances buffer, if it was previously
/// memory mapped.
pub fn get_mapped_mut(&self, offset: usize, len: usize) -> /* &mut [T] */wgpu::BufferViewMut<'_> {
self.buf.get_mapped_mut(offset, len)
}
/// Unmaps the GPU-side handle represented by this instances buffer, if it was previously
/// memory-mapped.
pub fn unmap(&self, queue: &wgpu::Queue) {
self.buf.unmap(queue);
}
// TODO: count vs len naming scheme??
pub fn count(&self) -> usize { self.buf.len() }
pub fn update(&mut self, queue: &wgpu::Queue, vals: &[T], offset: usize) {
self.buf.update(queue, vals, offset)
}
pub fn buf(&self) -> &wgpu::Buffer { &self.buf.buf }
}


@ -35,7 +35,8 @@ pub use self::{
skybox::{create_mesh as create_skybox_mesh, Vertex as SkyboxVertex},
sprite::{
Instance as SpriteInstance, SpriteGlobalsBindGroup, SpriteVerts,
Vertex as SpriteVertex, VERT_PAGE_SIZE as SPRITE_VERT_PAGE_SIZE,
Vertex as SpriteVertex, LOD_LEVELS as SPRITE_LOD_LEVELS,
VERT_PAGE_SIZE as SPRITE_VERT_PAGE_SIZE,
},
terrain::{Locals as TerrainLocals, TerrainLayout, Vertex as TerrainVertex},
trail::Vertex as TrailVertex,


@ -30,16 +30,41 @@ pub struct Model<V: Vertex> {
impl<V: Vertex> Model<V> {
/// Returns None if the provided mesh is empty
pub fn new(device: &wgpu::Device, usage: wgpu::BufferUsage, mesh: &Mesh<V>) -> Option<Self> {
pub fn new(device: &wgpu::Device, queue: &wgpu::Queue, usage: wgpu::BufferUsage, mesh: &Mesh<V>) -> Option<Self> {
if mesh.vertices().is_empty() {
return None;
}
Some(Self {
vbuf: Buffer::new(device, /*wgpu::BufferUsage::VERTEX*/usage, mesh.vertices()),
vbuf: Buffer::new(device, queue, /*wgpu::BufferUsage::VERTEX*/usage, mesh.vertices()),
})
}
/// Create a new `Model<V>` that is mapped at creation. Returns None if the provided length is zero.
///
/// Warning: buffer must be unmapped before attempting to use this buffer on the GPU!
pub fn new_mapped(device: &wgpu::Device, len: usize, usage: wgpu::BufferUsage) -> Option<Self> {
if len == 0 {
return None;
}
Some(Self {
vbuf: Buffer::new_mapped(device, len, /*wgpu::BufferUsage::VERTEX*/usage/*, mesh.vertices()*/),
})
}
/// Get the GPU-side mapped slice represented by this model handle, if it was previously
/// memory mapped.
pub fn get_mapped_mut(&self, offset: usize, len: usize) -> /* &mut [T] */wgpu::BufferViewMut<'_> {
self.vbuf.get_mapped_mut(offset, len)
}
/// Unmaps the GPU-side handle represented by this model handle, if it was previously
/// memory-mapped.
pub fn unmap(&self, queue: &wgpu::Queue) {
self.vbuf.unmap(queue);
}
/// Create a model with a slice of a portion of this model to send to the
/// renderer.
pub fn submodel(&self, vertex_range: Range<u32>) -> SubModel<V> {
@ -64,7 +89,7 @@ pub struct DynamicModel<V: Vertex> {
impl<V: Vertex> DynamicModel<V> {
pub fn new(device: &wgpu::Device, size: usize) -> Self {
Self {
vbuf: DynamicBuffer::new(device, size, wgpu::BufferUsage::VERTEX),
vbuf: DynamicBuffer::new(device, size, wgpu::BufferUsage::VERTEX | wgpu::BufferUsage::COPY_DST),
}
}


@ -9,6 +9,7 @@ use std::mem;
use vek::*;
pub const VERT_PAGE_SIZE: u32 = 256;
pub const LOD_LEVELS: usize = 5;
#[repr(C)]
#[derive(Copy, Clone, Debug, Zeroable, Pod)]
@ -81,11 +82,13 @@ pub struct SpriteVerts(Buffer<Vertex>);
pub(in super::super) fn create_verts_buffer(
device: &wgpu::Device,
queue: &wgpu::Queue,
mesh: Mesh<Vertex>,
) -> SpriteVerts {
// TODO: type Buffer by wgpu::BufferUsage
SpriteVerts(Buffer::new(
device,
queue,
wgpu::BufferUsage::STORAGE,
mesh.vertices(),
))


@ -475,9 +475,9 @@ impl Renderer {
)?;
let clouds_locals =
Self::create_consts_inner(&device, &[clouds::Locals::default()]);
Self::create_consts_inner(&device, &queue, wgpu::BufferUsage::COPY_DST, &[clouds::Locals::default()]);
let postprocess_locals =
Self::create_consts_inner(&device, &[postprocess::Locals::default()]);
Self::create_consts_inner(&device, &queue, wgpu::BufferUsage::COPY_DST, &[postprocess::Locals::default()]);
let locals = Locals::new(
&device,
@ -488,7 +488,7 @@ impl Renderer {
&views.tgt_depth,
views.bloom_tgts.as_ref().map(|tgts| locals::BloomParams {
locals: bloom_sizes.map(|size| {
Self::create_consts_inner(&device, &[bloom::Locals::new(size)])
Self::create_consts_inner(&device, &queue, wgpu::BufferUsage::empty(), &[bloom::Locals::new(size)])
}),
src_views: [&views.tgt_color_pp, &tgts[1], &tgts[2], &tgts[3], &tgts[4]],
final_tgt_view: &tgts[0],
@ -499,9 +499,9 @@ impl Renderer {
);
let quad_index_buffer_u16 =
create_quad_index_buffer_u16(&device, QUAD_INDEX_BUFFER_U16_VERT_LEN.into());
create_quad_index_buffer_u16(&device, &queue, QUAD_INDEX_BUFFER_U16_VERT_LEN.into());
let quad_index_buffer_u32 =
create_quad_index_buffer_u32(&device, QUAD_INDEX_BUFFER_U32_START_VERT_LEN as usize);
create_quad_index_buffer_u32(&device, &queue, QUAD_INDEX_BUFFER_U32_START_VERT_LEN as usize);
let mut profiler = wgpu_profiler::GpuProfiler::new(4, queue.get_timestamp_period());
other_modes.profiler_enabled &= profiler_features_enabled;
profiler.enable_timer = other_modes.profiler_enabled;
@ -513,12 +513,15 @@ impl Renderer {
let (maintain_tx, maintain_rx) = channel::bounded(0);
let device_ = Arc::clone(&device);
std::thread::spawn(move || {
/* std::thread::spawn(move || {
// Maintain each time we are requested to do so, until the renderer dies.
// Additionally, accepts CPU->GPU tasks containing updates to perform that need to lock
// the device (but not necessarily the queue?). This is a hopefully temporary measure
// required because wgpu as currently written cannot help itself.
while let Ok(()) = maintain_rx.recv() {
device_.poll(wgpu::Maintain::Poll);
}
});
}); */
#[cfg(feature = "egui-ui")]
let egui_renderpass =
@ -690,7 +693,7 @@ impl Renderer {
.as_ref()
.map(|tgts| locals::BloomParams {
locals: bloom_sizes.map(|size| {
Self::create_consts_inner(&self.device, &[bloom::Locals::new(
Self::create_consts_inner(&self.device, &self.queue, wgpu::BufferUsage::empty(), &[bloom::Locals::new(
size,
)])
}),
@ -813,7 +816,8 @@ impl Renderer {
// Since if the channel is out of capacity, it means a maintain is already being processed
// (in which case we can just catch up next frame), this is a long-winded way of saying we
// can ignore the result of try_send.
let _ = self.maintain_tx.try_send(());
// let _ = self.maintain_tx.try_send(());
self.device.poll(wgpu::Maintain::Poll);
}
/// Create render target views
@ -1255,22 +1259,25 @@ impl Renderer {
}
/// Create a new set of constants with the provided values.
pub fn create_consts<T: Copy + bytemuck::Pod>(&mut self, vals: &[T]) -> Consts<T> {
Self::create_consts_inner(&self.device, vals)
pub fn create_consts<T: Copy + bytemuck::Pod>(&mut self, usage: wgpu::BufferUsage, vals: &[T]) -> Consts<T> {
Self::create_consts_inner(&self.device, &self.queue, usage, vals)
}
pub fn create_consts_inner<T: Copy + bytemuck::Pod>(
device: &wgpu::Device,
queue: &wgpu::Queue,
usage: wgpu::BufferUsage,
vals: &[T],
) -> Consts<T> {
Consts::new_with_data(device, vals)
Consts::new_with_data(device, queue, usage, vals)
}
pub fn create_consts_mapped<T: Copy + bytemuck::Pod>(
&mut self,
usage: wgpu::BufferUsage,
len: usize,
) -> Consts<T> {
Consts::new_mapped(&self.device, len)
Consts::new_mapped(&self.device, usage, len)
}
/// Update a set of constants with the provided values.
@ -1278,16 +1285,12 @@ impl Renderer {
consts.update(&self.queue, vals, 0)
}
/// Gets a memory mapped buffer of a set of constants.
pub fn get_consts_mapped<'a, T: Copy + bytemuck::Pod>(&self, consts: &'a Consts<T>) -> /* &'a mut [T] */wgpu::BufferViewMut<'a> {
consts.get_mapped_mut(0, consts.len())
}
/// Unmaps a set of memory mapped constants.
/// Unmaps a set of memory mapped consts.
pub fn unmap_consts<T: Copy + bytemuck::Pod>(&self, consts: &Consts<T>) {
consts.unmap(&self.queue)
}
pub fn update_clouds_locals(&mut self, new_val: clouds::Locals) {
self.locals.clouds.update(&self.queue, &[new_val], 0)
}
@ -1301,16 +1304,21 @@ impl Renderer {
&mut self,
vals: &[T],
) -> Result<Instances<T>, RenderError> {
Ok(Instances::new_with_data(&self.device, vals))
Ok(Instances::new_with_data(&self.device, &self.queue, vals))
}
/// Create a new set of instances with the provided values lazily (for use off the main
/// Create a new set of instances with the provided size lazily (for use off the main
/// thread).
pub fn create_instances_lazy<T: Copy + bytemuck::Pod>(
&mut self,
) -> impl for<'a> Fn(&'a [T]) -> Instances<T> + Send + Sync {
) -> impl /*for<'a> */Fn(/* &'a [T]*/usize) -> Instances<T> + Send + Sync {
let device = Arc::clone(&self.device);
move |vals| Instances::new_with_data(&device, &vals)
move |/*vals*/len| Instances::new_mapped(&device, len)/*Instances::new_with_data(&device, &vals)*/
}
/// Unmaps a set of memory mapped instances.
pub fn unmap_instances<T: Copy + bytemuck::Pod>(&self, instances: &Instances<T>) {
instances.unmap(&self.queue)
}
/// Update the expected index length to be large enough for a quad vertex buffer with this many
@ -1351,7 +1359,7 @@ impl Renderer {
if self.quad_index_buffer_u32.len() < quad_index_length {
// Make sure we aren't over the max
self.quad_index_buffer_u32 =
create_quad_index_buffer_u32(&self.device, vert_length);
create_quad_index_buffer_u32(&self.device, &self.queue, vert_length);
} */
},
None => {},
@ -1369,33 +1377,53 @@ impl Renderer {
let vert_length = self.quad_index_buffer_u32_len.load(Ordering::Relaxed);
if self.quad_index_buffer_u32.len() < vert_length {
self.quad_index_buffer_u32 =
create_quad_index_buffer_u32(&self.device, vert_length);
create_quad_index_buffer_u32(&self.device, &self.queue, vert_length);
}
}
pub fn create_sprite_verts(&mut self, mesh: Mesh<sprite::Vertex>) -> sprite::SpriteVerts {
Self::update_index_length::<sprite::Vertex>(&self.quad_index_buffer_u32_len, sprite::VERT_PAGE_SIZE as usize);
sprite::create_verts_buffer(&self.device, mesh)
sprite::create_verts_buffer(&self.device, &self.queue, mesh)
}
/// Create a new model from the provided mesh.
/// If the provided mesh is empty this returns None
pub fn create_model<V: Vertex>(&mut self, mesh: &Mesh<V>) -> Option<Model<V>> {
Self::update_index_length::<V>(&self.quad_index_buffer_u32_len, mesh.vertices().len());
Model::new(&self.device, wgpu::BufferUsage::VERTEX, mesh)
Model::new(&self.device, &self.queue, wgpu::BufferUsage::VERTEX, mesh)
}
/// Create a new model from the provided mesh, lazily (for use off the main thread).
/// If the provided mesh is empty this returns None
pub fn create_model_lazy<V: Vertex>(&mut self, usage: wgpu::BufferUsage) -> impl for<'a> Fn(&'a Mesh<V>) -> Option<Model<V>> + Send + Sync {
/// Create a new model with the provided length, lazily (for use off the main
/// thread). If the provided length is zero this returns None. The model is memory mapped,
/// and still needs to be unmapped before use.
pub fn create_model_lazy_base<V: Vertex>(&mut self, usage: wgpu::BufferUsage) -> impl Fn(usize) -> Option<Model<V>> + Send + Sync {
let device = Arc::clone(&self.device);
let quad_index_buffer_u32_len = Arc::clone(&self.quad_index_buffer_u32_len);
move |mesh| {
Self::update_index_length::<V>(&quad_index_buffer_u32_len, mesh.vertices().len());
Model::new(&device, usage, mesh)
move |len| {
Self::update_index_length::<V>(&quad_index_buffer_u32_len, len);
Model::new_mapped(&device, len, usage/*, mesh.vertices()*/)
}
}
/// Create a new model from the provided mesh, lazily (for use off the main
/// thread). If the provided mesh is empty this returns None. The model is memory mapped,
/// and still needs to be unmapped before use.
pub fn create_model_lazy<V: Vertex>(&mut self, usage: wgpu::BufferUsage) -> impl for<'a> Fn(&'a Mesh<V>) -> Option<Model<V>> + Send + Sync {
let create_model = self.create_model_lazy_base(usage);
move |mesh| {
let len = mesh.vertices().len();
let model = create_model(len)?;
model.get_mapped_mut(0, len)
.copy_from_slice(bytemuck::cast_slice(mesh.vertices()));
Some(model)
}
}
/// Unmaps a memory mapped model.
pub fn unmap_model<V: Vertex>(&self, model: &Model<V>) {
model.unmap(&self.queue);
}
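// The intended cross-thread split for lazy models, as the terrain worker uses
// it later in this commit: build the closure on the main thread, create and fill
// the mapped model on a worker, then unmap on the main thread once the response
// is received.
//
// let create_opaque = renderer.create_model_lazy::<TerrainVertex>(wgpu::BufferUsage::VERTEX);
// // worker thread:
// let opaque_model = create_opaque(&opaque_mesh);
// // main thread, on receipt:
// opaque_model.as_ref().map(|model| renderer.unmap_model(model));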
/// Create a new dynamic model with the specified size.
pub fn create_dynamic_model<V: Vertex>(&mut self, size: usize) -> DynamicModel<V> {
Self::update_index_length::<V>(&self.quad_index_buffer_u32_len, size);
@ -1515,13 +1543,6 @@ impl Renderer {
texture.clear(&self.queue)
}
/// Replaces the destination texture with the contents of the source texture.
///
/// The source size should at least fit within the destination texture's size.
pub fn replace_texture(&mut self, encoder: &mut wgpu::CommandEncoder, dest: &Texture, source: &Texture) {
dest.replace(&self.device, encoder, source);
}
/// Queue to obtain a screenshot on the next frame render
pub fn create_screenshot(
&mut self,
@ -1613,7 +1634,7 @@ impl Renderer {
// }
}
fn create_quad_index_buffer_u16(device: &wgpu::Device, vert_length: usize) -> Buffer<u16> {
fn create_quad_index_buffer_u16(device: &wgpu::Device, queue: &wgpu::Queue, vert_length: usize) -> Buffer<u16> {
assert!(vert_length <= u16::MAX as usize);
let indices = [0, 1, 2, 2, 1, 3]
.iter()
@ -1624,10 +1645,10 @@ fn create_quad_index_buffer_u16(device: &wgpu::Device, vert_length: usize) -> Bu
.map(|(i, b)| (i / 6 * 4 + b) as u16)
.collect::<Vec<_>>();
Buffer::new(device, wgpu::BufferUsage::INDEX, &indices)
Buffer::new(device, queue, wgpu::BufferUsage::INDEX, &indices)
}
fn create_quad_index_buffer_u32(device: &wgpu::Device, vert_length: usize) -> Buffer<u32> {
fn create_quad_index_buffer_u32(device: &wgpu::Device, queue: &wgpu::Queue, vert_length: usize) -> Buffer<u32> {
assert!(vert_length <= u32::MAX as usize);
let indices = [0, 1, 2, 2, 1, 3]
.iter()
@ -1638,5 +1659,5 @@ fn create_quad_index_buffer_u32(device: &wgpu::Device, vert_length: usize) -> Bu
.map(|(i, b)| (i / 6 * 4 + b) as u32)
.collect::<Vec<_>>();
Buffer::new(device, wgpu::BufferUsage::INDEX, &indices)
Buffer::new(device, queue, wgpu::BufferUsage::INDEX, &indices)
}


@ -40,12 +40,12 @@ impl Renderer {
}
pub fn create_debug_bound_locals(&mut self, vals: &[debug::Locals]) -> debug::BoundLocals {
let locals = self.create_consts(vals);
let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, vals);
self.layouts.debug.bind_locals(&self.device, locals)
}
pub fn create_ui_bound_locals(&mut self, vals: &[ui::Locals]) -> ui::BoundLocals {
let locals = self.create_consts(vals);
let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, vals);
self.layouts.ui.bind_locals(&self.device, locals)
}
@ -58,22 +58,13 @@ impl Renderer {
locals: &[figure::Locals],
bone_data: &[figure::BoneData],
) -> figure::BoundLocals {
let locals = self.create_consts(locals);
let bone_data = self.create_consts(bone_data);
let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, locals);
let bone_data = self.create_consts(wgpu::BufferUsage::COPY_DST, bone_data);
self.layouts
.figure
.bind_locals(&self.device, locals, bone_data)
}
/* /// Create a new set of constants with the provided values, lazily (so this can be instantiated
/// from another thread).
pub fn create_consts_lazy<T: Copy + bytemuck::Pod>(&mut self) ->
impl for<'a> Fn(&'a [T]) -> Consts<T> + Send + Sync
{
let device = Arc::clone(&self.device);
move |vals| Self::create_consts_inner(&device, vals)
} */
/// NOTE: Locals are mapped at creation, so you still have to write to them and unmap them
/// before use.
pub fn create_terrain_bound_locals(
@ -84,14 +75,14 @@ impl Renderer {
/* let device = Arc::clone(&self.device);
let immutable = Arc::clone(&self.layouts.immutable);
move || {
let locals = Consts::new_mapped(&device, 1);
let locals = Consts::new_mapped(&device, wgpu::BufferUsage::empty(), 1);
immutable.terrain.bind_locals(&device, locals)
} */
self.layouts.immutable.terrain.bind_locals(&self.device, locals/* , offset */)
}
pub fn create_shadow_bound_locals(&mut self, locals: &[shadow::Locals]) -> shadow::BoundLocals {
let locals = self.create_consts(locals);
let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, locals);
self.layouts.shadow.bind_locals(&self.device, locals)
}
@ -99,7 +90,7 @@ impl Renderer {
&mut self,
locals: &[rain_occlusion::Locals],
) -> rain_occlusion::BoundLocals {
let locals = self.create_consts(locals);
let locals = self.create_consts(wgpu::BufferUsage::COPY_DST, locals);
self.layouts
.rain_occlusion
.bind_locals(&self.device, locals)


@ -976,7 +976,7 @@ impl<'pass_ref, 'pass: 'pass_ref> SpriteDrawer<'pass_ref, 'pass> {
pub fn draw<'data: 'pass>(
&mut self,
&(terrain_locals_offset, ref terrain_locals): &'data (wgpu::DynamicOffset, terrain::BoundLocals),
instances: &'data Instances<sprite::Instance>,
(range, instances): (Range<u32>, &'data Instances<sprite::Instance>),
) {
self.render_pass
.set_bind_group(3, &terrain_locals.bind_group, &[terrain_locals_offset]);
@ -986,7 +986,7 @@ impl<'pass_ref, 'pass: 'pass_ref> SpriteDrawer<'pass_ref, 'pass> {
self.render_pass.draw_indexed(
0..sprite::VERT_PAGE_SIZE / 4 * 6,
0,
0..instances.count() as u32,
range,
);
}
}
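// The new call shape, as the terrain draw loop uses it later in this commit:
// each chunk keeps one consolidated instance buffer plus a per-LOD range into it.
//
// let (ranges, instances) = &chunk.sprite_instances;
// sprite_drawer.draw(&chunk.locals, (ranges[lod_level].clone(), instances));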


@ -193,7 +193,7 @@ impl Texture {
/// Replaces this texture with the contents of another texture.
///
/// The source size should at least fit within this texture's size.
pub fn replace<'a>(&self, device: &wgpu::Device, encoder: &mut wgpu::CommandEncoder, texture: &Self) {
pub fn replace<'a>(&self, encoder: &mut wgpu::CommandEncoder, texture: &Self) {
// Copy image
encoder.copy_texture_to_texture(
wgpu::ImageCopyTexture {


@ -463,8 +463,13 @@ where
make_model(generate_mesh_lod_low),
];
let (col_lights_alloc_size, finalize) = greedy.finalize(Vec2::broadcast(1));
let mut col_lights = vec![[0; 4]; col_lights_alloc_size];
let col_lights_size = finalize(&mut col_lights);
let col_light = (col_lights, col_lights_size);
slot_.store(Some(MeshWorkerResponse {
col_light: greedy.finalize(Vec2::broadcast(1)),
col_light,
opaque,
bounds: figure_bounds,
vertex_range: models,


@ -282,9 +282,9 @@ impl Scene {
let sprite_render_context = lazy_init(renderer);
let data = GlobalModel {
globals: renderer.create_consts(&[Globals::default()]),
lights: renderer.create_consts(&[Light::default(); MAX_LIGHT_COUNT]),
shadows: renderer.create_consts(&[Shadow::default(); MAX_SHADOW_COUNT]),
globals: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Globals::default()]),
lights: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Light::default(); MAX_LIGHT_COUNT]),
shadows: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Shadow::default(); MAX_SHADOW_COUNT]),
shadow_mats: renderer.create_shadow_bound_locals(&[ShadowLocals::default()]),
rain_occlusion_mats: renderer
.create_rain_occlusion_bound_locals(&[RainOcclusionLocals::default()]),


@ -109,9 +109,9 @@ impl Scene {
let mut col_lights = FigureColLights::new(renderer);
let data = GlobalModel {
globals: renderer.create_consts(&[Globals::default()]),
lights: renderer.create_consts(&[Light::default(); 20]),
shadows: renderer.create_consts(&[Shadow::default(); 24]),
globals: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Globals::default()]),
lights: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Light::default(); 20]),
shadows: renderer.create_consts(wgpu::BufferUsage::COPY_DST, &[Shadow::default(); 24]),
shadow_mats: renderer.create_shadow_bound_locals(&[ShadowLocals::default()]),
rain_occlusion_mats: renderer
.create_rain_occlusion_bound_locals(&[RainOcclusionLocals::default()]),
@ -145,9 +145,15 @@ impl Scene {
// total size is bounded by 2^24 * 3 * 1.5 which is bounded by
// 2^27, which fits in a u32.
let range = 0..opaque_mesh.vertices().len() as u32;
let (col_lights_alloc_size, finalize) = greedy.finalize(Vec2::broadcast(1));
let mut col_light = vec![[0; 4]; col_lights_alloc_size];
let col_lights_size = finalize(&mut col_light);
let col_light = (col_light, col_lights_size);
let model =
col_lights
.create_figure(renderer, greedy.finalize(Vec2::broadcast(1)), (opaque_mesh, bounds), [range]);
.create_figure(renderer, col_light, (opaque_mesh, bounds), [range]);
let mut buf = [Default::default(); anim::MAX_BONE_COUNT];
let common_params = FigureUpdateCommonParameters {
entity: None,


@ -12,7 +12,7 @@ use crate::{
pipelines::{self, ColLights},
ColLightInfo, Consts, FirstPassDrawer, FluidVertex, GlobalModel, Instances, LodData, Mesh, Model,
RenderError, Renderer, SpriteGlobalsBindGroup, SpriteInstance, SpriteVertex, SpriteVerts,
TerrainLocals, TerrainShadowDrawer, TerrainVertex, Texture, SPRITE_VERT_PAGE_SIZE,
TerrainLocals, TerrainShadowDrawer, TerrainVertex, Texture, SPRITE_LOD_LEVELS, SPRITE_VERT_PAGE_SIZE,
},
};
@ -46,7 +46,6 @@ use treeculler::{BVol, Frustum, AABB};
use vek::*;
const SPRITE_SCALE: Vec3<f32> = Vec3::new(1.0 / 11.0, 1.0 / 11.0, 1.0 / 11.0);
const SPRITE_LOD_LEVELS: usize = 5;
// For rain occlusion we only need to render the closest chunks.
/// How many chunks are maximally rendered for rain occlusion.
@ -91,7 +90,7 @@ pub struct TerrainChunkData {
col_lights: Arc<ColLights<pipelines::terrain::Locals>>,
light_map: LightMapFn,
glow_map: LightMapFn,
sprite_instances: [Instances<SpriteInstance>; SPRITE_LOD_LEVELS],
sprite_instances: ([core::ops::Range<u32>; SPRITE_LOD_LEVELS], Instances<SpriteInstance>),
locals: (wgpu::DynamicOffset, pipelines::terrain::BoundLocals),
pub blocks_of_interest: BlocksOfInterest,
@ -143,7 +142,7 @@ pub struct MeshWorkerResponseMesh {
/// mesh of a chunk.
struct MeshWorkerResponse {
pos: Vec2<i32>,
sprite_instances: [Instances<SpriteInstance>; SPRITE_LOD_LEVELS],
sprite_instances: ([core::ops::Range<u32>; SPRITE_LOD_LEVELS], Instances<SpriteInstance>),
/// If None, this update was requested without meshing.
mesh: Option<MeshWorkerResponseMesh>,
started_tick: u64,
@ -259,9 +258,9 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
sprite_config: &SpriteSpec,
create_opaque: impl for<'a> Fn(&'a Mesh<TerrainVertex>) -> Option<Model<TerrainVertex>>,
create_fluid: impl for<'a> Fn(&'a Mesh<FluidVertex>) -> Option<Model<FluidVertex>>,
create_instances: impl for<'a> Fn(&'a [SpriteInstance]) -> Instances<SpriteInstance>,
create_instances: impl for<'a> Fn(/* &'a [SpriteInstance] */usize) -> Instances<SpriteInstance>,
/* create_locals: impl Fn() -> pipelines::terrain::BoundLocals, */
create_texture: impl for<'a> Fn(/* wgpu::TextureDescriptor<'a>, wgpu::TextureViewDescriptor<'a>, wgpu::SamplerDescriptor<'a>*/&'a Mesh<[u8; 4]>) -> /*Texture + Send + Sync*/Option<Model<[u8; 4]>>,
create_texture: impl for<'a> Fn(/* wgpu::TextureDescriptor<'a>, wgpu::TextureViewDescriptor<'a>, wgpu::SamplerDescriptor<'a>*//*&'a Mesh<[u8; 4]>*/usize) -> /*Texture + Send + Sync*/Option<Model<[u8; 4]>>,
) -> MeshWorkerResponse {
span!(_guard, "mesh_worker");
let (blocks_of_interest, sprite_kinds) = BlocksOfInterest::from_chunk(&chunk)/*default()*/;
@ -281,15 +280,16 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
let (opaque_mesh, fluid_mesh, _shadow_mesh, (bounds, col_lights_info, light_map, glow_map)) =
generate_mesh(
&volume,
create_texture,
(
range,
Vec2::new(max_texture_size, max_texture_size),
&blocks_of_interest,
),
);
let mut tex_ = Mesh::new();
/* let mut tex_ = Mesh::new();
*tex_.vertices_mut_vec() = col_lights_info.0;
let tex = create_texture(&tex_);
let tex = create_texture(&tex_); */
mesh = Some(MeshWorkerResponseMesh {
// TODO: Take sprite bounds into account somehow?
z_bounds: (bounds.min.z, bounds.max.z),
@ -301,7 +301,7 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
opaque_model: create_opaque(&opaque_mesh),
fluid_model: create_fluid(&fluid_mesh),
/* locals: create_locals(), */
col_lights_info: (tex, col_lights_info.1),
col_lights_info/*: (tex, col_lights_info.1)*/,
light_map,
glow_map,
});
@ -383,7 +383,25 @@ fn mesh_worker/*<V: BaseVol<Vox = Block> + RectRasterableVol + ReadVol + Debug +
} */
}
instances.map(|instances| create_instances(&instances))
let mut start = 0;
let instance_ranges = instances.each_ref().map(|instances| {
let range = start..start + instances.len() as u32;
start = range.end;
range
});
let sprite_instances = create_instances(instance_ranges.iter().map(|range| range.len()).sum());
if start > 0 {
sprite_instances
.get_mapped_mut(0, sprite_instances.count())
.array_chunks_mut::<{ core::mem::size_of::<SpriteInstance>() }>()
.zip(instances.into_iter().flatten()).for_each(|(dst, src)| {
// FIXME: cast doesn't work because bytemuck::cast isn't const generic-ified
// yet, so it fails on some array lengths.
// *dst = bytemuck::cast(src);
dst.copy_from_slice(bytemuck::cast_slice(&[src]));
});
}
(instance_ranges, sprite_instances)
},
mesh,
blocks_of_interest,
@ -601,7 +619,10 @@ impl SpriteRenderContext {
let sprite_col_lights = {
prof_span!("finalize");
greedy.finalize(Vec2::broadcast(1))
let (col_lights_alloc_size, finalize) = greedy.finalize(Vec2::broadcast(1));
let mut col_lights = vec![[0; 4]; col_lights_alloc_size];
let col_lights_size = finalize(&mut col_lights);
(col_lights, col_lights_size)
};
SpriteWorkerResponse {
@ -799,7 +820,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
// a copy from the previous atlas, skipping the CPU->GPU upload.
if let Some((old_texture, encoder)) = old_texture {
// TODO: Delay submission, don't just submit immediately out of convenience!
renderer.replace_texture(encoder, &texture, old_texture);
texture.replace(encoder, old_texture);
} else {
renderer.clear_texture(&texture);
}
@ -1286,7 +1307,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
let create_fluid = renderer.create_model_lazy(wgpu::BufferUsage::VERTEX);
let create_instances = renderer.create_instances_lazy();
/* let create_locals = renderer.create_terrain_bound_locals(); */
let create_texture = renderer./*create_texture_raw*/create_model_lazy(wgpu::BufferUsage::COPY_SRC);
let create_texture = renderer./*create_texture_raw*/create_model_lazy_base(wgpu::BufferUsage::COPY_SRC);
/* cnt.fetch_add(1, Ordering::Relaxed); */
let job = move || {
// Since this loads when the task actually *runs*, rather than when it's
@ -1347,8 +1368,8 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
if max_recv_count > 0 {
// Construct a buffer for all the chunks we're going to process in this frame. There might
// be some unused slots, which is fine.
let locals = /*Arc::new(*/renderer.create_consts_mapped(max_recv_count as usize)/*)*/;
let mut locals_buffer = renderer.get_consts_mapped(&locals);
let locals = /*Arc::new(*/renderer.create_consts_mapped(wgpu::BufferUsage::empty(), max_recv_count as usize)/*)*/;
let mut locals_buffer = locals.get_mapped_mut(0, locals.len());
let mut locals_bound = renderer.create_terrain_bound_locals(&locals/*, locals_offset */);
let mut encoder = renderer.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor {
@ -1356,25 +1377,26 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
});
for (locals_offset, (response, locals_buffer)) in incoming_chunks.zip(locals_buffer.array_chunks_mut::<{ core::mem::size_of::<TerrainLocals>() }>()).enumerate() {
match self.mesh_todo.get(&response.pos) {
let pos = response.pos;
let response_started_tick = response.started_tick;
match self.mesh_todo.get(&pos) {
// It's the mesh we want, insert the newly finished model into the terrain model
// data structure (convert the mesh to a model first of course).
Some(todo) => {
let started_tick = todo.started_tick.load(Ordering::Relaxed);
if response.started_tick > started_tick {
if response_started_tick > started_tick {
// Chunk must have been removed, or it was spawned on an old tick. Drop
// the mesh since it's either out of date or no longer needed.
// the mesh in the background since it's either out of date or no longer
// needed.
slowjob.spawn(&"TERRAIN_DROP", move || { drop(response); });
continue;
}
let sprite_instances = response.sprite_instances;
if let Some(mut mesh) = response.mesh {
// Full update, insert the whole chunk.
let load_time = self
.chunks
.get(&response.pos)
.get(&pos)
.map(|chunk| chunk.load_time)
.unwrap_or(current_time as f32);
// TODO: Allocate new atlas on allocation failure.
@ -1422,6 +1444,16 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
.expect("Chunk data does not fit in a texture of maximum size.")
});
// Unmap buffers mapped on other threads (we do this here to avoid
// contention with queue submission, as both of these take the device write
// lock as of wgpu 0.8.1).
//
// FIXME: When we upgrade wgpu, reconsider all this.
renderer.unmap_instances(&response.sprite_instances.1);
mesh.opaque_model.as_ref().map(|model| renderer.unmap_model(model));
mesh.fluid_model.as_ref().map(|model| renderer.unmap_model(model));
renderer.unmap_model(&tex);
// NOTE: Cast is safe since the origin was a u16.
let atlas_offs = Vec2::new(
allocation.rectangle.min.x as u32,
@ -1467,7 +1499,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
let locals_buffer_ =
/* renderer.update_mapped(&mut mesh.locals, &[*/TerrainLocals::new(
Vec3::from(
response.pos.map2(VolGrid2d::<V>::chunk_size(), |e, sz| {
pos.map2(VolGrid2d::<V>::chunk_size(), |e, sz| {
e as f32 * sz as f32
}),
),
@ -1477,7 +1509,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
*locals_buffer = bytemuck::cast(locals_buffer_);
/* let locals = Arc::clone(&locals); */
Self::insert_chunk(&slowjob, &mut self.chunks, &mut self.atlas, response.pos, TerrainChunkData {
Self::insert_chunk(&slowjob, &mut self.chunks, &mut self.atlas, pos, TerrainChunkData {
load_time,
opaque_model: mesh.opaque_model,
fluid_model: mesh.fluid_model,
@ -1485,7 +1517,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
col_lights: Arc::clone(&self.col_lights),
light_map: mesh.light_map,
glow_map: mesh.glow_map,
sprite_instances,
sprite_instances: response.sprite_instances,
locals: /* mesh.locals *//*renderer.create_terrain_bound_locals(&locals/*, locals_offset */)*/
((locals_offset * core::mem::size_of::<TerrainLocals>()) as wgpu::DynamicOffset, Arc::clone(&locals_bound)),
visible: Visibility {
@ -1499,20 +1531,27 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
shadow_z_bounds: mesh.shadow_z_bounds,
frustum_last_plane_index: 0,
});
} else if let Some(chunk) = self.chunks.get_mut(&response.pos) {
} else if let Some(chunk) = self.chunks.get_mut(&pos) {
// There was an update that didn't require a remesh (probably related to
// non-glowing sprites) so we just update those.
chunk.sprite_instances = sprite_instances;
chunk.sprite_instances = response.sprite_instances;
chunk.blocks_of_interest = response.blocks_of_interest;
} else {
// Not sure what happened here, but we should drop the result in the
// background.
slowjob.spawn(&"TERRAIN_DROP", move || { drop(response); });
}
if response.started_tick == started_tick {
if response_started_tick == started_tick {
// This was the latest worker for this chunk, so we don't need to worry
// about canceling any later tasks.
self.mesh_todo.remove(&response.pos);
self.mesh_todo.remove(&pos);
}
},
None => {},
// Old task, drop the response in the background.
None => {
slowjob.spawn(&"TERRAIN_DROP", move || { drop(response); });
},
}
}
// Drop the memory mapping and unmap the locals.
@ -1923,7 +1962,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
.filter(|(_, c)| c.visible.is_visible())
.for_each(|(pos, chunk)| {
// Skip chunk if it has no sprites
if chunk.sprite_instances[0].count() == 0 {
if chunk.sprite_instances.1.count() == 0 {
return;
}
@ -1949,7 +1988,7 @@ impl/*<V: RectRasterableVol>*/ Terrain<V> {
4
};
sprite_drawer.draw(&chunk.locals, &chunk.sprite_instances[lod_level]);
sprite_drawer.draw(&chunk.locals, (chunk.sprite_instances.0[lod_level].clone(), &chunk.sprite_instances.1));
}
});
drop(sprite_drawer);