Progress on GPU premultiplication.

* General progress in setting up code paths to support GPU premultiplication. * Created `PremultiplyUpload` type to represent an initiated image upload where the premultiply pass needs to be run to complete it. * Converted from compute pass to render pass since current limitations make it difficult to write directly to an sRGB image from a compute shader. * Replaced `CachedDetails::Immutable` with keeping track of the parameters used to create the texture (i.e. the border color). * Created `TextureRequirements`, `TextureParameters`, and `CacheKey` types to encode parameters that go into texture creation and image caching and to determine when the space in texture memory should be reused when replacing a graphic. * Added custom texture creation logic for the UI textures since those need certain usage combinations.
2024-08-30 18:12:32 +00:00 · 2022-10-25 00:31:35 -04:00 · 2022-10-25 00:31:35 -04:00 · efd932c71e
commit efd932c71e
parent 1d51aae3b2
16 changed files with 789 additions and 258 deletions
--- a/assets/voxygen/shaders/include/srgb.glsl
+++ b/assets/voxygen/shaders/include/srgb.glsl
@ -43,6 +43,16 @@ vec3 linear_to_srgb(vec3 col) {
    );
 }

+// Unpacks a color stored as four 8-bit channels in a `uint` (first channel in
+// the most significant byte, alpha in the least significant byte), normalizes
+// each channel to [0.0, 1.0], and converts the color channels from nonlinear
+// srgb to linear. Alpha is passed through unchanged.
+vec4 srgba8_to_linear(uint srgba8) {
+    // The normalized channels are fractional, so this must be a `vec4` (storing
+    // them in a `uvec4` is a type error).
+    vec4 nonlinear = vec4(uvec4(
+        (srgba8 >> 24) & 0xFFu,
+        (srgba8 >> 16) & 0xFFu,
+        (srgba8 >>  8) & 0xFFu,
+         srgba8        & 0xFFu
+    )) / 255.0;
+    return vec4(srgb_to_linear(nonlinear.rgb), nonlinear.a);
+}
+
 float pow5(float x) {
    float x2 = x * x;
    return x2 * x2 * x;
--- a/assets/voxygen/shaders/premultiply-alpha-compute.glsl
+++ b/assets/voxygen/shaders/premultiply-alpha-compute.glsl
@ -1,36 +0,0 @@
-#version 420 core
-
-// TODO: should we modify this based on the current device?
-// TODO: would it be better to have 2D workgroup for writing to a local area in the target image? 
-layout(local_size_x = 256) in;
-
-// TODO: writing all images into a single buffer?
-layout(set = 0, binding = 0) readonly buffer InputImage {
-    uint input_pixels[];
-};
-
-layout (std140, set = 0, binding = 1)
-uniform u_locals {
-    // Size of the input image.
-    uvec2 image_size;
-    // Offset to place the transformed input image at in the target
-    // image.
-    uvec2 target_offset;
-};
-
-layout(rgba8, set = 0, binding = 2) uniform writeonly image2D target_image;
-
-void main() {
-    uint global_id = gl_GlobalInvocationId.x;
-    uvec2 src_pixel_pos = uvec2(global_id % image_size.x, global_id / image_size.x);
-    // Otherwise this is is an out of bounds compute instance.
-    if (src_pixel_pos < image_size.y) {
-        uint pixel = input_pixels[global_id]; 
-        vec4 nonlinear = vec4((pixel >> 16) & 0xFFu, (pixel >> 8) & 0xFFu, (pixel >> 8) & 0xFFu, pixel & 0xFFu);
-        vec4 linear;
-        vec4 premultiplied_linear;
-        vec4 premultiplied_nonlinear;
-        // No free srgb with image store operations https://www.khronos.org/opengl/wiki/Image_Load_Store#Format_compatibility
-        imageStore(target_image, src_pixel_pos + target_offset, premultiplied_nonlinear);
-    }
-}
--- a/assets/voxygen/shaders/premultiply-alpha-frag.glsl
+++ b/assets/voxygen/shaders/premultiply-alpha-frag.glsl
@ -0,0 +1,16 @@
+#version 420 core
+
+layout(set = 0, binding = 0)
+uniform texture2D source_texture;
+
+layout(location = 0) in vec2 source_coords;
+
+layout(location = 0) out vec4 target_color;
+
+// Fetches one texel from the source texture, multiplies its color channels by
+// its alpha, and writes the result to the render target.
+void main() {
+    // We get free nonlinear -> linear conversion when sampling from srgb texture;
+    //
+    // NOTE(review): `texelFetch` takes integer texel coordinates, so this relies
+    // on `source_coords` arriving in texels rather than normalized to
+    // [0.0, 1.0] -- confirm against the vertex shader.
+    vec4 linear = texelFetch(source_texture, ivec2(source_coords), 0);
+    vec4 premultiplied_linear = vec4(linear.rgb * linear.a, linear.a);
+    // We get free linear -> nonlinear conversion rendering to srgb texture.
+    target_color = premultiplied_linear;
+}
--- a/assets/voxygen/shaders/premultiply-alpha-vert.glsl
+++ b/assets/voxygen/shaders/premultiply-alpha-vert.glsl
@ -0,0 +1,48 @@
+#version 420 core
+
+layout(push_constant) uniform Params {
+    // Size of the source image.
+    uint source_size_xy;
+    // Offset to place the image at in the target texture.
+    //
+    // Origin is the top-left.
+    uint target_offset_xy;
+    // Size of the target texture.
+    uint target_size_xy;
+};
+
+layout(location = 0) out vec2 source_coords;
+
+// Splits a packed `uint` into two 16-bit values: `x` from the low 16 bits and
+// `y` from the high 16 bits.
+uvec2 unpack(uint xy) {
+    // NOTE: GLSL does not allow a trailing comma in an argument list.
+    return uvec2(
+        bitfieldExtract(xy, 0, 16),
+        bitfieldExtract(xy, 16, 16)
+    );
+}
+
+// Emits one corner of the rectangle in the target texture that the source
+// image is drawn into, along with the matching source texel coordinates.
+//
+// No vertex buffer is used: the 6 vertices of the rectangle are derived from
+// `gl_VertexIndex`.
+void main() {
+    vec2 source_size = vec2(unpack(source_size_xy));
+    vec2 target_offset = vec2(unpack(target_offset_xy));
+    vec2 target_size = vec2(unpack(target_size_xy));
+
+    // Generate rectangle (counter clockwise triangles)
+    //
+    // 0 0 1 1 1 0
+    float x_select = float(((uint(gl_VertexIndex) + 1u) / 3u) % 2u);
+    // 1 0 0 0 1 1
+    float y_select = float(((uint(gl_VertexIndex) + 5u) / 3u) % 2u);
+
+    // Source coordinates are emitted in texels (not normalized), since the
+    // fragment shader passes them to `texelFetch`, which takes integer texel
+    // coordinates.
+    source_coords = vec2(
+        // left -> right (on screen)
+        mix(0.0, 1.0, x_select),
+        // bottom -> top (on screen)
+        mix(1.0, 0.0, y_select)
+    ) * source_size;
+
+    vec2 target_coords_normalized = (target_offset + source_coords) / target_size;
+
+    // Flip y and transform [0.0, 1.0] -> [-1.0, 1.0] to get NDC coordinates.
+    vec2 v_pos = ((target_coords_normalized * 2.0) - vec2(1.0)) * vec2(1.0, -1.0);
+
+    gl_Position = vec4(v_pos, 0.0, 1.0);
+}
--- a/voxygen/src/render/mod.rs
+++ b/voxygen/src/render/mod.rs
@ -43,7 +43,8 @@ pub use self::{
            create_quad as create_ui_quad,
            create_quad_vert_gradient as create_ui_quad_vert_gradient, create_tri as create_ui_tri,
            BoundLocals as UiBoundLocals, Locals as UiLocals, Mode as UiMode,
-            TextureBindGroup as UiTextureBindGroup, Vertex as UiVertex,
+            PremultiplyUpload as UiPremultiplyUpload, TextureBindGroup as UiTextureBindGroup,
+            Vertex as UiVertex,
        },
        GlobalModel, Globals, GlobalsBindGroup, GlobalsLayouts, Light, Shadow,
    },
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@ -1,8 +1,21 @@
 use super::super::{Bound, Consts, GlobalsLayouts, Quad, Texture, Tri, Vertex as VertexTrait};
 use bytemuck::{Pod, Zeroable};
+use core::num::NonZeroU32;
 use std::mem;
 use vek::*;

+// TODO: profile UI rendering before and after on laptop.
+
+/// The format of textures that the UI sources image data from.
+///
+/// Note, this is not directly used in all relevant locations, but it still
+/// helps to more clearly document that this is the format being used. Notably,
+/// textures are created via `renderer.create_dynamic_texture(...)` and
+/// `renderer.create_texture(&DynamicImage::ImageRgba(image), ...)` (TODO:
+/// update if we have to refactor when implementing the RENDER_ATTACHMENT
+/// usage).
+const UI_IMAGE_FORMAT: wgpu::TextureFormat = wgpu::TextureFormat::Rgba8UnormSrgb;
+
 #[repr(C)]
 #[derive(Copy, Clone, Debug, Zeroable, Pod)]
 pub struct Vertex {
@ -132,8 +145,8 @@ pub struct TextureBindGroup {
 }

 pub struct UiLayout {
-    pub locals: wgpu::BindGroupLayout,
-    pub texture: wgpu::BindGroupLayout,
+    locals: wgpu::BindGroupLayout,
+    texture: wgpu::BindGroupLayout,
 }

 impl UiLayout {
@ -395,20 +408,77 @@ pub fn create_tri(
    )
 }

-// Steps:
-// 1. Upload new image via `Device::create_buffer_init`, with `MAP_WRITE` flag
-//    to avoid staging buffer.
-// 2. Run compute pipeline to multiply by alpha reading from this buffer and
-//    writing to the final texture (this may be in an atlas or an independent
-//    texture if the image is over a certain size threshold).
+// Premultiplying alpha on the GPU before placing images into the textures that
+// will be sampled from in the UI pipeline.
 //
-// Info needed in compute shader:
-// * source buffer
-// * target texture
-// * image dimensions
-// * position in the target texture
-// (what is the overhead of compute call? at some point we may be better off
-// converting small images on the cpu)
+// Steps:
+//
+// 1. Upload new image via `Device::create_texture_with_data`.
+//
+//    (NOTE: Initially considered: Creating a storage buffer to read from in the
+//    shader via `Device::create_buffer_init`, with `MAP_WRITE` flag to avoid a
+//    staging buffer. However, with dedicated GPUs combining usages other than
+//    `COPY_SRC` with `MAP_WRITE` may be less ideal. Plus, by copying into a
+//    texture first we can get free srgb conversion when fetching colors from
+//    the texture. In the future, we may want to branch based on whether the
+//    GPU is integrated and avoid this extra copy.)
+//
+// 2. Run render pipeline to multiply by alpha reading from this texture and
+//    writing to the final texture (this can either be in an atlas or in an
+//    independent texture if the image is over a certain size threshold).
+//
+//    (NOTE: Initially considered: using a compute pipeline and writing to the
+//    final texture as a storage texture. However, the srgb format can't be used
+//    with storage textures and there is not yet the capability to create
+//    non-srgb views of srgb textures.)
+//
+// Info needed:
+//
+// * source texture (texture binding)
+// * target texture (render attachment)
+// * source image dimensions (push constant)
+// * target texture dimensions (push constant)
+// * position in the target texture (push constant)
+//
+// TODO: potential optimizations
+// * what is the overhead of this draw call? at some point we may be better
+//   off converting very small images on the cpu and/or batching these into a
+//   single draw call
+// * what is the overhead of creating new small textures? for processing many
+//   small images would it be useful to create a single texture the same size as
+//   our cache texture and use Queue::write_texture?
+// * is using create_buffer_init and reading directly from that (with manual
+//   srgb conversion) worth avoiding staging buffer/copy-to-texture for
+//   integrated GPUs?
+// * premultiplying alpha in a release asset preparation step
+
+/// Bind group layout used by the premultiply alpha pipeline.
+pub struct PremultiplyAlphaLayout {
+    /// Layout with a single entry (binding 0): the source texture, visible to
+    /// the fragment stage.
+    source_texture: wgpu::BindGroupLayout,
+}
+
+impl PremultiplyAlphaLayout {
+    /// Creates the bind group layout on `device`.
+    pub fn new(device: &wgpu::Device) -> Self {
+        // source_texture
+        let source_texture_entry = wgpu::BindGroupLayoutEntry {
+            binding: 0,
+            visibility: wgpu::ShaderStage::FRAGMENT,
+            ty: wgpu::BindingType::Texture {
+                sample_type: wgpu::TextureSampleType::Float { filterable: false },
+                view_dimension: wgpu::TextureViewDimension::D2,
+                multisampled: false,
+            },
+            count: None,
+        };
+        let descriptor = wgpu::BindGroupLayoutDescriptor {
+            label: None,
+            entries: &[source_texture_entry],
+        };
+
+        Self {
+            source_texture: device.create_bind_group_layout(&descriptor),
+        }
+    }
+}
+
 pub struct PremultiplyAlphaPipeline {
    pub pipeline: wgpu::RenderPipeline,
 }
@ -416,22 +486,163 @@ pub struct PremultiplyAlphaPipeline {
 impl PremultiplyAlphaPipeline {
    pub fn new(
        device: &wgpu::Device,
-        module: &wgpu::ShaderModule,
-        layout: &PremultiplAlphaLayout,
+        vs_module: &wgpu::ShaderModule,
+        fs_module: &wgpu::ShaderModule,
+        layout: &PremultiplyAlphaLayout,
    ) -> Self {
        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
            label: Some("Premultiply alpha pipeline layout"),
-            push_constant_ranges: &[],
-            bind_group_layouts: &[layout],
+            bind_group_layouts: &[&layout.source_texture],
+            push_constant_ranges: &[wgpu::PushConstantRange {
+                stages: wgpu::ShaderStage::VERTEX,
+                range: 0..core::mem::size_of::<PremultiplyAlphaParams>() as u32,
+            }],
        });

-        let pipeline = device.create_compute_pipeline(&wgpu::RenderPipelineDescriptor {
+        let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("Premultiply alpha pipeline"),
            layout: Some(&pipeline_layout),
-            module,
-            entry_point: "main",
+            vertex: wgpu::VertexState {
+                module: vs_module,
+                entry_point: "main",
+                buffers: &[],
+            },
+            primitive: wgpu::PrimitiveState {
+                topology: wgpu::PrimitiveTopology::TriangleList,
+                strip_index_format: None,
+                front_face: wgpu::FrontFace::Ccw,
+                cull_mode: Some(wgpu::Face::Back),
+                clamp_depth: false,
+                polygon_mode: wgpu::PolygonMode::Fill,
+                conservative: false,
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            fragment: Some(wgpu::FragmentState {
+                module: fs_module,
+                entry_point: "main",
+                targets: &[wgpu::ColorTargetState {
+                    format: UI_IMAGE_FORMAT,
+                    blend: None,
+                    write_mask: wgpu::ColorWrite::ALL,
+                }],
+            }),
        });

        Self { pipeline }
    }
 }
+
+/// Uploaded as push constant.
+///
+/// Each field packs an (x, y) pair into a single `u32`. The vertex shader
+/// unpacks `x` from the low 16 bits and `y` from the high 16 bits. The field
+/// order here matches the `Params` block declared in the vertex shader.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, Zeroable, Pod)]
+pub struct PremultiplyAlphaParams {
+    /// Size of the source image.
+    source_size_xy: u32,
+    /// Offset to place the image at in the target texture.
+    ///
+    /// Origin is the top-left.
+    target_offset_xy: u32,
+    /// Size of the target texture.
+    target_size_xy: u32,
+}
+
+/// An image upload that needs alpha premultiplication and which is in a pending
+/// state.
+///
+/// From here we will use the `PremultiplyAlpha` pipeline to premultiply the
+/// alpha while transferring the image to its destination texture.
+pub struct PremultiplyUpload {
+    /// Bind group through which the premultiply pass reads the source image.
+    source_bg: wgpu::BindGroup,
+    /// Width and height of the source image packed into a single `u32`.
+    source_size_xy: u32,
+    /// The location in the final texture this will be placed at. Technically,
+    /// we don't need this information at this point but it is convenient to
+    /// store it here.
+    offset: Vec2<u16>,
+}
+
+impl PremultiplyUpload {
+    /// Packs two values that each fit in 16 bits into a single `u32`, with `x`
+    /// in the low 16 bits and `y` in the high 16 bits.
+    ///
+    /// This is the layout that `unpack` in the premultiply vertex shader reads.
+    fn pack_xy(x: u32, y: u32) -> u32 {
+        // NOTE: `+` binds tighter than `<<`, so the shift must be parenthesized
+        // (`x + y << 16` would compute `(x + y) << 16`).
+        x + (y << 16)
+    }
+
+    /// Uploads `image` into a temporary source texture and creates the bind
+    /// group that the premultiply pass will read it through.
+    ///
+    /// `offset` is the location in the final texture where the image will be
+    /// placed; it is only stored here for later use by [`Self::draw_data`].
+    pub fn prepare(
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        layout: &PremultiplyAlphaLayout,
+        image: &image::RgbaImage,
+        offset: Vec2<u16>,
+    ) -> Self {
+        // TODO: duplicating some code from `Texture` since:
+        // 1. We don't need to create a sampler.
+        // 2. Texture::new accepts &DynamicImage which isn't possible to create from
+        //    &RgbaImage without cloning.
+        let image_size = wgpu::Extent3d {
+            width: image.width(),
+            height: image.height(),
+            depth_or_array_layers: 1,
+        };
+        let source_tex = device.create_texture(&wgpu::TextureDescriptor {
+            label: None,
+            size: image_size,
+            mip_level_count: 1,
+            sample_count: 1,
+            dimension: wgpu::TextureDimension::D2,
+            format: wgpu::TextureFormat::Rgba8UnormSrgb,
+            usage: wgpu::TextureUsage::SAMPLED | wgpu::TextureUsage::COPY_DST,
+        });
+        // The image is tightly packed RGBA8, i.e. 4 bytes per pixel. This must
+        // agree with `bytes_per_row` below.
+        let pixel_bytes: &[u8] = &**image;
+        let byte_len = image.width() as usize * image.height() as usize * 4;
+        queue.write_texture(
+            wgpu::ImageCopyTexture {
+                texture: &source_tex,
+                mip_level: 0,
+                origin: wgpu::Origin3d::ZERO,
+            },
+            &pixel_bytes[..byte_len],
+            wgpu::ImageDataLayout {
+                offset: 0,
+                bytes_per_row: NonZeroU32::new(image.width() * 4),
+                rows_per_image: NonZeroU32::new(image.height()),
+            },
+            image_size,
+        );
+        // Create view to use to create bind group
+        let view = source_tex.create_view(&wgpu::TextureViewDescriptor {
+            label: None,
+            format: Some(wgpu::TextureFormat::Rgba8UnormSrgb),
+            dimension: Some(wgpu::TextureViewDimension::D2),
+            aspect: wgpu::TextureAspect::All,
+            base_mip_level: 0,
+            mip_level_count: None,
+            base_array_layer: 0,
+            array_layer_count: None,
+        });
+        let source_bg = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &layout.source_texture,
+            entries: &[wgpu::BindGroupEntry {
+                binding: 0,
+                resource: wgpu::BindingResource::TextureView(&view),
+            }],
+        });
+
+        // NOTE: We assume the max texture size is less than u16::MAX.
+        let source_size_xy = Self::pack_xy(image_size.width, image_size.height);
+
+        Self {
+            source_bg,
+            source_size_xy,
+            offset,
+        }
+    }
+
+    /// Returns the bind group and push constant data needed to draw this upload
+    /// into `target`.
+    ///
+    /// Semantically, this consumes the `PremultiplyUpload` but we need to keep
+    /// the bind group alive to the end of the render pass and don't want to
+    /// bother storing it somewhere else.
+    pub fn draw_data(&self, target: &Texture) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
+        let target_offset_xy =
+            Self::pack_xy(u32::from(self.offset.x), u32::from(self.offset.y));
+        let target_dims = target.get_dimensions();
+        // NOTE: We assume the max texture size is less than u16::MAX.
+        let target_size_xy = Self::pack_xy(target_dims.x, target_dims.y);
+        (&self.source_bg, PremultiplyAlphaParams {
+            source_size_xy: self.source_size_xy,
+            target_offset_xy,
+            target_size_xy,
+        })
+    }
+}
--- a/voxygen/src/render/renderer.rs
+++ b/voxygen/src/render/renderer.rs
@ -63,6 +63,7 @@ struct ImmutableLayouts {
    clouds: clouds::CloudsLayout,
    bloom: bloom::BloomLayout,
    ui: ui::UiLayout,
+    premultiply_alpha: ui::PremultiplyAlphaLayout,
    blit: blit::BlitLayout,
 }

@ -393,6 +394,7 @@ impl Renderer {
                &pipeline_modes,
            ));
            let ui = ui::UiLayout::new(&device);
+            let premultiply_alpha = ui::PremultiplyAlphaLayout::new(&device);
            let blit = blit::BlitLayout::new(&device);

            let immutable = Arc::new(ImmutableLayouts {
@ -407,6 +409,7 @@ impl Renderer {
                clouds,
                bloom,
                ui,
+                premultiply_alpha,
                blit,
            });

@ -1434,6 +1437,20 @@ impl Renderer {
        texture.update(&self.queue, offset, size, bytemuck::cast_slice(data))
    }

+    /// Starts an image upload that still needs the premultiply pass to be run
+    /// to complete it.
+    ///
+    /// Forwards to `ui::PremultiplyUpload::prepare` using this renderer's
+    /// device, queue, and premultiply alpha layout. `offset` is the location in
+    /// the final texture where the image will be placed.
+    pub fn prepare_premultiply_upload(
+        &self,
+        image: &image::RgbaImage,
+        offset: Vec2<u16>,
+    ) -> ui::PremultiplyUpload {
+        ui::PremultiplyUpload::prepare(
+            &self.device,
+            &self.queue,
+            &self.layouts.premultiply_alpha,
+            image,
+            offset,
+        )
+    }
+
    /// Queue to obtain a screenshot on the next frame render
    pub fn create_screenshot(
        &mut self,
--- a/voxygen/src/render/renderer/drawer.rs
+++ b/voxygen/src/render/renderer/drawer.rs
@ -12,6 +12,7 @@ use super::{
    rain_occlusion_map::{RainOcclusionMap, RainOcclusionMapRenderer},
    Renderer, ShadowMap, ShadowMapRenderer,
 };
+use common_base::prof_span;
 use core::{num::NonZeroU32, ops::Range};
 use std::sync::Arc;
 use vek::Aabr;
@ -424,6 +425,44 @@ impl<'frame> Drawer<'frame> {
        });
    }

+    /// Runs one render pass per target texture, drawing each of that texture's
+    /// pending premultiply uploads into it.
+    ///
+    /// `targets` yields each target texture together with the uploads destined
+    /// for it.
+    pub fn run_ui_premultiply_passes<'a>(
+        &mut self,
+        targets: impl Iterator<Item = (&'a super::super::Texture, Vec<ui::PremultiplyUpload>)>,
+    ) {
+        let encoder = self.encoder.as_mut().unwrap();
+        let device = self.borrow.device;
+
+        // TODO: What is the CPU overhead of each renderpass?
+        for (i, (target_texture, uploads)) in targets.enumerate() {
+            prof_span!("ui premultiply pass");
+            tracing::info!("{} uploads", uploads.len());
+            let profile_name = format!("ui_premultiply_pass {}", i);
+            let label = format!("ui premultiply pass {}", i);
+            // TODO: a GPU profile scope on each of the passes here may be a bit too fine
+            // grained.
+            let mut render_pass =
+                encoder.scoped_render_pass(&profile_name, device, &wgpu::RenderPassDescriptor {
+                    label: Some(&label),
+                    color_attachments: &[wgpu::RenderPassColorAttachment {
+                        view: &target_texture.view,
+                        resolve_target: None,
+                        ops: wgpu::Operations {
+                            // Keep the existing contents of the target. Each upload only
+                            // covers its own rectangle, so clearing here would erase
+                            // anything already stored elsewhere in this texture.
+                            load: wgpu::LoadOp::Load,
+                            store: true,
+                        },
+                    }],
+                    depth_stencil_attachment: None,
+                });
+            // TODO(review): no pipeline is bound in this pass; the premultiply alpha
+            // pipeline needs to be set on `render_pass` before these draws can run.
+            for upload in &uploads {
+                let (source_bind_group, push_constant_data) = upload.draw_data(target_texture);
+                let bytes = bytemuck::bytes_of(&push_constant_data);
+                render_pass.set_bind_group(0, source_bind_group, &[]);
+                render_pass.set_push_constants(wgpu::ShaderStage::VERTEX, 0, bytes);
+                // The 6 vertices of the rectangle are generated from the vertex index
+                // in the vertex shader. No index buffer is bound, so this must be a
+                // non-indexed draw.
+                render_pass.draw(0..6, 0..1);
+            }
+        }
+    }
+
    pub fn third_pass(&mut self) -> ThirdPassDrawer {
        let encoder = self.encoder.as_mut().unwrap();
        let device = self.borrow.device;
--- a/voxygen/src/render/renderer/pipeline_creation.rs
+++ b/voxygen/src/render/renderer/pipeline_creation.rs
@ -33,6 +33,7 @@ pub struct Pipelines {
    pub lod_object: lod_object::LodObjectPipeline,
    pub terrain: terrain::TerrainPipeline,
    pub ui: ui::UiPipeline,
+    pub premultiply_alpha: ui::PremultiplyAlphaPipeline,
    pub blit: blit::BlitPipeline,
 }

@ -79,6 +80,7 @@ pub struct IngameAndShadowPipelines {
 /// Use to decouple interface pipeline creation when initializing the renderer
 pub struct InterfacePipelines {
    pub ui: ui::UiPipeline,
+    pub premultiply_alpha: ui::PremultiplyAlphaPipeline,
    pub blit: blit::BlitPipeline,
 }

@ -100,6 +102,7 @@ impl Pipelines {
            lod_object: ingame.lod_object,
            terrain: ingame.terrain,
            ui: interface.ui,
+            premultiply_alpha: interface.premultiply_alpha,
            blit: interface.blit,
        }
    }
@ -127,6 +130,8 @@ struct ShaderModules {
    trail_frag: wgpu::ShaderModule,
    ui_vert: wgpu::ShaderModule,
    ui_frag: wgpu::ShaderModule,
+    premultiply_alpha_vert: wgpu::ShaderModule,
+    premultiply_alpha_frag: wgpu::ShaderModule,
    lod_terrain_vert: wgpu::ShaderModule,
    lod_terrain_frag: wgpu::ShaderModule,
    clouds_vert: wgpu::ShaderModule,
@ -336,6 +341,8 @@ impl ShaderModules {
            trail_frag: create_shader("trail-frag", ShaderKind::Fragment)?,
            ui_vert: create_shader("ui-vert", ShaderKind::Vertex)?,
            ui_frag: create_shader("ui-frag", ShaderKind::Fragment)?,
+            premultiply_alpha_vert: create_shader("premultiply-alpha-vert", ShaderKind::Vertex)?,
+            premultiply_alpha_frag: create_shader("premultiply-alpha-frag", ShaderKind::Fragment)?,
            lod_terrain_vert: create_shader("lod-terrain-vert", ShaderKind::Vertex)?,
            lod_terrain_frag: create_shader("lod-terrain-frag", ShaderKind::Fragment)?,
            clouds_vert: create_shader("clouds-vert", ShaderKind::Vertex)?,
@ -416,11 +423,11 @@ struct PipelineNeeds<'a> {
 fn create_interface_pipelines(
    needs: PipelineNeeds,
    pool: &rayon::ThreadPool,
-    tasks: [Task; 2],
+    tasks: [Task; 3],
 ) -> InterfacePipelines {
    prof_span!(_guard, "create_interface_pipelines");

-    let [ui_task, blit_task] = tasks;
+    let [ui_task, premultiply_alpha_task, blit_task] = tasks;
    // Construct a pipeline for rendering UI elements
    let create_ui = || {
        ui_task.run(
@ -438,6 +445,20 @@ fn create_interface_pipelines(
        )
    };

+    let create_premultiply_alpha = || {
+        premultiply_alpha_task.run(
+            || {
+                ui::PremultiplyAlphaPipeline::new(
+                    needs.device,
+                    &needs.shaders.premultiply_alpha_vert,
+                    &needs.shaders.premultiply_alpha_frag,
+                    &needs.layouts.premultiply_alpha,
+                )
+            },
+            "premultiply alpha pipeline creation",
+        )
+    };
+
    // Construct a pipeline for blitting, used during screenshotting
    let create_blit = || {
        blit_task.run(
@ -454,9 +475,15 @@ fn create_interface_pipelines(
        )
    };

-    let (ui, blit) = pool.join(create_ui, create_blit);
+    let (ui, (premultiply_alpha, blit)) = pool.join(create_ui, || {
+        pool.join(create_premultiply_alpha, create_blit)
+    });

-    InterfacePipelines { ui, blit }
+    InterfacePipelines {
+        ui,
+        premultiply_alpha,
+        blit,
+    }
 }

 /// Create IngamePipelines and shadow pipelines in parallel
--- a/voxygen/src/render/renderer/shaders.rs
+++ b/voxygen/src/render/renderer/shaders.rs
@ -73,6 +73,8 @@ impl assets::Compound for Shaders {
            "trail-frag",
            "ui-vert",
            "ui-frag",
+            "premultiply-alpha-vert",
+            "premultiply-alpha-frag",
            "lod-terrain-vert",
            "lod-terrain-frag",
            "clouds-vert",
--- a/voxygen/src/render/texture.rs
+++ b/voxygen/src/render/texture.rs
@ -224,6 +224,7 @@ impl Texture {
        );
    }

+    // TODO: remove `get` from this name
    /// Get dimensions of the represented image.
    pub fn get_dimensions(&self) -> vek::Vec3<u32> {
        vek::Vec3::new(
--- a/voxygen/src/ui/cache.rs
+++ b/voxygen/src/ui/cache.rs
@ -51,7 +51,9 @@ impl Cache {
        })
    }

-    pub fn glyph_cache_tex(&self) -> &(Texture, UiTextureBindGroup) { &self.glyph_cache_tex }
+    /// Borrows the glyph cache texture and its UI texture bind group.
+    pub fn glyph_cache_tex(&self) -> (&Texture, &UiTextureBindGroup) {
+        let (texture, bind_group) = &self.glyph_cache_tex;
+        (texture, bind_group)
+    }

    pub fn cache_mut_and_tex(
        &mut self,
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@ -4,7 +4,7 @@ pub mod renderer;
 pub use renderer::{SampleStrat, Transform};

 use crate::{
-    render::{Renderer, Texture, UiTextureBindGroup},
+    render::{Renderer, Texture, UiPremultiplyUpload, UiTextureBindGroup},
    ui::KeyedJobs,
 };
 use common::{figure::Segment, slowjob::SlowJobPool};
@ -12,7 +12,7 @@ use guillotiere::{size2, SimpleAtlasAllocator};
 use hashbrown::{hash_map::Entry, HashMap};
 use image::{DynamicImage, RgbaImage};
 use slab::Slab;
-use std::{hash::Hash, sync::Arc};
+use std::{borrow::Cow, hash::Hash, sync::Arc};
 use tracing::{error, warn};
 use vek::*;

@ -29,6 +29,7 @@ pub enum Graphic {
    Image(Arc<DynamicImage>, Option<Rgba<f32>>),
    // Note: none of the users keep this Arc currently
    Voxel(Arc<Segment>, Transform, SampleStrat),
+    // TODO: Re-evaluate whether we need this (especially outside conrod context)
    Blank,
 }

@ -63,11 +64,11 @@ pub struct TexId(usize);

 enum CachedDetails {
    Atlas {
-        // Index of the atlas this is cached in
+        // Index of the atlas this is cached in.
        atlas_idx: usize,
        // Whether this texture is valid.
        valid: bool,
-        // Where in the cache texture this is
+        // Where in the cache texture this is.
        aabr: Aabr<u16>,
    },
    Texture {
@ -76,10 +77,6 @@ enum CachedDetails {
        // Whether this texture is valid.
        valid: bool,
    },
-    Immutable {
-        // Index of the (unique, immutable, non-atlas) texture this is cached in.
-        index: usize,
-    },
 }

 impl CachedDetails {
@ -89,10 +86,8 @@ impl CachedDetails {
    fn info(
        &self,
        atlases: &[(SimpleAtlasAllocator, usize)],
-        textures: &Slab<(Texture, UiTextureBindGroup)>,
+        textures: &Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
    ) -> (usize, bool, Aabr<u16>) {
-        // NOTE: We don't accept images larger than u16::MAX (rejected in `cache_res`)
-        // (and probably would not be able to create a texture this large).
        match *self {
            CachedDetails::Atlas {
                atlas_idx,
@ -102,38 +97,136 @@ impl CachedDetails {
            CachedDetails::Texture { index, valid } => {
                (index, valid, Aabr {
                    min: Vec2::zero(),
-                    // Note texture should always match the cached dimensions
-                    max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
-                })
-            },
-            CachedDetails::Immutable { index } => {
-                (index, true, Aabr {
-                    min: Vec2::zero(),
-                    // Note texture should always match the cached dimensions
+                    // NOTE (as cast): We don't accept images larger than u16::MAX (rejected in
+                    // `cache_res`) (and probably would not be able to create a texture this
+                    // large).
+                    //
+                    // Note texture should always match the cached dimensions.
                    max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
                })
            },
        }
    }

-    /// Attempt to invalidate this cache entry.
-    /// If invalidation is not possible this returns the index of the texture to
-    /// deallocate
-    fn invalidate(&mut self) -> Result<(), usize> {
+    /// Invalidate this cache entry.
+    fn invalidate(&mut self) {
        match self {
            Self::Atlas { ref mut valid, .. } => {
                *valid = false;
-                Ok(())
            },
            Self::Texture { ref mut valid, .. } => {
                *valid = false;
-                Ok(())
            },
-            Self::Immutable { index } => Err(*index),
        }
    }
 }

+/// Requirements that a particular graphic has with respect to the atlas
+/// allocation or independent texture it will be stored in.
+///
+/// If this matches between an old graphic and a new one which is replacing it,
+/// we can reuse any of the corresponding locations where it is cached in
+/// textures on the GPU. That is we can invalidate such textures and upload the
+/// new graphic there, rather than needing to allocate a new texture (or new
+/// location in an atlas).
+#[derive(PartialEq)]
+enum TextureRequirements {
+    /// These are uploaded to the GPU in the original resolution of the image
+    /// supplied by the `Graphic` and any scaling is done during sampling in
+    /// the UI fragment shader.
+    Fixed {
+        size: Vec2<u16>,
+        /// Graphics with a border color specified are placed into their own
+        /// individual textures so that the border color can be set
+        /// there. (Note: this is partially a theoretical description as
+        /// border color options are limited in the current graphics API).
+        border_color: Option<Rgba<f32>>,
+    },
+    /// These are rasterized to the exact resolution that they will be displayed
+    /// at and then uploaded to the GPU. This corresponds to
+    /// `Graphic::Voxel`. There may be multiple copies on the GPU if
+    /// different resolutions are requested.
+    ///
+    /// It is expected that the requested sizes will generally not differ when
+    /// switching out a graphic. Thus, cached dependent graphics should always
+    /// be invalidated, since those cached locations will be reusable if the
+    /// requested size is the same.
+    Dependent,
+}
+
+/// These solely determine how a place in an atlas will be found or how a
+/// texture will be created to place the image for a graphic.
+struct TextureParameters {
+    // The fixed image size for `TextureRequirements::Fixed`, or the requested
+    // size for `TextureRequirements::Dependent`.
+    size: Vec2<u16>,
+    // Always `None` for `TextureRequirements::Dependent`.
+    border_color: Option<Rgba<f32>>,
+}
+
+/// Key used to refer to an instance of a graphic that has been uploaded to the
+/// GPU.
+#[derive(Clone, Copy, PartialEq, Eq, Hash)]
+struct CacheKey {
+    /// Id of the graphic this cache entry was created from.
+    graphic_id: Id,
+    /// This is `Some` for `TextureRequirements::Dependent` (holding the
+    /// requested size) and `None` for `TextureRequirements::Fixed`.
+    size: Option<Vec2<u16>>,
+}
+
+impl TextureRequirements {
+    /// Determines the texture requirements of `graphic`.
+    ///
+    /// Returns `None` for `Graphic::Blank`, and for images with a dimension
+    /// that is zero or greater than `u16::MAX` (an error is logged in that
+    /// case).
+    fn from_graphic(graphic: &Graphic) -> Option<Self> {
+        match graphic {
+            Graphic::Image(image, border_color) => {
+                // Image sizes over u16::MAX are not supported (and we would probably not be
+                // able to create a texture large enough to hold them on the GPU anyway)!
+                // Zero-sized images are rejected here as well.
+                let image_dims = match (u16::try_from(image.width()), u16::try_from(image.height()))
+                {
+                    (Ok(x), Ok(y)) if x != 0 && y != 0 => Vec2::new(x, y),
+                    _ => {
+                        // This branch is reached for both oversized and zero-sized
+                        // images, so the message covers both.
+                        error!(
+                            "Image dimensions that are zero or greater than u16::MAX are not \
+                             supported! Supplied image size: ({}, {}).",
+                            image.width(),
+                            image.height(),
+                        );
+                        // TODO: reasonable to return None on this error case? We could potentially
+                        // validate images sizes on add_graphic/replace_graphic?
+                        return None;
+                    },
+                };
+
+                Some(Self::Fixed {
+                    size: image_dims,
+                    border_color: *border_color,
+                })
+            },
+            Graphic::Voxel(_, _, _) => Some(Self::Dependent),
+            Graphic::Blank => None,
+        }
+    }
+
+    /// Produces the cache key and the texture parameters for a graphic with
+    /// these requirements.
+    ///
+    /// `requested_size` is only used for `Self::Dependent`, where it becomes
+    /// both the texture size and part of the cache key.
+    // TODO: what if requested size is 0? Do we currently panic on this case and
+    // expect caller not to ask for 0 size? (if so document that)
+    fn to_key_and_tex_parameters(
+        self,
+        graphic_id: Id,
+        requested_size: Vec2<u16>,
+    ) -> (CacheKey, TextureParameters) {
+        // NOTE: Any external parameters which influence the value of the returned
+        // `TextureParameters` must be included in the `CacheKey`. Otherwise,
+        // invalidation and subsequent re-use of cache locations based on the
+        // value of `self` would be wrong.
+        let (size, border_color, key_size) = match self {
+            Self::Fixed { size, border_color } => (size, border_color, None),
+            Self::Dependent => (requested_size, None, Some(requested_size)),
+        };
+        (
+            CacheKey {
+                graphic_id,
+                size: key_size,
+            },
+            TextureParameters { size, border_color },
+        )
+    }
+}
+
 // Caches graphics, only deallocates when changing screen resolution (completely
 // cleared)
 pub struct GraphicCache {
@ -142,27 +235,35 @@ pub struct GraphicCache {
    /// Next id to use when a new graphic is added
    next_id: u32,

-    /// Atlases with the index of their texture in the textures vec
+    /// Atlases with the index of their texture in the textures slab.
    atlases: Vec<(SimpleAtlasAllocator, usize)>,
-    textures: Slab<(Texture, UiTextureBindGroup)>,
+    /// Third tuple element is a list of pending premultiply + upload operations
+    /// for this frame. The purpose of this is to collect all the operations
+    /// together so that a single renderpass is performed for each target
+    /// texture.
+    textures: Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
    /// The location and details of graphics cached on the GPU.
    ///
    /// Graphic::Voxel images include the dimensions they were rasterized at in
    /// the key. Other images are scaled as part of sampling them on the
    /// GPU.
-    cache_map: HashMap<(Id, Option<Vec2<u16>>), CachedDetails>,
+    cache_map: HashMap<CacheKey, CachedDetails>,

-    keyed_jobs: KeyedJobs<(Id, Option<Vec2<u16>>), (RgbaImage, Option<Rgba<f32>>)>,
+    keyed_jobs: KeyedJobs<CacheKey, RgbaImage>,
 }
+
 impl GraphicCache {
    pub fn new(renderer: &mut Renderer) -> Self {
-        let (atlas, texture) = create_atlas_texture(renderer);
+        let (atlas, (tex, bind)) = create_atlas_texture(renderer);
+
+        let mut textures = Slab::new();
+        let tex_id = textures.insert((tex, bind, Vec::new()));

        Self {
            graphic_map: HashMap::default(),
            next_id: 0,
-            atlases: vec![(atlas, 0)],
-            textures: core::iter::once((0, texture)).collect(),
+            atlases: vec![(atlas, tex_id)],
+            textures,
            cache_map: HashMap::default(),
            keyed_jobs: KeyedJobs::new("IMAGE_PROCESSING"),
        }
@ -179,29 +280,64 @@ impl GraphicCache {
    }

    pub fn replace_graphic(&mut self, id: Id, graphic: Graphic) {
-        if self.graphic_map.insert(id, graphic).is_none() {
-            // This was not an update, so no need to search for keys.
-            return;
-        }
+        let (old, new) = match self.graphic_map.entry(id) {
+            Entry::Occupied(o) => {
+                let slot_mut = o.into_mut();
+                let old = core::mem::replace(slot_mut, graphic);
+                (old, slot_mut)
+            },
+            Entry::Vacant(v) => {
+                // This was not an update, so no need to cleanup caches.
+                v.insert(graphic);
+                return;
+            },
+        };

-        // Remove from caches
+        let old_requirements = TextureRequirements::from_graphic(&old);
+        let new_requirements = TextureRequirements::from_graphic(&new);
+        let should_invalidate = old_requirements == new_requirements && old_requirements.is_some();
+
+        // Invalidate if possible or remove from caches.
        // Maybe make this more efficient if replace graphic is used more often
-        self.cache_map.retain(|&(key_id, _), details| {
-            // If the entry does not reference id, or it does but we can successfully
-            // invalidate, retain the entry; otherwise, discard this entry completely.
-            key_id != id
-                || details
-                    .invalidate()
-                    .map_err(|index| self.textures.remove(index))
-                    .is_ok()
-        });
+        // (especially since we should know the exact key for non-voxel
+        // graphics).
+        //
+        // NOTE: at the time of writing, replace_graphic is only used for voxel minimap
+        // updates and item image reloading.
+        if should_invalidate {
+            self.cache_map.iter_mut().for_each(|(key, details)| {
+                if key.graphic_id == id {
+                    details.invalidate();
+                }
+            });
+        } else {
+            self.cache_map.drain_filter(|key, details| {
+                if key.graphic_id == id {
+                    match details {
+                        // TODO: if replace_graphic is used continuously for small images (i.e.
+                        // images placed into an atlas) of different sizes, that can use up our
+                        // atlas space since spots in the atlas can't be reused. (this scenario is
+                        // now possible with scaling being done during sampling rather than placing
+                        // resized version into the atlas)
+                        CachedDetails::Atlas { .. } => {},
+                        CachedDetails::Texture { index, .. } => {
+                            self.textures.remove(*index);
+                        },
+                    };
+                    true
+                } else {
+                    false
+                }
+            });
+        }
    }

    pub fn get_graphic(&self, id: Id) -> Option<&Graphic> { self.graphic_map.get(&id) }

    /// Used to acquire textures for rendering
-    pub fn get_tex(&self, id: TexId) -> &(Texture, UiTextureBindGroup) {
-        self.textures.get(id.0).expect("Invalid TexId used")
+    pub fn get_tex(&self, id: TexId) -> (&Texture, &UiTextureBindGroup) {
+        let (tex, bind, _uploads) = self.textures.get(id.0).expect("Invalid TexId used");
+        (tex, bind)
    }

    pub fn get_graphic_dims(&self, (id, rot): (Id, Rotation)) -> Option<(u32, u32)> {
@ -230,20 +366,28 @@ impl GraphicCache {
    pub fn clear_cache(&mut self, renderer: &mut Renderer) {
        self.cache_map.clear();

-        let (atlas, texture) = create_atlas_texture(renderer);
-        self.atlases = vec![(atlas, 0)];
-        self.textures = core::iter::once((0, texture)).collect();
+        let (atlas, (tex, bind)) = create_atlas_texture(renderer);
+        let mut textures = Slab::new();
+        let tex_id = textures.insert((tex, bind, Vec::new()));
+        self.atlases = vec![(atlas, tex_id)];
+        self.textures = textures;
    }

    /// Source rectangle should be from 0 to 1, and represents a bounding box
    /// for the source image of the graphic.
+    ///
+    /// [`complete_premultiply_uploads`](Self::complete_premultiply_uploads)
+    /// needs to be called to finalize updates on the GPU that are initiated
+    /// here. Thus, ideally that would be called before drawing UI elements
+    /// using the images cached here.
    pub fn cache_res(
        &mut self,
        renderer: &mut Renderer,
        pool: Option<&SlowJobPool>,
        graphic_id: Id,
-        // TODO: if we aren't resizing here we can upload image earlier... (as long as this doesn't
-        // lead to uploading too much unused stuff).
+        // TODO: if we aren't resizing here we can potentially upload the image earlier... (as long
+        // as this doesn't lead to uploading too much unused stuff). (currently not sure whether it
+        // would be an overall gain to pursue this.)
        requested_dims: Vec2<u16>,
        source: Aabr<f64>,
        rotation: Rotation,
@ -290,6 +434,7 @@ impl GraphicCache {
            // S-TODO: A bit hacky inserting this here, just to get things working initially
            let scale = requested_dims_upright.map2(
                Vec2::from(scaled.size()),
+                // S-TODO div by zero potential? If so, is NaN an issue in that case?
                |screen_pixels, sample_pixels: f64| screen_pixels as f32 / sample_pixels as f32,
            );
            let transformed = rotated_aabr(scaled);
@ -315,13 +460,9 @@ impl GraphicCache {
            },
        };

-        let key = (
-            graphic_id,
-            // Dimensions only included in the key for voxel graphics which we rasterize at the
-            // size that they will be displayed at (other images are scaled when sampling them on
-            // the GPU).
-            matches!(graphic, Graphic::Voxel { .. }).then(|| requested_dims_upright),
-        );
+        let requirements = TextureRequirements::from_graphic(&graphic)?;
+        let (key, texture_parameters) =
+            requirements.to_key_and_tex_parameters(graphic_id, requested_dims_upright);

        let details = match cache_map.entry(key) {
            Entry::Occupied(details) => {
@ -332,20 +473,23 @@ impl GraphicCache {
                // graphic
                if !valid {
                    // Create image
-                    let (image, border) = prepare_graphic(
+                    let image = prepare_graphic(
                        graphic,
-                        graphic_id,
+                        key,
                        requested_dims_upright,
+                        false,
                        &mut self.keyed_jobs,
                        pool,
                    )?;
-                    // If the cache location is invalid, we know the underlying texture is mutable,
-                    // so we should be able to replace the graphic.  However, we still want to make
-                    // sure that we are not reusing textures for images that specify a border
-                    // color.
-                    assert!(border.is_none());
+                    // Ensure we don't have any bugs causing the size used to determine if the
+                    // cached version is reusable to not match the size of the image produced by
+                    // prepare_graphic.
+                    assert_eq!(
+                        image.dimensions(),
+                        texture_parameters.size.map(u32::from).into_tuple()
+                    );
                    // Transfer to the gpu
-                    upload_image(renderer, aabr, &textures[idx].0, &image);
+                    upload_image(renderer, aabr, &mut textures[idx].2, &image);
                }

                return Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)));
@ -354,62 +498,49 @@ impl GraphicCache {
        };

        // Construct image in an optional threadpool.
-        let (image, border_color) = prepare_graphic(
+        let image = prepare_graphic(
            graphic,
-            graphic_id,
+            key,
            requested_dims_upright,
+            false,
            &mut self.keyed_jobs,
            pool,
        )?;
+        // Assert dimensions of image from `prepare_graphic` are as expected!
+        assert_eq!(
+            image.dimensions(),
+            texture_parameters.size.map(u32::from).into_tuple()
+        );
+        // Image dimensions in the format used by the allocator crate.
+        let image_dims_size2d = size2(
+            i32::from(texture_parameters.size.x),
+            i32::from(texture_parameters.size.y),
+        );

-        // Image sizes over u16::MAX are not supported (and we would probably not be
-        // able to create a texture large enough to hold them on the GPU anyway)!
-        let image_dims = match {
-            let (x, y) = image.dimensions();
-            (u16::try_from(x), u16::try_from(y))
-        } {
-            (Ok(x), Ok(y)) => Vec2::new(x, y),
-            _ => {
-                error!(
-                    "Image dimensions greater than u16::MAX are not supported! Supplied image \
-                     size: {:?}.",
-                    image.dimensions()
-                );
-                return None;
-            },
-        };
+        // Now we allocate space on the gpu (either in an atlas or an independent
+        // texture) and upload the image to that location.

-        // Upload
        let atlas_size = atlas_size(renderer);
-
-        // Allocate space on the gpu.
-        //
-        // Graphics with a border color.
-        let location = if let Some(border_color) = border_color {
-            // Create a new immutable texture.
-            let texture = create_image(renderer, image, border_color);
-            // NOTE: All mutations happen only after the upload succeeds!
-            let index = textures.insert(texture);
-            CachedDetails::Immutable { index }
-        // Graphics over a particular size compared to the atlas size are sent
-        // to their own textures. Here we check for ones under that
-        // size.
-        } else if atlas_size
-            .map2(image_dims, |a, d| a as f32 * ATLAS_CUTOFF_FRAC >= d as f32)
-            .reduce_and()
-        {
+        // Graphics that request a border color or which are over a particular size
+        // compared to the atlas size are sent to their own textures.
+        let can_place_in_atlas = texture_parameters.border_color.is_none()
+            && atlas_size
+                .map2(texture_parameters.size, |a, d| {
+                    a as f32 * ATLAS_CUTOFF_FRAC >= d as f32
+                })
+                .reduce_and();
+        let location = if can_place_in_atlas {
            // Fit into an atlas
            let mut loc = None;
            for (atlas_idx, &mut (ref mut atlas, texture_idx)) in atlases.iter_mut().enumerate() {
-                let clamped_dims = image_dims.map(|e| i32::from(e.max(1)));
-                if let Some(rectangle) = atlas.allocate(size2(clamped_dims.x, clamped_dims.y)) {
+                if let Some(rectangle) = atlas.allocate(image_dims_size2d) {
                    let aabr = aabr_from_alloc_rect(rectangle);
                    loc = Some(CachedDetails::Atlas {
                        atlas_idx,
                        valid: true,
                        aabr,
                    });
-                    upload_image(renderer, aabr, &textures[texture_idx].0, &image);
+                    upload_image(renderer, aabr, &mut textures[texture_idx].2, &image);
                    break;
                }
            }
@ -418,17 +549,16 @@ impl GraphicCache {
                Some(loc) => loc,
                // Create a new atlas
                None => {
-                    let (mut atlas, texture) = create_atlas_texture(renderer);
-                    let clamped_dims = image_dims.map(|e| i32::from(e.max(1)));
+                    let (mut atlas, (tex, bind)) = create_atlas_texture(renderer);
                    let aabr = atlas
-                        .allocate(size2(clamped_dims.x, clamped_dims.y))
+                        .allocate(image_dims_size2d)
                        .map(aabr_from_alloc_rect)
                        .unwrap();
                    // NOTE: All mutations happen only after the texture creation succeeds!
-                    let tex_idx = textures.insert(texture);
+                    let tex_idx = textures.insert((tex, bind, Vec::new()));
                    let atlas_idx = atlases.len();
                    atlases.push((atlas, tex_idx));
-                    upload_image(renderer, aabr, &textures[tex_idx].0, &image);
+                    upload_image(renderer, aabr, &mut textures[tex_idx].2, &image);
                    CachedDetails::Atlas {
                        atlas_idx,
                        valid: true,
@ -438,23 +568,11 @@ impl GraphicCache {
            }
        } else {
            // Create a texture just for this
-            let texture = {
-                let tex = renderer.create_dynamic_texture(image_dims.map(u32::from));
-                let bind = renderer.ui_bind_texture(&tex);
-                (tex, bind)
-            };
-            // NOTE: All mutations happen only after the texture creation succeeds!
-            let index = textures.insert(texture);
-            upload_image(
-                renderer,
-                Aabr {
-                    min: Vec2::zero(),
-                    // Note texture should always match the cached dimensions
-                    max: image_dims,
-                },
-                &textures[index].0,
-                &image,
-            );
+            let (tex, bind, uploads) = create_image(renderer, &image, texture_parameters);
+            // NOTE: All mutations happen only after the texture creation and upload
+            // initiation succeeds! (completing the upload does not have any failure cases
+            // afaik)
+            let index = textures.insert((tex, bind, uploads));
            CachedDetails::Texture { index, valid: true }
        };

@ -466,54 +584,77 @@ impl GraphicCache {

        Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)))
    }
+
+    /// Runs render passes with alpha premultiplication pipeline to complete any
+    /// pending uploads.
+    ///
+    /// This should be called before starting the pass where the ui is rendered.
+    pub fn complete_premultiply_uploads(&mut self, drawer: &mut crate::render::Drawer<'_>) {
+        drawer.run_ui_premultiply_passes(
+            self.textures
+                .iter_mut()
+                .map(|(_tex_id, (texture, _, uploads))| (&*texture, core::mem::take(uploads))),
+        );
+    }
 }

 /// Prepare the graphic into the form that will be uploaded to the GPU.
 ///
 /// For voxel graphics, draws the graphic at the specified dimensions.
 ///
-/// Also pre-multiplies alpha in images so they can be linearly filtered on the
-/// GPU.
-fn prepare_graphic(
-    graphic: &Graphic,
-    graphic_id: Id,
+/// Can also pre-multiply alpha in images so they can be linearly filtered on
+/// the GPU (this is optional since we also have a path to do this
+/// premultiplication on the GPU).
+fn prepare_graphic<'graphic>(
+    graphic: &'graphic Graphic,
+    cache_key: CacheKey,
    dims: Vec2<u16>,
-    keyed_jobs: &mut KeyedJobs<(Id, Option<Vec2<u16>>), (RgbaImage, Option<Rgba<f32>>)>,
+    premultiply_on_cpu: bool, // TODO: currently unused
+    keyed_jobs: &mut KeyedJobs<CacheKey, RgbaImage>,
    pool: Option<&SlowJobPool>,
-) -> Option<(RgbaImage, Option<Rgba<f32>>)> {
+) -> Option<Cow<'graphic, RgbaImage>> {
    match graphic {
        // Short-circuit spawning a job on the threadpool for blank graphics
        Graphic::Blank => None,
-        // Dimensions are only included in the key for Graphic::Voxel since otherwise we will
-        // resize on the GPU.
-        Graphic::Image(image, border_color) => keyed_jobs
-            .spawn(pool, (graphic_id, None), || {
-                let image = Arc::clone(image);
-                let border_color = *border_color;
-                move |_| {
-                    // Image will be rescaled when sampling from it on the GPU so we don't
-                    // need to resize it here.
-                    let mut image = image.to_rgba8();
-                    // TODO: could potentially do this when loading the image and for voxel
-                    // images maybe at some point in the `draw_vox` processing. Or we could
-                    // push it in the other direction and do conversion on the GPU.
-                    premultiply_alpha(&mut image);
-                    (image, border_color)
-                }
-            })
-            .map(|(_, v)| v),
+        Graphic::Image(image, _border_color) => {
+            if premultiply_on_cpu {
+                keyed_jobs
+                    .spawn(pool, cache_key, || {
+                        let image = Arc::clone(image);
+                        move |_| {
+                            // Image will be rescaled when sampling from it on the GPU so we don't
+                            // need to resize it here.
+                            let mut image = image.to_rgba8();
+                            // TODO: could potentially do this when loading the image and for voxel
+                            // images maybe at some point in the `draw_vox` processing. Or we could
+                            // push it in the other direction and do conversion on the GPU.
+                            premultiply_alpha(&mut image);
+                            image
+                        }
+                    })
+                    .map(|(_, v)| Cow::Owned(v))
+            } else if let Some(rgba) = image.as_rgba8() {
+                Some(Cow::Borrowed(rgba))
+            } else {
+                // TODO: we should require rgba8 format
+                warn!("Non-rgba8 image in UI used this may be deprecated.");
+                Some(Cow::Owned(image.to_rgba8()))
+            }
+        },
        Graphic::Voxel(segment, trans, sample_strat) => keyed_jobs
-            .spawn(pool, (graphic_id, Some(dims)), || {
+            .spawn(pool, cache_key, || {
                let segment = Arc::clone(segment);
                let (trans, sample_strat) = (*trans, *sample_strat);
                move |_| {
                    // Render voxel model at requested resolution
                    let mut image = renderer::draw_vox(&segment, dims, trans, sample_strat);
-                    premultiply_alpha(&mut image);
-                    (image, None)
+                    if premultiply_on_cpu {
+                        premultiply_alpha(&mut image);
+                    }
+                    image
                }
            })
-            .map(|(_, v)| v),
+            .map(|(_, v)| Cow::Owned(v)),
    }
 }

@ -525,19 +666,52 @@ fn atlas_size(renderer: &Renderer) -> Vec2<u32> {
        .map(|e| (e * GRAPHIC_CACHE_RELATIVE_SIZE).clamp(512, max_texture_size))
 }

+/// This creates a texture suitable for sampling from during the UI pass and
+/// rendering to during alpha premultiplication upload passes.
+fn create_image_texture(
+    renderer: &mut Renderer,
+    size: Vec2<u32>,
+    address_mode: Option<wgpu::AddressMode>,
+) -> (Texture, UiTextureBindGroup) {
+    let tex_info = wgpu::TextureDescriptor {
+        label: None,
+        size: wgpu::Extent3d {
+            width: size.x,
+            height: size.y,
+            depth_or_array_layers: 1,
+        },
+        mip_level_count: 1,
+        sample_count: 1,
+        dimension: wgpu::TextureDimension::D2,
+        format: wgpu::TextureFormat::Rgba8UnormSrgb,
+        usage: wgpu::TextureUsage::RENDER_ATTACHMENT | wgpu::TextureUsage::SAMPLED,
+    };
+    let view_info = wgpu::TextureViewDescriptor {
+        format: Some(tex_info.format),
+        dimension: Some(wgpu::TextureViewDimension::D2),
+        ..Default::default()
+    };
+    let address_mode = address_mode.unwrap_or(wgpu::AddressMode::ClampToEdge);
+    let sampler_info = wgpu::SamplerDescriptor {
+        address_mode_u: address_mode,
+        address_mode_v: address_mode,
+        mag_filter: wgpu::FilterMode::Linear,
+        min_filter: wgpu::FilterMode::Linear,
+        ..Default::default()
+    };
+    let tex = renderer.create_texture_raw(&tex_info, &view_info, &sampler_info);
+    let bind = renderer.ui_bind_texture(&tex);
+    (tex, bind)
+}
+
 fn create_atlas_texture(
    renderer: &mut Renderer,
 ) -> (SimpleAtlasAllocator, (Texture, UiTextureBindGroup)) {
    let size = atlas_size(renderer);
    // Note: here we assume the max texture size is under i32::MAX.
    let atlas = SimpleAtlasAllocator::new(size2(size.x as i32, size.y as i32));
-    let texture = {
-        let tex = renderer.create_dynamic_texture(size);
-        let bind = renderer.ui_bind_texture(&tex);
-        (tex, bind)
-    };
-
-    (atlas, texture)
+    let (tex, bind) = create_image_texture(renderer, size, None);
+    (atlas, (tex, bind))
 }

 fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
@ -550,37 +724,49 @@ fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
    }
 }

-fn upload_image(renderer: &mut Renderer, aabr: Aabr<u16>, tex: &Texture, image: &RgbaImage) {
+fn upload_image(
+    renderer: &mut Renderer,
+    aabr: Aabr<u16>,
+    target_texture_uploads: &mut Vec<UiPremultiplyUpload>,
+    image: &RgbaImage,
+) {
    let aabr = aabr.map(u32::from);
+    // Check that this image and the target aabr are the same size (otherwise there
+    // is a bug in this module).
+    debug_assert_eq!(aabr.size().into_tuple(), image.dimensions());
    let offset = aabr.min.into_array();
-    let size = aabr.size().into_array();
-    renderer.update_texture(
-        tex,
-        offset,
-        size,
-        // NOTE: Rgba texture, so each pixel is 4 bytes, ergo this cannot fail.
-        // We make the cast parameters explicit for clarity.
-        bytemuck::cast_slice::<u8, [u8; 4]>(image),
-    );
+
+    // TODO: can we transparently have cpu based version behind this (actually this
+    // would introduce more complexity to be able to do it in the background,
+    // but we could do it not in the background here especially for smaller
+    // things this would work well)
+    let upload = UiPremultiplyUpload::prepare(renderer, image, offset);
+    target_texture_uploads.push(upload);
+    //todo!()
 }

+// This is used for border_color.is_some() images (ie the map image).
 fn create_image(
    renderer: &mut Renderer,
-    image: RgbaImage,
-    _border_color: Rgba<f32>, // See TODO below
-) -> (Texture, UiTextureBindGroup) {
-    let tex = renderer
-        .create_texture(
-            &DynamicImage::ImageRgba8(image),
-            Some(wgpu::FilterMode::Linear),
+    image: &RgbaImage,
+    texture_parameters: TextureParameters,
+) -> (Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>) {
+    let (tex, bind) = create_image_texture(
+        renderer,
+        texture_parameters.size.map(u32::from),
+        texture_parameters
+            .border_color
            // TODO: either use the desktop only border color or just emulate this
-            // Some(border_color.into_array().into()),
-            Some(wgpu::AddressMode::ClampToBorder),
-        )
-        .expect("create_texture only panics if non ImageRbga8 is passed");
-    let bind = renderer.ui_bind_texture(&tex);
-
-    (tex, bind)
+            //.map(|c| c.into_array().into()),
+            .map(|_| wgpu::AddressMode::ClampToBorder),
+    );
+    let mut uploads = Vec::new();
+    let aabr = Aabr {
+        min: Vec2::zero(),
+        max: texture_parameters.size,
+    };
+    upload_image(renderer, aabr, &mut uploads, image);
+    (tex, bind, uploads)
 }

 fn premultiply_alpha(image: &mut RgbaImage) {
@ -592,7 +778,7 @@ fn premultiply_alpha(image: &mut RgbaImage) {
    // https://github.com/image-rs/image/blob/a1ce569afd476e881acafdf9e7a5bce294d0db9a/src/buffer.rs#L664
    let dims = image.dimensions();
    let image_buffer_len = dims.0 as usize * dims.1 as usize * 4;
-    let (arrays, end) = image[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
+    let (arrays, end) = (&mut **image)[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
    // Rgba8 has 4 bytes per pixel they should be no remainder when dividing by 4.
    let (end, _) = end.as_chunks_mut::<4>();
    end.iter_mut().for_each(|pixel| {
@ -637,3 +823,8 @@ fn premultiply_alpha(image: &mut RgbaImage) {
        }
    })
 }
+
+// Next step: Handling invalidation / removal of old textures when
+// replace_graphic is used under new resizing scheme.
+//
+// TODO: does screenshot texture have COPY_DST? I don't think it needs this.
--- a/voxygen/src/ui/ice/cache.rs
+++ b/voxygen/src/ui/ice/cache.rs
@ -61,7 +61,9 @@ impl Cache {
        })
    }

-    pub fn glyph_cache_tex(&self) -> &(Texture, UiTextureBindGroup) { &self.glyph_cache_tex }
+    pub fn glyph_cache_tex(&self) -> (&Texture, &UiTextureBindGroup) {
+        (&self.glyph_cache_tex.0, &self.glyph_cache_tex.1)
+    }

    pub fn glyph_cache_mut_and_tex(&mut self) -> (&mut GlyphBrush, &(Texture, UiTextureBindGroup)) {
        (self.glyph_brush.get_mut(), &self.glyph_cache_tex)
--- a/voxygen/src/ui/ice/renderer/mod.rs
+++ b/voxygen/src/ui/ice/renderer/mod.rs
@ -791,7 +791,7 @@ impl IcedRenderer {
                        DrawKind::Image(tex_id) => self.cache.graphic_cache().get_tex(*tex_id),
                        DrawKind::Plain => self.cache.glyph_cache_tex(),
                    };
-                    drawer.draw(&tex.1, verts.clone()); // Note: trivial clone
+                    drawer.draw(tex.1, verts.clone()); // Note: trivial clone
                },
            }
        }
--- a/voxygen/src/ui/mod.rs
+++ b/voxygen/src/ui/mod.rs
@ -1073,7 +1073,7 @@ impl Ui {
                        DrawKind::Image(tex_id) => self.cache.graphic_cache().get_tex(*tex_id),
                        DrawKind::Plain => self.cache.glyph_cache_tex(),
                    };
-                    drawer.draw(&tex.1, verts.clone()); // Note: trivial clone
+                    drawer.draw(tex.1, verts.clone()); // Note: trivial clone
                },
            }
        }