From d62bf8a7903ef9679a215e5ac9cbcf93686444cf Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Mon, 29 Aug 2022 00:42:14 -0400
Subject: [PATCH 01/14] init (scaling UI images during sampling on the GPU
 instead of on the CPU)

---
 .../element/ui/generic/frames/banner_bot.png  |   4 +-
 .../element/ui/generic/frames/esc_menu.png    |   4 +-
 assets/voxygen/shaders/ui-frag.glsl           | 190 +++++++++++++++++-
 assets/voxygen/shaders/ui-vert.glsl           |  13 +-
 voxygen/src/render/mod.rs                     |   3 +
 voxygen/src/render/pipelines/ui.rs            |  83 +++++++-
 voxygen/src/render/renderer/binding.rs        |   8 +-
 voxygen/src/render/texture.rs                 |   4 +-
 voxygen/src/ui/graphic/mod.rs                 |  81 +++++---
 voxygen/src/ui/ice/renderer/mod.rs            |  10 +-
 voxygen/src/ui/mod.rs                         |  12 +-
 11 files changed, 352 insertions(+), 60 deletions(-)

diff --git a/assets/voxygen/element/ui/generic/frames/banner_bot.png b/assets/voxygen/element/ui/generic/frames/banner_bot.png
index 1e06228b78..695b03d727 100644
--- a/assets/voxygen/element/ui/generic/frames/banner_bot.png
+++ b/assets/voxygen/element/ui/generic/frames/banner_bot.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f83239c77d84ba225425a4ba36e439d105df12deef189a36897ce548a70ae0ee
-size 110
+oid sha256:bec5e310f6793255dc77eb04ba2ffaa9786d288108576503cbac8a133d4347ae
+size 149
diff --git a/assets/voxygen/element/ui/generic/frames/esc_menu.png b/assets/voxygen/element/ui/generic/frames/esc_menu.png
index 0c6a4e0faa..ad36939de5 100644
--- a/assets/voxygen/element/ui/generic/frames/esc_menu.png
+++ b/assets/voxygen/element/ui/generic/frames/esc_menu.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7adc6dea422a19eb8153c0f7a5c28095f69c9f099c816c9905a473c210fc6a87
-size 113
+oid sha256:06de968f79c35f9258467eaa726e973c5b2bf0ebed74f04a205f6d7ad871975e
+size 239
diff --git a/assets/voxygen/shaders/ui-frag.glsl b/assets/voxygen/shaders/ui-frag.glsl
index 549f60bfb3..c1f7721580 100644
--- a/assets/voxygen/shaders/ui-frag.glsl
+++ b/assets/voxygen/shaders/ui-frag.glsl
@@ -1,31 +1,215 @@
 #version 420 core
 
 #include <globals.glsl>
+#include <constants.glsl>
 
 layout(location = 0) in vec2 f_uv;
 layout(location = 1) in vec4 f_color;
-layout(location = 2) flat in uint f_mode;
+layout(location = 2) flat in vec2 f_scale;
+layout(location = 3) flat in uint f_mode;
 
 layout (std140, set = 1, binding = 0)
 uniform u_locals {
     vec4 w_pos;
 };
 
+// TODO: swap with u_locals because that may change more frequently?
 layout(set = 2, binding = 0)
 uniform texture2D t_tex;
 layout(set = 2, binding = 1)
 uniform sampler s_tex;
+layout (std140, set = 2, binding = 2)
+uniform tex_locals {
+    uvec2 texture_size;
+};
 
 layout(location = 0) out vec4 tgt_color;
 
+// Adjusts the provided uv value to account for coverage of pixels from the
+// sampled texture by the current fragment when upscaling.
+//
+// * `pos` - Position in the sampled texture in pixel coordinates. This is
+//   where the center of the current fragment lies on the sampled texture.
+// * `scale` - Scaling of pixels from the sampled texture to the render target.
+//   This is the amount of fragments that each pixel from the sampled texture
+//   covers.
+float upscale_adjust(float pos, float scale) {
+    // To retain crisp borders of upscaled pixel art, images are upscaled
+    // following the algorithm outlined here:
+    //
+    // https://csantosbh.wordpress.com/2014/01/25/manual-texture-filtering-for-pixelated-games-in-webgl/
+    // 
+    // `min(x * scale, 0.5) + max((x - 1.0) * scale, 0.0)`
+    //
+    float frac = fract(pos);
+    // Right of nearest pixel in the sampled texture.
+    float base = floor(pos);
+    // This will be 0.5 when the current fragment lies entirely inside a pixel
+    // in the sampled texture.
+    float adjustment = min(frac * scale, 0.5) + max((frac - 1.0) * scale + 0.5, 0.0);
+    return base + adjustment;
+}
+
+// Computes info needed for downscaling using two samples in a single
+// dimension. This info includes the two position to sample at (called
+// `offsets` even though they aren't actually offsets from the supplied
+// position) and the `weights` to multiply each of those samples by before
+// combining them.
+//
+// See `upscale_adjust` for semantics of `pos` and `scale` parameters.
+// 
+// Ouput via `weights` and `offsets` parameters.
+void downscale_params(float pos, float scale, out vec2 weights, out vec2 offsets) {
+    // For `scale` 0.33333..1.0 we round to the nearest pixel edge and split
+    // there. We compute the length of each side. Then the sampling point is
+    // computed as this distance from the split point via this formula where
+    // `l` is the length of that side of split:
+    //
+    // `1.5 - (1.0 / max(l, 1.0))`
+    //
+    // For `scale` ..0.3333 the current fragment can potentially span more than
+    // 4 pixels (within a single dimension) in the sampled texture. So we can't
+    // perfectly compute the contribution of each covered pixel in the sampled
+    // texture with only 2 samples (along each dimension). Thus, we fallback to
+    // an imperfect technique of of just sampling a 1 pixel length from the
+    // center on each side. An alternative would be to pre-compute mipmap
+    // levels that could be sampled from.
+    if (scale > (1.0 / 3.0)) {
+        // Width of the fragment in terms of pixels in the sampled texture.
+        float width = 1.0 / scale;
+        // Right side of the fragment in the sampled texture.
+        float right = pos - width / 2.0;
+        float split = round(pos);
+        float right_len = split - right;
+        float left_len = width - right_len;
+        float right_sample_offset = 1.5 - (1.0 / max(right_len, 1.0));
+        float left_sample_offset = 1.5 - (1.0 / max(left_len, 1.0));
+        offsets = vec2(split) + vec2(-right_sample_offset, left_sample_offset);
+        weights = vec2(right_len, left_len) / width;
+    } else {
+        offsets = pos + vec2(-1.0, 1.0);
+        // We split in the middle so weights for both sides are the same.
+        weights = vec2(0.5);
+    }
+}
+
+// 1 sample
+vec4 upscale_xy(vec2 uv_pixel, vec2 scale) {
+    // When slowly panning something (e.g. the map), a very small amount of
+    // wobbling is still observable (not as much as nearest sampling). It
+    // is possible to eliminate this by making the edges slightly blurry by
+    // lowering the scale a bit here. However, this does make edges little
+    // less crisp and can cause bleeding in from other images packed into
+    // the atlas in the current setup.
+    vec2 adjusted = vec2(upscale_adjust(uv_pixel.x, scale.x), upscale_adjust(uv_pixel.y, scale.y));
+    // Convert back to 0.0..1.0 by dividing by texture size.
+    vec2 uv = adjusted / texture_size;
+    return textureLod(sampler2D(t_tex, s_tex), uv, 0);
+}
+
+// 2 samples
+vec4 upscale_x_downscale_y(vec2 uv_pixel, vec2 scale) {
+    float x_adjusted = upscale_adjust(uv_pixel.x, scale.x);
+    vec2 weights, offsets;
+    downscale_params(uv_pixel.y, scale.y, weights, offsets);
+    vec2 uv0 = vec2(x_adjusted, offsets[0]) / texture_size;
+    vec2 uv1 = vec2(x_adjusted, offsets[1]) / texture_size;
+    vec4 s0 = textureLod(sampler2D(t_tex, s_tex), uv0, 0);
+    vec4 s1 = textureLod(sampler2D(t_tex, s_tex), uv1, 0);
+    return s0 * weights[0] + s1 * weights[1];
+}
+
+// 2 samples
+vec4 downscale_x_upscale_y(vec2 uv_pixel, vec2 scale) {
+    float y_adjusted = upscale_adjust(uv_pixel.y, scale.y);
+    vec2 weights, offsets;
+    downscale_params(uv_pixel.x, scale.x, weights, offsets);
+    vec2 uv0 = vec2(offsets[0], y_adjusted) / texture_size;
+    vec2 uv1 = vec2(offsets[1], y_adjusted) / texture_size;
+    vec4 s0 = textureLod(sampler2D(t_tex, s_tex), uv0, 0);
+    vec4 s1 = textureLod(sampler2D(t_tex, s_tex), uv1, 0);
+    return s0 * weights[0] + s1 * weights[1];
+}
+
+// 4 samples
+vec4 downscale_xy(vec2 uv_pixel, vec2 scale) {
+    vec2 weights_x, offsets_x, weights_y, offsets_y;
+    downscale_params(uv_pixel.x, scale.x, weights_x, offsets_x);
+    downscale_params(uv_pixel.y, scale.y, weights_y, offsets_y);
+    vec2 uv0 = vec2(offsets_x[0], offsets_y[0]) / texture_size;
+    vec2 uv1 = vec2(offsets_x[1], offsets_y[0]) / texture_size;
+    vec2 uv2 = vec2(offsets_x[0], offsets_y[1]) / texture_size;
+    vec2 uv3 = vec2(offsets_x[1], offsets_y[1]) / texture_size;
+    uv0 = uv_pixel / texture_size;
+    uv1 = uv_pixel / texture_size;
+    uv2 = uv_pixel / texture_size;
+    uv3 = uv_pixel / texture_size;
+    vec4 s0 = textureLod(sampler2D(t_tex, s_tex), uv0, 0);
+    vec4 s1 = textureLod(sampler2D(t_tex, s_tex), uv1, 0);
+    vec4 s2 = textureLod(sampler2D(t_tex, s_tex), uv2, 0);
+    vec4 s3 = textureLod(sampler2D(t_tex, s_tex), uv3, 0);
+    vec4 s01 = s0 * weights_x[0] + s1 * weights_x[1];
+    vec4 s23 = s2 * weights_x[0] + s3 * weights_x[1];
+    // Useful to visualize things below the limit where downscaling is supposed
+    // to be accurate.
+    /*if (scale.x < (1.0 / 3.0)) {
+        return vec4(1, 0, 0, 1);
+    }*/
+    return s01 * weights_y[0] + s23 * weights_y[1];
+}
+
 void main() {
     // Text
     if (f_mode == uint(0)) {
-        tgt_color = f_color * vec4(1.0, 1.0, 1.0, textureLod(sampler2D(t_tex, s_tex), f_uv, 0).a);
+        // NOTE: This now uses linear filter since all `Texture::new_dynamic`
+        // was changed to this by default. Glyphs are usually rasterized to be
+        // pretty close to the target size (so the filter change may have no
+        // effect), but there are thresholds within which the same rasterized
+        // glyph will be re-used. I wasn't able to observe any differences.
+        vec2 uv = f_uv;
+        #ifdef EXPERIMENTAL_UINEARESTSCALING
+            uv = (floor(uv * texture_size) + 0.5) / texture_size;
+        #endif
+        tgt_color = f_color * vec4(1.0, 1.0, 1.0, textureLod(sampler2D(t_tex, s_tex), uv, 0).a);
     // Image
     // HACK: bit 0 is set for both ordinary and north-facing images.
     } else if ((f_mode & uint(1)) == uint(1)) {
-        tgt_color = f_color * textureLod(sampler2D(t_tex, s_tex), f_uv, 0);
+        // NOTE: We don't have to account for bleeding over the border of an image
+        // due to how the ui currently handles rendering images. Currently, any
+        // edges of an image being rendered that don't line up with a pixel are
+        // snapped to a pixel, so we will never render any pixels containing an
+        // image that lie partly outside that image (and thus the sampling here
+        // will never try to sample outside an image). So we don't have to
+        // worry about bleeding in the atlas and/or what the border behavior
+        // should be.
+
+        // TODO: benchmark before and after by viewing zoomed out map in the UI
+        // (at particular ui scale and map zoom out, far enough to trigger downscaling)
+        // (seems like ~10% increase if any)
+        
+        // Convert to sampled pixel coordinates.
+        vec2 uv_pixel = f_uv * texture_size;
+        vec4 image_color = vec4(1.0, 0, 0, 1);
+        #ifdef EXPERIMENTAL_UINEARESTSCALING
+            vec2 uv = (floor(uv_pixel) + 0.5) / texture_size;
+            image_color = textureLod(sampler2D(t_tex, s_tex), uv, 0);
+        #else 
+            if (f_scale.x >= 1.0) {
+                if (f_scale.y >= 1.0) {
+                    image_color = upscale_xy(uv_pixel, f_scale);     
+                } else {
+                    image_color = upscale_x_downscale_y(uv_pixel, f_scale);     
+                }
+            } else {
+                if (f_scale.y >= 1.0) {
+                    image_color = downscale_x_upscale_y(uv_pixel, f_scale);     
+                } else {
+                    image_color = downscale_xy(uv_pixel, f_scale);     
+                }
+            }
+        #endif
+
+        tgt_color = f_color * image_color;
     // 2D Geometry
     } else if (f_mode == uint(2)) {
         tgt_color = f_color;
diff --git a/assets/voxygen/shaders/ui-vert.glsl b/assets/voxygen/shaders/ui-vert.glsl
index 7c25f01fd3..0990656757 100644
--- a/assets/voxygen/shaders/ui-vert.glsl
+++ b/assets/voxygen/shaders/ui-vert.glsl
@@ -6,7 +6,8 @@ layout(location = 0) in vec2 v_pos;
 layout(location = 1) in vec2 v_uv;
 layout(location = 2) in vec4 v_color;
 layout(location = 3) in vec2 v_center;
-layout(location = 4) in uint v_mode;
+layout(location = 4) in vec2 v_scale;
+layout(location = 5) in uint v_mode;
 
 layout (std140, set = 1, binding = 0)
 uniform u_locals {
@@ -17,10 +18,15 @@ layout(set = 2, binding = 0)
 uniform texture2D t_tex;
 layout(set = 2, binding = 1)
 uniform sampler s_tex;
+layout (std140, set = 2, binding = 2)
+uniform tex_locals {
+    uvec2 texture_size;
+};
 
 layout(location = 0) out vec2 f_uv;
 layout(location = 1) out vec4 f_color;
-layout(location = 2) flat out uint f_mode;
+layout(location = 2) flat out vec2 f_scale;
+layout(location = 3) flat out uint f_mode;
 
 void main() {
     f_color = v_color;
@@ -39,7 +45,7 @@ void main() {
         gl_Position = vec4(v_pos, 0.5, 1.0);
         vec2 look_at_dir = normalize(vec2(-view_mat[0][2], -view_mat[1][2]));
         // TODO: Consider cleaning up matrix to something more efficient (e.g. a mat3).
-        vec2 aspect_ratio = textureSize(sampler2D(t_tex, s_tex), 0).yx;
+        vec2 aspect_ratio = texture_size.yx;
         mat2 look_at = mat2(look_at_dir.y, look_at_dir.x, -look_at_dir.x, look_at_dir.y);
         vec2 v_centered = (v_uv - v_center) / aspect_ratio;
         vec2 v_rotated = look_at * v_centered;
@@ -60,5 +66,6 @@ void main() {
         gl_Position = vec4(v_pos, 0.5, 1.0);
     }
 
+    f_scale = v_scale;
     f_mode = v_mode;
 }
diff --git a/voxygen/src/render/mod.rs b/voxygen/src/render/mod.rs
index 76494cb8b2..601200128f 100644
--- a/voxygen/src/render/mod.rs
+++ b/voxygen/src/render/mod.rs
@@ -536,4 +536,7 @@ pub enum ExperimentalShader {
     SmearReflections,
     /// Apply the point shadows from cheap shadows on top of shadow mapping.
     PointShadowsWithShadowMapping,
+    /// Make the UI uses nearest neighbor filtering for scaling images instead
+    /// of trying to filter based on the coverage of the sampled pixels.
+    UiNearestScaling,
 }
diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index 8181af9e10..a741020103 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -10,12 +10,17 @@ pub struct Vertex {
     uv: [f32; 2],
     color: [f32; 4],
     center: [f32; 2],
+    // Used calculating where to sample scaled images.
+    scale: [f32; 2],
     mode: u32,
 }
 
 impl Vertex {
     fn desc<'a>() -> wgpu::VertexBufferLayout<'a> {
-        const ATTRIBUTES: [wgpu::VertexAttribute; 5] = wgpu::vertex_attr_array![0 => Float32x2, 1 => Float32x2, 2 => Float32x4, 3 => Float32x2, 4 => Uint32];
+        const ATTRIBUTES: [wgpu::VertexAttribute; 6] = wgpu::vertex_attr_array![
+            0 => Float32x2, 1 => Float32x2, 2 => Float32x4,
+            3 => Float32x2, 4 => Float32x2,    5 => Uint32,
+        ];
         wgpu::VertexBufferLayout {
             array_stride: Self::STRIDE,
             step_mode: wgpu::InputStepMode::Vertex,
@@ -47,6 +52,20 @@ impl Default for Locals {
     fn default() -> Self { Self { pos: [0.0; 4] } }
 }
 
+#[repr(C)]
+#[derive(Copy, Clone, Debug, Zeroable, Pod)]
+pub struct TexLocals {
+    texture_size: [u32; 2],
+}
+
+impl From<Vec2<u32>> for TexLocals {
+    fn from(texture_size: Vec2<u32>) -> Self {
+        Self {
+            texture_size: texture_size.into_array(),
+        }
+    }
+}
+
 /// Draw text from the text cache texture `tex` in the fragment shader.
 pub const MODE_TEXT: u32 = 0;
 /// Draw an image from the texture at `tex` in the fragment shader.
@@ -64,22 +83,44 @@ pub const MODE_IMAGE_SOURCE_NORTH: u32 = 3;
 /// FIXME: Make more principled.
 pub const MODE_IMAGE_TARGET_NORTH: u32 = 5;
 
+#[derive(Clone, Copy)]
 pub enum Mode {
     Text,
-    Image,
+    Image {
+        scale: Vec2<f32>,
+    },
     Geometry,
-    ImageSourceNorth,
-    ImageTargetNorth,
+    /// Draw an image from the texture at `tex` in the fragment shader, with the
+    /// source rectangle rotated to face north (TODO: detail on what "north"
+    /// means here).
+    ImageSourceNorth {
+        scale: Vec2<f32>,
+    },
+    /// Draw an image from the texture at `tex` in the fragment shader, with the
+    /// target rectangle rotated to face north. (TODO: detail on what "target"
+    /// means)
+    ImageTargetNorth {
+        scale: Vec2<f32>,
+    },
 }
 
 impl Mode {
     fn value(self) -> u32 {
         match self {
             Mode::Text => MODE_TEXT,
-            Mode::Image => MODE_IMAGE,
+            Mode::Image { .. } => MODE_IMAGE,
             Mode::Geometry => MODE_GEOMETRY,
-            Mode::ImageSourceNorth => MODE_IMAGE_SOURCE_NORTH,
-            Mode::ImageTargetNorth => MODE_IMAGE_TARGET_NORTH,
+            Mode::ImageSourceNorth { .. } => MODE_IMAGE_SOURCE_NORTH,
+            Mode::ImageTargetNorth { .. } => MODE_IMAGE_TARGET_NORTH,
+        }
+    }
+
+    /// Gets the scaling of the displayed image compared to the source.
+    fn scale(self) -> Vec2<f32> {
+        match self {
+            Mode::ImageSourceNorth { scale } | Mode::ImageTargetNorth { scale } => scale,
+            Mode::Image { scale } => scale,
+            Mode::Text | Mode::Geometry => Vec2::one(),
         }
     }
 }
@@ -137,6 +178,17 @@ impl UiLayout {
                         },
                         count: None,
                     },
+                    // tex_locals
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 2,
+                        visibility: wgpu::ShaderStage::VERTEX | wgpu::ShaderStage::FRAGMENT,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
                 ],
             }),
         }
@@ -158,7 +210,12 @@ impl UiLayout {
         }
     }
 
-    pub fn bind_texture(&self, device: &wgpu::Device, texture: &Texture) -> TextureBindGroup {
+    pub fn bind_texture(
+        &self,
+        device: &wgpu::Device,
+        texture: &Texture,
+        tex_locals: Consts<TexLocals>,
+    ) -> TextureBindGroup {
         let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
             label: None,
             layout: &self.texture,
@@ -171,6 +228,10 @@ impl UiLayout {
                     binding: 1,
                     resource: wgpu::BindingResource::Sampler(&texture.sampler),
                 },
+                wgpu::BindGroupEntry {
+                    binding: 2,
+                    resource: tex_locals.buf().as_entire_binding(),
+                },
             ],
         });
 
@@ -268,17 +329,19 @@ pub fn create_quad_vert_gradient(
     let top_color = top_color.into_array();
     let bottom_color = bottom_color.into_array();
 
-    let center = if let Mode::ImageSourceNorth = mode {
+    let center = if let Mode::ImageSourceNorth { .. } = mode {
         uv_rect.center().into_array()
     } else {
         rect.center().into_array()
     };
+    let scale = mode.scale().into_array();
     let mode_val = mode.value();
     let v = |pos, uv, color| Vertex {
         pos,
         uv,
         center,
         color,
+        scale,
         mode: mode_val,
     };
     let aabr_to_lbrt = |aabr: Aabr<f32>| (aabr.min.x, aabr.min.y, aabr.max.x, aabr.max.y);
@@ -315,12 +378,14 @@ pub fn create_tri(
     mode: Mode,
 ) -> Tri<Vertex> {
     let center = [0.0, 0.0];
+    let scale = mode.scale().into_array();
     let mode_val = mode.value();
     let v = |pos, uv| Vertex {
         pos,
         uv,
         center,
         color: color.into_array(),
+        scale,
         mode: mode_val,
     };
     Tri::new(
diff --git a/voxygen/src/render/renderer/binding.rs b/voxygen/src/render/renderer/binding.rs
index 6deffb0f28..a97c0be179 100644
--- a/voxygen/src/render/renderer/binding.rs
+++ b/voxygen/src/render/renderer/binding.rs
@@ -47,8 +47,12 @@ impl Renderer {
         self.layouts.ui.bind_locals(&self.device, locals)
     }
 
-    pub fn ui_bind_texture(&self, texture: &Texture) -> ui::TextureBindGroup {
-        self.layouts.ui.bind_texture(&self.device, texture)
+    pub fn ui_bind_texture(&mut self, texture: &Texture) -> ui::TextureBindGroup {
+        let tex_locals = ui::TexLocals::from(texture.get_dimensions().xy());
+        let tex_locals_consts = self.create_consts(&[tex_locals]);
+        self.layouts
+            .ui
+            .bind_texture(&self.device, texture, tex_locals_consts)
     }
 
     pub fn create_figure_bound_locals(
diff --git a/voxygen/src/render/texture.rs b/voxygen/src/render/texture.rs
index 39db02e2ba..fe07cb1921 100644
--- a/voxygen/src/render/texture.rs
+++ b/voxygen/src/render/texture.rs
@@ -136,8 +136,8 @@ impl Texture {
             address_mode_u: wgpu::AddressMode::ClampToEdge,
             address_mode_v: wgpu::AddressMode::ClampToEdge,
             address_mode_w: wgpu::AddressMode::ClampToEdge,
-            mag_filter: wgpu::FilterMode::Nearest,
-            min_filter: wgpu::FilterMode::Nearest,
+            mag_filter: wgpu::FilterMode::Linear,
+            min_filter: wgpu::FilterMode::Linear,
             mipmap_filter: wgpu::FilterMode::Nearest,
             ..Default::default()
         };
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 5bf35e870b..859c81b084 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -61,7 +61,6 @@ pub struct Id(u32);
 #[derive(PartialEq, Eq, Hash, Copy, Clone)]
 pub struct TexId(usize);
 
-type Parameters = (Id, Vec2<u16>);
 // TODO replace with slab/slotmap
 type GraphicMap = HashMap<Id, Graphic>;
 
@@ -93,7 +92,7 @@ impl CachedDetails {
     fn info(
         &self,
         atlases: &[(SimpleAtlasAllocator, usize)],
-        dims: Vec2<u16>,
+        textures: &Slab<(Texture, UiTextureBindGroup)>,
     ) -> (usize, bool, Aabr<u16>) {
         match *self {
             CachedDetails::Atlas {
@@ -105,14 +104,16 @@ impl CachedDetails {
                 (index, valid, Aabr {
                     min: Vec2::zero(),
                     // Note texture should always match the cached dimensions
-                    max: dims,
+                    // TODO: Justify cast here?
+                    max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
                 })
             },
             CachedDetails::Immutable { index } => {
                 (index, true, Aabr {
                     min: Vec2::zero(),
                     // Note texture should always match the cached dimensions
-                    max: dims,
+                    // TODO: Justify cast here?
+                    max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
                 })
             },
         }
@@ -147,7 +148,7 @@ pub struct GraphicCache {
     atlases: Vec<(SimpleAtlasAllocator, usize)>,
     textures: Slab<(Texture, UiTextureBindGroup)>,
     // Stores the location of graphics rendered at a particular resolution and cached on the cpu
-    cache_map: HashMap<Parameters, CachedDetails>,
+    cache_map: HashMap<Id, CachedDetails>,
 
     keyed_jobs: KeyedJobs<(Id, Vec2<u16>), Option<(RgbaImage, Option<Rgba<f32>>)>>,
 }
@@ -237,17 +238,23 @@ impl GraphicCache {
         renderer: &mut Renderer,
         pool: Option<&SlowJobPool>,
         graphic_id: Id,
+        // TODO: if we aren't resizing here we can upload image earlier... (as long as this doesn't
+        // lead to uploading too much unused stuff). (cache_res name invalid)
         dims: Vec2<u16>,
         source: Aabr<f64>,
         rotation: Rotation,
-    ) -> Option<(Aabr<f64>, TexId)> {
+    ) -> Option<((Aabr<f64>, Vec2<f32>), TexId)> {
         let dims = match rotation {
+            // The image is placed into the atlas with no rotation, so we need to swap the
+            // dimensions here to get the resolution that the image will be displayed at but
+            // re-oriented into the "upright" space that the image is stored in and sampled from
+            // (this can be bit confusing initially / hard to explain).
             Rotation::Cw90 | Rotation::Cw270 => Vec2::new(dims.y, dims.x),
             Rotation::None | Rotation::Cw180 => dims,
             Rotation::SourceNorth => dims,
             Rotation::TargetNorth => dims,
         };
-        let key = (graphic_id, dims);
+        let key = graphic_id;
 
         // Rotate aabr according to requested rotation.
         let rotated_aabr = |Aabr { min, max }| match rotation {
@@ -264,7 +271,7 @@ impl GraphicCache {
         };
         // Scale aabr according to provided source rectangle.
         let scaled_aabr = |aabr: Aabr<_>| {
-            let size: Vec2<_> = aabr.size().into();
+            let size: Vec2<f64> = aabr.size().into();
             Aabr {
                 min: size.mul_add(source.min, aabr.min),
                 max: size.mul_add(source.max, aabr.min),
@@ -272,7 +279,19 @@ impl GraphicCache {
         };
         // Apply all transformations.
         // TODO: Verify rotation is being applied correctly.
-        let transformed_aabr = |aabr| rotated_aabr(scaled_aabr(aabr));
+        let transformed_aabr = |aabr| {
+            let scaled = scaled_aabr(aabr);
+            // Calculate how many displayed pixels there are for each pixel in the source
+            // image. We need this to calculate where to sample in the shader to
+            // retain crisp pixel borders when scaling the image.
+            // TODO: A bit hacky inserting this here, just to get things working initially
+            let scale = dims.map2(
+                Vec2::from(scaled.size()),
+                |screen_pixels, sample_pixels: f64| screen_pixels as f32 / sample_pixels as f32,
+            );
+            let transformed = rotated_aabr(scaled);
+            (transformed, scale)
+        };
 
         let Self {
             textures,
@@ -285,7 +304,7 @@ impl GraphicCache {
         let details = match cache_map.entry(key) {
             Entry::Occupied(details) => {
                 let details = details.get();
-                let (idx, valid, aabr) = details.info(atlases, dims);
+                let (idx, valid, aabr) = details.info(atlases, textures);
 
                 // Check if the cached version has been invalidated by replacing the underlying
                 // graphic
@@ -312,28 +331,33 @@ impl GraphicCache {
         let (image, border_color) =
             draw_graphic(graphic_map, graphic_id, dims, &mut self.keyed_jobs, pool)?;
 
+        // TODO: justify cast?
+        let image_dims = Vec2::<u32>::from(image.dimensions()).map(|e| e as u16);
+
         // Upload
         let atlas_size = atlas_size(renderer);
 
-        // Allocate space on the gpu
-        // Check size of graphic
-        // Graphics over a particular size are sent to their own textures
+        // Allocate space on the gpu.
+        //
+        // Graphics with a border color.
         let location = if let Some(border_color) = border_color {
             // Create a new immutable texture.
             let texture = create_image(renderer, image, border_color);
             // NOTE: All mutations happen only after the upload succeeds!
             let index = textures.insert(texture);
             CachedDetails::Immutable { index }
+        // Graphics over a particular size compared to the atlas size are sent
+        // to their own textures. Here we check for ones under that
+        // size.
         } else if atlas_size
-            .map2(dims, |a, d| a as f32 * ATLAS_CUTOFF_FRAC >= d as f32)
+            .map2(image_dims, |a, d| a as f32 * ATLAS_CUTOFF_FRAC >= d as f32)
             .reduce_and()
         {
             // Fit into an atlas
             let mut loc = None;
             for (atlas_idx, &mut (ref mut atlas, texture_idx)) in atlases.iter_mut().enumerate() {
-                let dims = dims.map(|e| e.max(1));
-                if let Some(rectangle) = atlas.allocate(size2(i32::from(dims.x), i32::from(dims.y)))
-                {
+                let clamped_dims = image_dims.map(|e| i32::from(e.max(1)));
+                if let Some(rectangle) = atlas.allocate(size2(clamped_dims.x, clamped_dims.y)) {
                     let aabr = aabr_from_alloc_rect(rectangle);
                     loc = Some(CachedDetails::Atlas {
                         atlas_idx,
@@ -350,9 +374,9 @@ impl GraphicCache {
                 // Create a new atlas
                 None => {
                     let (mut atlas, texture) = create_atlas_texture(renderer);
-                    let dims = dims.map(|e| e.max(1));
+                    let clamped_dims = image_dims.map(|e| i32::from(e.max(1)));
                     let aabr = atlas
-                        .allocate(size2(i32::from(dims.x), i32::from(dims.y)))
+                        .allocate(size2(clamped_dims.x, clamped_dims.y))
                         .map(aabr_from_alloc_rect)
                         .unwrap();
                     // NOTE: All mutations happen only after the texture creation succeeds!
@@ -370,7 +394,7 @@ impl GraphicCache {
         } else {
             // Create a texture just for this
             let texture = {
-                let tex = renderer.create_dynamic_texture(dims.map(|e| e as u32));
+                let tex = renderer.create_dynamic_texture(image_dims.map(u32::from));
                 let bind = renderer.ui_bind_texture(&tex);
                 (tex, bind)
             };
@@ -381,7 +405,7 @@ impl GraphicCache {
                 Aabr {
                     min: Vec2::zero(),
                     // Note texture should always match the cached dimensions
-                    max: dims,
+                    max: image_dims,
                 },
                 &textures[index].0,
                 &image,
@@ -390,7 +414,7 @@ impl GraphicCache {
         };
 
         // Extract information from cache entry.
-        let (idx, _, aabr) = location.info(atlases, dims);
+        let (idx, _, aabr) = location.info(atlases, textures);
 
         // Insert into cached map
         details.insert(location);
@@ -419,14 +443,17 @@ fn draw_graphic(
                             // Render image at requested resolution
                             // TODO: Use source aabr.
                             Graphic::Image(ref image, border_color) => Some((
-                                resize_pixel_art(
+                                /*resize_pixel_art(
                                     &image.to_rgba8(),
                                     u32::from(dims.x),
                                     u32::from(dims.y),
-                                ),
+                                ),*/
+                                image.to_rgba8(),
                                 border_color,
                             )),
                             Graphic::Voxel(ref segment, trans, sample_strat) => {
+                                // TODO: how to decide on dimensions to render voxel models to?
+                                // (with resizing being done in shaders)
                                 Some((renderer::draw_vox(segment, dims, trans, sample_strat), None))
                             },
                             Graphic::Blank => None,
@@ -498,12 +525,12 @@ fn create_image(
     let tex = renderer
         .create_texture(
             &DynamicImage::ImageRgba8(image),
-            None,
-            //TODO: either use the desktop only border color or just emulate this
+            Some(wgpu::FilterMode::Linear),
+            // TODO: either use the desktop only border color or just emulate this
             // Some(border_color.into_array().into()),
             Some(wgpu::AddressMode::ClampToBorder),
         )
-        .expect("create_texture only panics is non ImageRbga8 is passed");
+        .expect("create_texture only panics if non ImageRbga8 is passed");
     let bind = renderer.ui_bind_texture(&tex);
 
     (tex, bind)
diff --git a/voxygen/src/ui/ice/renderer/mod.rs b/voxygen/src/ui/ice/renderer/mod.rs
index b3b415741e..eb7ea61237 100644
--- a/voxygen/src/ui/ice/renderer/mod.rs
+++ b/voxygen/src/ui/ice/renderer/mod.rs
@@ -533,7 +533,7 @@ impl IcedRenderer {
                 }
 
                 // Cache graphic at particular resolution.
-                let (uv_aabr, tex_id) = match graphic_cache.cache_res(
+                let (uv_aabr, scale, tex_id) = match graphic_cache.cache_res(
                     renderer,
                     pool,
                     graphic_id,
@@ -543,7 +543,7 @@ impl IcedRenderer {
                     rotation,
                 ) {
                     // TODO: get dims from graphic_cache (or have it return floats directly)
-                    Some((aabr, tex_id)) => {
+                    Some(((aabr, scale), tex_id)) => {
                         let cache_dims = graphic_cache
                             .get_tex(tex_id)
                             .0
@@ -552,7 +552,7 @@ impl IcedRenderer {
                             .map(|e| e as f32);
                         let min = Vec2::new(aabr.min.x as f32, aabr.max.y as f32) / cache_dims;
                         let max = Vec2::new(aabr.max.x as f32, aabr.min.y as f32) / cache_dims;
-                        (Aabr { min, max }, tex_id)
+                        (Aabr { min, max }, scale, tex_id)
                     },
                     None => return,
                 };
@@ -562,7 +562,9 @@ impl IcedRenderer {
                 self.switch_state(State::Image(tex_id));
 
                 self.mesh
-                    .push_quad(create_ui_quad(gl_aabr, uv_aabr, color, UiMode::Image));
+                    .push_quad(create_ui_quad(gl_aabr, uv_aabr, color, UiMode::Image {
+                        scale,
+                    }));
             },
             Primitive::Gradient {
                 bounds,
diff --git a/voxygen/src/ui/mod.rs b/voxygen/src/ui/mod.rs
index 535ec2e3ad..c48726c025 100644
--- a/voxygen/src/ui/mod.rs
+++ b/voxygen/src/ui/mod.rs
@@ -854,7 +854,7 @@ impl Ui {
                         srgba_to_linear(color.unwrap_or(conrod_core::color::WHITE).to_fsa().into());
 
                     // Cache graphic at particular resolution.
-                    let (uv_aabr, tex_id) = match graphic_cache.cache_res(
+                    let (uv_aabr, scale, tex_id) = match graphic_cache.cache_res(
                         renderer,
                         pool,
                         *graphic_id,
@@ -863,7 +863,7 @@ impl Ui {
                         *rotation,
                     ) {
                         // TODO: get dims from graphic_cache (or have it return floats directly)
-                        Some((aabr, tex_id)) => {
+                        Some(((aabr, scale), tex_id)) => {
                             let cache_dims = graphic_cache
                                 .get_tex(tex_id)
                                 .0
@@ -872,7 +872,7 @@ impl Ui {
                                 .map(|e| e as f32);
                             let min = Vec2::new(aabr.min.x as f32, aabr.max.y as f32) / cache_dims;
                             let max = Vec2::new(aabr.max.x as f32, aabr.min.y as f32) / cache_dims;
-                            (Aabr { min, max }, tex_id)
+                            (Aabr { min, max }, scale, tex_id)
                         },
                         None => continue,
                     };
@@ -897,10 +897,10 @@ impl Ui {
 
                     mesh.push_quad(create_ui_quad(gl_aabr, uv_aabr, color, match *rotation {
                         Rotation::None | Rotation::Cw90 | Rotation::Cw180 | Rotation::Cw270 => {
-                            UiMode::Image
+                            UiMode::Image { scale }
                         },
-                        Rotation::SourceNorth => UiMode::ImageSourceNorth,
-                        Rotation::TargetNorth => UiMode::ImageTargetNorth,
+                        Rotation::SourceNorth => UiMode::ImageSourceNorth { scale },
+                        Rotation::TargetNorth => UiMode::ImageTargetNorth { scale },
                     }));
                 },
                 PrimitiveKind::Text { .. } => {

From 69a1a661b6f999e51e728e6d53f37f971effdce6 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Tue, 30 Aug 2022 03:02:12 -0400
Subject: [PATCH 02/14] fixes and tweaks (various minor changes related to UI
 image scaling on GPU)

---
 assets/voxygen/shaders/ui-frag.glsl | 26 +++++++++++++++++---------
 voxygen/src/ui/graphic/mod.rs       |  3 +--
 voxygen/src/ui/graphic/pixel_art.rs |  9 ++++++---
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/assets/voxygen/shaders/ui-frag.glsl b/assets/voxygen/shaders/ui-frag.glsl
index c1f7721580..088e1bbb6a 100644
--- a/assets/voxygen/shaders/ui-frag.glsl
+++ b/assets/voxygen/shaders/ui-frag.glsl
@@ -71,9 +71,10 @@ void downscale_params(float pos, float scale, out vec2 weights, out vec2 offsets
     // 4 pixels (within a single dimension) in the sampled texture. So we can't
     // perfectly compute the contribution of each covered pixel in the sampled
     // texture with only 2 samples (along each dimension). Thus, we fallback to
-    // an imperfect technique of of just sampling a 1 pixel length from the
-    // center on each side. An alternative would be to pre-compute mipmap
-    // levels that could be sampled from.
+    // an imperfect technique of just sampling a 1 pixel length from the center
+    // on each side of the nearest pixel edge. An alternative might be to
+    // pre-compute mipmap levels that could be sampled from, although this
+    // could interact poorly with the atlas.
     if (scale > (1.0 / 3.0)) {
         // Width of the fragment in terms of pixels in the sampled texture.
         float width = 1.0 / scale;
@@ -87,7 +88,7 @@ void downscale_params(float pos, float scale, out vec2 weights, out vec2 offsets
         offsets = vec2(split) + vec2(-right_sample_offset, left_sample_offset);
         weights = vec2(right_len, left_len) / width;
     } else {
-        offsets = pos + vec2(-1.0, 1.0);
+        offsets = round(pos) + vec2(-1.0, 1.0);
         // We split in the middle so weights for both sides are the same.
         weights = vec2(0.5);
     }
@@ -140,10 +141,6 @@ vec4 downscale_xy(vec2 uv_pixel, vec2 scale) {
     vec2 uv1 = vec2(offsets_x[1], offsets_y[0]) / texture_size;
     vec2 uv2 = vec2(offsets_x[0], offsets_y[1]) / texture_size;
     vec2 uv3 = vec2(offsets_x[1], offsets_y[1]) / texture_size;
-    uv0 = uv_pixel / texture_size;
-    uv1 = uv_pixel / texture_size;
-    uv2 = uv_pixel / texture_size;
-    uv3 = uv_pixel / texture_size;
     vec4 s0 = textureLod(sampler2D(t_tex, s_tex), uv0, 0);
     vec4 s1 = textureLod(sampler2D(t_tex, s_tex), uv1, 0);
     vec4 s2 = textureLod(sampler2D(t_tex, s_tex), uv2, 0);
@@ -189,7 +186,7 @@ void main() {
         
         // Convert to sampled pixel coordinates.
         vec2 uv_pixel = f_uv * texture_size;
-        vec4 image_color = vec4(1.0, 0, 0, 1);
+        vec4 image_color;
         #ifdef EXPERIMENTAL_UINEARESTSCALING
             vec2 uv = (floor(uv_pixel) + 0.5) / texture_size;
             image_color = textureLod(sampler2D(t_tex, s_tex), uv, 0);
@@ -209,6 +206,17 @@ void main() {
             }
         #endif
 
+        // un-premultiply alpha (TODO: we pretend all input images are
+        // premultiplied for now although they aren't all necessarily)
+        if (image_color.a > 0.001) {
+            image_color.rgb /= image_color.a;
+        } 
+
+        // TEMP: Use this to make map a solid color
+        if (texture_size.x != 1600) {
+            //image_color = vec4(0, 0, 1, 1);
+        }
+
         tgt_color = f_color * image_color;
     // 2D Geometry
     } else if (f_mode == uint(2)) {
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 859c81b084..6a9dbfd1b6 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -11,7 +11,6 @@ use common::{figure::Segment, slowjob::SlowJobPool};
 use guillotiere::{size2, SimpleAtlasAllocator};
 use hashbrown::{hash_map::Entry, HashMap};
 use image::{DynamicImage, RgbaImage};
-use pixel_art::resize_pixel_art;
 use slab::Slab;
 use std::{hash::Hash, sync::Arc};
 use tracing::warn;
@@ -184,7 +183,7 @@ impl GraphicCache {
 
         // Remove from caches
         // Maybe make this more efficient if replace graphic is used more often
-        self.cache_map.retain(|&(key_id, _key_dims), details| {
+        self.cache_map.retain(|&key_id, details| {
             // If the entry does not reference id, or it does but we can successfully
             // invalidate, retain the entry; otherwise, discard this entry completely.
             key_id != id
diff --git a/voxygen/src/ui/graphic/pixel_art.rs b/voxygen/src/ui/graphic/pixel_art.rs
index 6ced1a620c..a94034648a 100644
--- a/voxygen/src/ui/graphic/pixel_art.rs
+++ b/voxygen/src/ui/graphic/pixel_art.rs
@@ -11,10 +11,13 @@ const EPSILON: f32 = 0.0001;
 
 // Averaging colors with alpha such that when blending with the background color
 // the same color will be produced as when the individual colors were blended
-// with the background and then averaged
+// with the background and then averaged.
+//
 // Say we have two areas that we are combining to form a single pixel
 // A1 and A2 where these are the fraction of the area of the pixel each color
-// contributes to Then if the colors were opaque we would say that the final
+// contributes to.
+//
+// Then if the colors were opaque we would say that the final
 // color output color o3 is
 //     E1: o3 = A1 * o1 + A2 * o2
 // where o1 and o2 are the opaque colors of the two areas
@@ -30,7 +33,7 @@ const EPSILON: f32 = 0.0001;
 //     E6: c3 * a3 = A1 * c1 * a1 + A2 * c2 * a2
 //     E7: b * (1 - a3) = A1 * b * (1 - a1) + A2 * b * (1 - a2)
 // dropping b from E7 and solving for a3
-//     E8: a3 = 1 - A1 * (1 - a1) + A2 * (1 - a2)
+//     E8: a3 = 1 - A1 * (1 - a1) - A2 * (1 - a2)
 // we can now calculate the combined alpha value
 // and E6 can then be solved for c3
 //     E9: c3 = (A1 * c1 * a1 + A2 * c2 * a2) / a3

From eb6d16b02caf38fa86f3a0f693dc9ba3c286b01f Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Fri, 2 Sep 2022 21:34:59 -0400
Subject: [PATCH 03/14] Rename draw_graphic -> prepare_graphic since it is no
 longer always specifically rendering an image at the provided dimensions.
 Start working on alpha premultipication which is necessary for being able to
 properly sample colors on the GPU with bilinear filtering. Various tweaks to
 comments and misc changes.

---
 assets/voxygen/shaders/ui-frag.glsl |  11 +-
 voxygen/src/ui/graphic/mod.rs       | 228 ++++++++++++++++++----------
 voxygen/src/ui/graphic/renderer.rs  |   2 +-
 3 files changed, 154 insertions(+), 87 deletions(-)

diff --git a/assets/voxygen/shaders/ui-frag.glsl b/assets/voxygen/shaders/ui-frag.glsl
index 088e1bbb6a..ee04285033 100644
--- a/assets/voxygen/shaders/ui-frag.glsl
+++ b/assets/voxygen/shaders/ui-frag.glsl
@@ -148,7 +148,7 @@ vec4 downscale_xy(vec2 uv_pixel, vec2 scale) {
     vec4 s01 = s0 * weights_x[0] + s1 * weights_x[1];
     vec4 s23 = s2 * weights_x[0] + s3 * weights_x[1];
     // Useful to visualize things below the limit where downscaling is supposed
-    // to be accurate.
+    // to be perfectly accurate.
     /*if (scale.x < (1.0 / 3.0)) {
         return vec4(1, 0, 0, 1);
     }*/
@@ -206,17 +206,12 @@ void main() {
             }
         #endif
 
-        // un-premultiply alpha (TODO: we pretend all input images are
-        // premultiplied for now although they aren't all necessarily)
+        // un-premultiply alpha (linear filtering above requires alpha to be
+        // pre-multiplied)
         if (image_color.a > 0.001) {
             image_color.rgb /= image_color.a;
         } 
 
-        // TEMP: Use this to make map a solid color
-        if (texture_size.x != 1600) {
-            //image_color = vec4(0, 0, 1, 1);
-        }
-
         tgt_color = f_color * image_color;
     // 2D Geometry
     } else if (f_mode == uint(2)) {
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 6a9dbfd1b6..c6426b4221 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -13,7 +13,7 @@ use hashbrown::{hash_map::Entry, HashMap};
 use image::{DynamicImage, RgbaImage};
 use slab::Slab;
 use std::{hash::Hash, sync::Arc};
-use tracing::warn;
+use tracing::{error, warn};
 use vek::*;
 
 #[derive(Clone)]
@@ -25,6 +25,7 @@ pub enum Graphic {
     /// non-square (meaning if we want to display the whole map and render to a
     /// square, we may render out of bounds unless we perform proper
     /// clipping).
+    // TODO: probably convert this type to `RgbaImage`.
     Image(Arc<DynamicImage>, Option<Rgba<f32>>),
     // Note: none of the users keep this Arc currently
     Voxel(Arc<Segment>, Transform, SampleStrat),
@@ -60,9 +61,6 @@ pub struct Id(u32);
 #[derive(PartialEq, Eq, Hash, Copy, Clone)]
 pub struct TexId(usize);
 
-// TODO replace with slab/slotmap
-type GraphicMap = HashMap<Id, Graphic>;
-
 enum CachedDetails {
     Atlas {
         // Index of the atlas this is cached in
@@ -93,6 +91,8 @@ impl CachedDetails {
         atlases: &[(SimpleAtlasAllocator, usize)],
         textures: &Slab<(Texture, UiTextureBindGroup)>,
     ) -> (usize, bool, Aabr<u16>) {
+        // NOTE: We don't accept images larger than u16::MAX (rejected in `cache_res`)
+        // (and probably would not be able to create a texture this large).
         match *self {
             CachedDetails::Atlas {
                 atlas_idx,
@@ -103,7 +103,6 @@ impl CachedDetails {
                 (index, valid, Aabr {
                     min: Vec2::zero(),
                     // Note texture should always match the cached dimensions
-                    // TODO: Justify cast here?
                     max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
                 })
             },
@@ -111,7 +110,6 @@ impl CachedDetails {
                 (index, true, Aabr {
                     min: Vec2::zero(),
                     // Note texture should always match the cached dimensions
-                    // TODO: Justify cast here?
                     max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
                 })
             },
@@ -139,17 +137,22 @@ impl CachedDetails {
 // Caches graphics, only deallocates when changing screen resolution (completely
 // cleared)
 pub struct GraphicCache {
-    graphic_map: GraphicMap,
-    // Next id to use when a new graphic is added
+    // TODO replace with slotmap
+    graphic_map: HashMap<Id, Graphic>,
+    /// Next id to use when a new graphic is added
     next_id: u32,
 
-    // Atlases with the index of their texture in the textures vec
+    /// Atlases with the index of their texture in the textures vec
     atlases: Vec<(SimpleAtlasAllocator, usize)>,
     textures: Slab<(Texture, UiTextureBindGroup)>,
-    // Stores the location of graphics rendered at a particular resolution and cached on the cpu
-    cache_map: HashMap<Id, CachedDetails>,
+    /// The location and details of graphics cached on the GPU.
+    ///
+    /// Graphic::Voxel images include the dimensions they were rasterized at in
+    /// the key. Other images are scaled as part of sampling them on the
+    /// GPU.
+    cache_map: HashMap<(Id, Option<Vec2<u16>>), CachedDetails>,
 
-    keyed_jobs: KeyedJobs<(Id, Vec2<u16>), Option<(RgbaImage, Option<Rgba<f32>>)>>,
+    keyed_jobs: KeyedJobs<(Id, Option<Vec2<u16>>), (RgbaImage, Option<Rgba<f32>>)>,
 }
 impl GraphicCache {
     pub fn new(renderer: &mut Renderer) -> Self {
@@ -183,7 +186,7 @@ impl GraphicCache {
 
         // Remove from caches
         // Maybe make this more efficient if replace graphic is used more often
-        self.cache_map.retain(|&key_id, details| {
+        self.cache_map.retain(|&(key_id, _), details| {
             // If the entry does not reference id, or it does but we can successfully
             // invalidate, retain the entry; otherwise, discard this entry completely.
             key_id != id
@@ -210,6 +213,8 @@ impl GraphicCache {
                     use common::vol::SizedVol;
                     let size = segment.size();
                     // TODO: HACK because they can be rotated arbitrarily, remove
+                    // (and they can be rasterized at arbitrary resolution)
+                    // (might need to return None here?)
                     Some((size.x, size.z))
                 },
                 Graphic::Blank => None,
@@ -238,22 +243,21 @@ impl GraphicCache {
         pool: Option<&SlowJobPool>,
         graphic_id: Id,
         // TODO: if we aren't resizing here we can upload image earlier... (as long as this doesn't
-        // lead to uploading too much unused stuff). (cache_res name invalid)
-        dims: Vec2<u16>,
+        // lead to uploading too much unused stuff).
+        requested_dims: Vec2<u16>,
         source: Aabr<f64>,
         rotation: Rotation,
     ) -> Option<((Aabr<f64>, Vec2<f32>), TexId)> {
-        let dims = match rotation {
-            // The image is placed into the atlas with no rotation, so we need to swap the
-            // dimensions here to get the resolution that the image will be displayed at but
-            // re-oriented into the "upright" space that the image is stored in and sampled from
-            // (this can be bit confusing initially / hard to explain).
-            Rotation::Cw90 | Rotation::Cw270 => Vec2::new(dims.y, dims.x),
-            Rotation::None | Rotation::Cw180 => dims,
-            Rotation::SourceNorth => dims,
-            Rotation::TargetNorth => dims,
+        let requested_dims_upright = match rotation {
+            // The image is stored on the GPU with no rotation, so we need to swap the dimensions
+            // here to get the resolution that the image will be displayed at but re-oriented into
+            // the "upright" space that the image is stored in and sampled from (this can be bit
+            // confusing initially / hard to explain).
+            Rotation::Cw90 | Rotation::Cw270 => requested_dims.yx(),
+            Rotation::None | Rotation::Cw180 => requested_dims,
+            Rotation::SourceNorth => requested_dims,
+            Rotation::TargetNorth => requested_dims,
         };
-        let key = graphic_id;
 
         // Rotate aabr according to requested rotation.
         let rotated_aabr = |Aabr { min, max }| match rotation {
@@ -283,8 +287,8 @@ impl GraphicCache {
             // Calculate how many displayed pixels there are for each pixel in the source
             // image. We need this to calculate where to sample in the shader to
             // retain crisp pixel borders when scaling the image.
-            // TODO: A bit hacky inserting this here, just to get things working initially
-            let scale = dims.map2(
+            // S-TODO: A bit hacky inserting this here, just to get things working initially
+            let scale = requested_dims_upright.map2(
                 Vec2::from(scaled.size()),
                 |screen_pixels, sample_pixels: f64| screen_pixels as f32 / sample_pixels as f32,
             );
@@ -300,6 +304,25 @@ impl GraphicCache {
             ..
         } = self;
 
+        let graphic = match graphic_map.get(&graphic_id) {
+            Some(g) => g,
+            None => {
+                warn!(
+                    ?graphic_id,
+                    "A graphic was requested via an id which is not in use"
+                );
+                return None;
+            },
+        };
+
+        let key = (
+            graphic_id,
+            // Dimensions only included in the key for voxel graphics which we rasterize at the
+            // size that they will be displayed at (other images are scaled when sampling them on
+            // the GPU).
+            matches!(graphic, Graphic::Voxel { .. }).then(|| requested_dims_upright),
+        );
+
         let details = match cache_map.entry(key) {
             Entry::Occupied(details) => {
                 let details = details.get();
@@ -309,8 +332,13 @@ impl GraphicCache {
                 // graphic
                 if !valid {
                     // Create image
-                    let (image, border) =
-                        draw_graphic(graphic_map, graphic_id, dims, &mut self.keyed_jobs, pool)?;
+                    let (image, border) = prepare_graphic(
+                        graphic,
+                        graphic_id,
+                        requested_dims_upright,
+                        &mut self.keyed_jobs,
+                        pool,
+                    )?;
                     // If the cache location is invalid, we know the underlying texture is mutable,
                     // so we should be able to replace the graphic.  However, we still want to make
                     // sure that we are not reusing textures for images that specify a border
@@ -325,13 +353,31 @@ impl GraphicCache {
             Entry::Vacant(details) => details,
         };
 
-        // Construct image in a threadpool
+        // Construct image in an optional threadpool.
+        let (image, border_color) = prepare_graphic(
+            graphic,
+            graphic_id,
+            requested_dims_upright,
+            &mut self.keyed_jobs,
+            pool,
+        )?;
 
-        let (image, border_color) =
-            draw_graphic(graphic_map, graphic_id, dims, &mut self.keyed_jobs, pool)?;
-
-        // TODO: justify cast?
-        let image_dims = Vec2::<u32>::from(image.dimensions()).map(|e| e as u16);
+        // Image sizes over u16::MAX are not supported (and we would probably not be
+        // able to create a texture large enough to hold them on the GPU anyway)!
+        let image_dims = match {
+            let (x, y) = image.dimensions();
+            (u16::try_from(x), u16::try_from(y))
+        } {
+            (Ok(x), Ok(y)) => Vec2::new(x, y),
+            _ => {
+                error!(
+                    "Image dimensions greater than u16::MAX are not supported! Supplied image \
+                     size: {:?}.",
+                    image.dimensions()
+                );
+                return None;
+            },
+        };
 
         // Upload
         let atlas_size = atlas_size(renderer);
@@ -422,52 +468,52 @@ impl GraphicCache {
     }
 }
 
-// Draw a graphic at the specified dimensions
-fn draw_graphic(
-    graphic_map: &GraphicMap,
+/// Prepare the graphic into the form that will be uploaded to the GPU.
+///
+/// For voxel graphics, draws the graphic at the specified dimensions.
+///
+/// Also pre-multiplies alpha in images so they can be linearly filtered on the
+/// GPU.
+fn prepare_graphic(
+    graphic: &Graphic,
     graphic_id: Id,
     dims: Vec2<u16>,
-    keyed_jobs: &mut KeyedJobs<(Id, Vec2<u16>), Option<(RgbaImage, Option<Rgba<f32>>)>>,
+    keyed_jobs: &mut KeyedJobs<(Id, Option<Vec2<u16>>), (RgbaImage, Option<Rgba<f32>>)>,
     pool: Option<&SlowJobPool>,
 ) -> Option<(RgbaImage, Option<Rgba<f32>>)> {
-    match graphic_map.get(&graphic_id) {
+    match graphic {
         // Short-circuit spawning a job on the threadpool for blank graphics
-        Some(Graphic::Blank) => None,
-        Some(inner) => {
-            keyed_jobs
-                .spawn(pool, (graphic_id, dims), || {
-                    let inner = inner.clone();
-                    move |_| {
-                        match inner {
-                            // Render image at requested resolution
-                            // TODO: Use source aabr.
-                            Graphic::Image(ref image, border_color) => Some((
-                                /*resize_pixel_art(
-                                    &image.to_rgba8(),
-                                    u32::from(dims.x),
-                                    u32::from(dims.y),
-                                ),*/
-                                image.to_rgba8(),
-                                border_color,
-                            )),
-                            Graphic::Voxel(ref segment, trans, sample_strat) => {
-                                // TODO: how to decide on dimensions to render voxel models to?
-                                // (with resizing being done in shaders)
-                                Some((renderer::draw_vox(segment, dims, trans, sample_strat), None))
-                            },
-                            Graphic::Blank => None,
-                        }
-                    }
-                })
-                .and_then(|(_, v)| v)
-        },
-        None => {
-            warn!(
-                ?graphic_id,
-                "A graphic was requested via an id which is not in use"
-            );
-            None
-        },
+        Graphic::Blank => None,
+        // Dimensions are only included in the key for Graphic::Voxel since otherwise we will
+        // resize on the GPU.
+        Graphic::Image(image, border_color) => keyed_jobs
+            .spawn(pool, (graphic_id, None), || {
+                let image = Arc::clone(image);
+                let border_color = *border_color;
+                move |_| {
+                    // Image will be rescaled when sampling from it on the GPU so we don't
+                    // need to resize it here.
+                    let mut image = image.to_rgba8();
+                    // TODO: could potentially do this when loading the image and for voxel
+                    // images maybe at some point in the `draw_vox` processing. Or we could
+                    // push it in the other direction and do conversion on the GPU.
+                    premultiply_alpha(&mut image);
+                    (image, border_color)
+                }
+            })
+            .map(|(_, v)| v),
+        Graphic::Voxel(segment, trans, sample_strat) => keyed_jobs
+            .spawn(pool, (graphic_id, Some(dims)), || {
+                let segment = Arc::clone(segment);
+                let (trans, sample_strat) = (*trans, *sample_strat);
+                move |_| {
+                    // Render voxel model at requested resolution
+                    let mut image = renderer::draw_vox(&segment, dims, trans, sample_strat);
+                    premultiply_alpha(&mut image);
+                    (image, None)
+                }
+            })
+            .map(|(_, v)| v),
     }
 }
 
@@ -483,7 +529,7 @@ fn create_atlas_texture(
     renderer: &mut Renderer,
 ) -> (SimpleAtlasAllocator, (Texture, UiTextureBindGroup)) {
     let size = atlas_size(renderer);
-    // Note: here we assume the atlas size is under i32::MAX
+    // Note: here we assume the max texture size is under i32::MAX.
     let atlas = SimpleAtlasAllocator::new(size2(size.x as i32, size.y as i32));
     let texture = {
         let tex = renderer.create_dynamic_texture(size);
@@ -496,6 +542,8 @@ fn create_atlas_texture(
 
 fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
     let (min, max) = (rect.min, rect.max);
+    // Note: here we assume the max texture size (and thus the maximum size of the
+    // atlas) is under `u16::MAX`.
     Aabr {
         min: Vec2::new(min.x as u16, min.y as u16),
         max: Vec2::new(max.x as u16, max.y as u16),
@@ -503,7 +551,7 @@ fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
 }
 
 fn upload_image(renderer: &mut Renderer, aabr: Aabr<u16>, tex: &Texture, image: &RgbaImage) {
-    let aabr = aabr.map(|e| e as u32);
+    let aabr = aabr.map(u32::from);
     let offset = aabr.min.into_array();
     let size = aabr.size().into_array();
     renderer.update_texture(
@@ -534,3 +582,27 @@ fn create_image(
 
     (tex, bind)
 }
+
+fn premultiply_alpha(image: &mut RgbaImage) {
+    // S-TODO: temp remove me
+    // TODO: check with minimap
+    // TODO: log image size
+    // TODO: benchmark
+    common_base::prof_span!("premultiply_alpha");
+    tracing::error!("{:?}", image.dimensions());
+    use common::util::{linear_to_srgba, srgba_to_linear};
+    image.pixels_mut().for_each(|pixel| {
+        let alpha = pixel.0[3];
+        if alpha == 0 && pixel.0 != [0; 4] {
+            pixel.0 = [0; 4];
+        } else if alpha != 255 {
+            // Convert to linear, multiply color components by alpha, and convert back to
+            // non-linear.
+            let linear = srgba_to_linear(Rgba::from(pixel.0).map(|e: u8| e as f32 / 255.0));
+            let premultiplied = Rgba::from_translucent(Rgb::from(linear) * linear.a, linear.a);
+            pixel.0 = linear_to_srgba(premultiplied)
+                .map(|e| (e * 255.0) as u8)
+                .into_array();
+        }
+    })
+}
diff --git a/voxygen/src/ui/graphic/renderer.rs b/voxygen/src/ui/graphic/renderer.rs
index 05e4275bce..1f519153bc 100644
--- a/voxygen/src/ui/graphic/renderer.rs
+++ b/voxygen/src/ui/graphic/renderer.rs
@@ -14,7 +14,7 @@ pub enum SampleStrat {
     PixelCoverage,
 }
 
-#[derive(Clone)]
+#[derive(Clone, Copy)]
 pub struct Transform {
     pub ori: Quaternion<f32>,
     pub offset: Vec3<f32>,

From f62c2cde709dfd887e5fa9e8975cad1e94adbcf3 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Sat, 3 Sep 2022 16:21:50 -0400
Subject: [PATCH 04/14] Use fast-srgb8 crate to efficiently convert between
 non-linear srgb u8 and linear f32 values for performing alpha
 premultiplication on the CPU.

---
 Cargo.lock                    |  7 +++++++
 voxygen/Cargo.toml            |  1 +
 voxygen/src/lib.rs            |  1 +
 voxygen/src/ui/graphic/mod.rs | 31 +++++++++++++++++++++----------
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ae40df69e3..59d38d0088 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1959,6 +1959,12 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
 
+[[package]]
+name = "fast-srgb8"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd2e7510819d6fbf51a5545c8f922716ecfb14df168a3242f7d33e0239efe6a1"
+
 [[package]]
 name = "fastrand"
 version = "1.8.0"
@@ -7008,6 +7014,7 @@ dependencies = [
  "enum-iterator 1.1.3",
  "etagere",
  "euc",
+ "fast-srgb8",
  "gilrs",
  "glyph_brush",
  "guillotiere",
diff --git a/voxygen/Cargo.toml b/voxygen/Cargo.toml
index c630cfc619..0b3d7cd63a 100644
--- a/voxygen/Cargo.toml
+++ b/voxygen/Cargo.toml
@@ -134,6 +134,7 @@ num_cpus = "1.0"
 # vec_map = { version = "0.8.2" }
 inline_tweak = "1.0.2"
 itertools = "0.10.0"
+fast-srgb8 = "1.0.0"
 
 # Tracy
 tracing = "0.1"
diff --git a/voxygen/src/lib.rs b/voxygen/src/lib.rs
index 9f157c2529..53fe362a2b 100644
--- a/voxygen/src/lib.rs
+++ b/voxygen/src/lib.rs
@@ -5,6 +5,7 @@
 #![feature(
     array_methods,
     array_zip,
+    array_from_fn,
     drain_filter,
     once_cell,
     trait_alias,
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index c6426b4221..1c37b89e20 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -584,25 +584,36 @@ fn create_image(
 }
 
 fn premultiply_alpha(image: &mut RgbaImage) {
+    use fast_srgb8::{f32x4_to_srgb8, srgb8_to_f32};
     // S-TODO: temp remove me
-    // TODO: check with minimap
-    // TODO: log image size
-    // TODO: benchmark
-    common_base::prof_span!("premultiply_alpha");
+    // TODO: benchmark (29 ns per pixel)
     tracing::error!("{:?}", image.dimensions());
+    common_base::prof_span!("premultiply_alpha");
     use common::util::{linear_to_srgba, srgba_to_linear};
     image.pixels_mut().for_each(|pixel| {
         let alpha = pixel.0[3];
-        if alpha == 0 && pixel.0 != [0; 4] {
+        // With fast path checks, longest image was 16 ms with current assets.
+        // Without longest is 60 ms. (but not the same image!)
+        if alpha == 0 {
             pixel.0 = [0; 4];
         } else if alpha != 255 {
             // Convert to linear, multiply color components by alpha, and convert back to
             // non-linear.
-            let linear = srgba_to_linear(Rgba::from(pixel.0).map(|e: u8| e as f32 / 255.0));
-            let premultiplied = Rgba::from_translucent(Rgb::from(linear) * linear.a, linear.a);
-            pixel.0 = linear_to_srgba(premultiplied)
-                .map(|e| (e * 255.0) as u8)
-                .into_array();
+            let linear = Rgba::new(
+                srgb8_to_f32(pixel.0[0]),
+                srgb8_to_f32(pixel.0[1]),
+                srgb8_to_f32(pixel.0[2]),
+                alpha as f32 / 255.0,
+            );
+            let converted = fast_srgb8::f32x4_to_srgb8([
+                linear.r * linear.a,
+                linear.g * linear.a,
+                linear.b * linear.a,
+                0.0,
+            ]);
+            pixel.0[0] = converted[0];
+            pixel.0[1] = converted[1];
+            pixel.0[2] = converted[2];
         }
     })
 }

From 7538b04348ea1f54cf905e7d4f0fd7783ac49928 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Sun, 4 Sep 2022 12:47:10 -0400
Subject: [PATCH 05/14] Operate on 4 pixels at a time when premultiplying alpha
 to speed things up a little.

---
 voxygen/src/lib.rs            |  1 +
 voxygen/src/ui/graphic/mod.rs | 74 ++++++++++++++++++++++-------------
 2 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/voxygen/src/lib.rs b/voxygen/src/lib.rs
index 53fe362a2b..4c0a525091 100644
--- a/voxygen/src/lib.rs
+++ b/voxygen/src/lib.rs
@@ -13,6 +13,7 @@
     map_try_insert,
     slice_as_chunks,
     let_chains
+    portable_simd
 )]
 #![recursion_limit = "2048"]
 
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 1c37b89e20..06db5fc6a0 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -585,35 +585,55 @@ fn create_image(
 
 fn premultiply_alpha(image: &mut RgbaImage) {
     use fast_srgb8::{f32x4_to_srgb8, srgb8_to_f32};
-    // S-TODO: temp remove me
-    // TODO: benchmark (29 ns per pixel)
-    tracing::error!("{:?}", image.dimensions());
-    common_base::prof_span!("premultiply_alpha");
-    use common::util::{linear_to_srgba, srgba_to_linear};
-    image.pixels_mut().for_each(|pixel| {
-        let alpha = pixel.0[3];
-        // With fast path checks, longest image was 16 ms with current assets.
-        // Without longest is 60 ms. (but not the same image!)
+    // TODO: Apparently it is possible for ImageBuffer raw vec to have more pixels
+    // than the dimensions of the actual image (I don't think we actually have
+    // this occuring but we should probably fix other spots that use the raw
+    // buffer). See:
+    // https://github.com/image-rs/image/blob/a1ce569afd476e881acafdf9e7a5bce294d0db9a/src/buffer.rs#L664
+    let dims = image.dimensions();
+    let image_buffer_len = dims.0 as usize * dims.1 as usize * 4;
+    let (arrays, end) = image[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
+    // Rgba8 has 4 bytes per pixel they should be no remainder when dividing by 4.
+    let (end, _) = end.as_chunks_mut::<4>();
+    end.iter_mut().for_each(|pixel| {
+        let alpha = pixel[3];
         if alpha == 0 {
-            pixel.0 = [0; 4];
+            *pixel = [0; 4];
         } else if alpha != 255 {
-            // Convert to linear, multiply color components by alpha, and convert back to
-            // non-linear.
-            let linear = Rgba::new(
-                srgb8_to_f32(pixel.0[0]),
-                srgb8_to_f32(pixel.0[1]),
-                srgb8_to_f32(pixel.0[2]),
-                alpha as f32 / 255.0,
-            );
-            let converted = fast_srgb8::f32x4_to_srgb8([
-                linear.r * linear.a,
-                linear.g * linear.a,
-                linear.b * linear.a,
-                0.0,
-            ]);
-            pixel.0[0] = converted[0];
-            pixel.0[1] = converted[1];
-            pixel.0[2] = converted[2];
+            let linear_alpha = alpha as f32 / 255.0;
+            let [r, g, b] = core::array::from_fn(|i| srgb8_to_f32(pixel[i]) * linear_alpha);
+            let srgb8 = f32x4_to_srgb8([r, g, b, 0.0]);
+            (pixel[0], pixel[1], pixel[3]) = (srgb8[0], srgb8[1], srgb8[3]);
+        }
+    });
+    arrays.iter_mut().for_each(|pixelx4| {
+        use core::simd::{f32x4, u8x4, Simd};
+        let alpha = Simd::from_array([pixelx4[3], pixelx4[7], pixelx4[11], pixelx4[15]]);
+        if alpha == Simd::splat(0) {
+            *pixelx4 = [0; 16];
+        } else if alpha != Simd::splat(255) {
+            let linear_simd = |array: [u8; 4]| Simd::from_array(array.map(srgb8_to_f32));
+            // Pack rgb components from the 4th pixel into the the last position for each of
+            // the other 3 pixels.
+            let a = linear_simd([pixelx4[0], pixelx4[1], pixelx4[2], pixelx4[12]]);
+            let b = linear_simd([pixelx4[4], pixelx4[5], pixelx4[6], pixelx4[13]]);
+            let c = linear_simd([pixelx4[8], pixelx4[9], pixelx4[10], pixelx4[14]]);
+            let linear_alpha = alpha.cast::<f32>() * Simd::splat(1.0 / 255.0);
+
+            // Multiply by alpha and then convert back into srgb8.
+            let premultiply = |x: f32x4, i| {
+                let mut a = f32x4::splat(linear_alpha[i]);
+                a[3] = linear_alpha[3];
+                u8x4::from_array(f32x4_to_srgb8((x * a).to_array()))
+            };
+            let pa = premultiply(a, 0);
+            let pb = premultiply(b, 1);
+            let pc = premultiply(c, 2);
+
+            (pixelx4[0], pixelx4[1], pixelx4[2]) = (pa[0], pa[1], pa[2]);
+            (pixelx4[4], pixelx4[5], pixelx4[6]) = (pb[0], pb[1], pb[2]);
+            (pixelx4[8], pixelx4[9], pixelx4[10]) = (pc[0], pc[1], pc[2]);
+            (pixelx4[12], pixelx4[13], pixelx4[14]) = (pa[3], pb[3], pc[3]);
         }
     })
 }

From 1d51aae3b23ead32e3c6e167fbfb50f5d2fac66f Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Thu, 8 Sep 2022 01:04:41 -0400
Subject: [PATCH 06/14] Start attempt to premultiply alpha on the GPU.

---
 .../shaders/premultiply-alpha-compute.glsl    | 36 ++++++++++++++++
 voxygen/src/render/pipelines/ui.rs            | 41 +++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 assets/voxygen/shaders/premultiply-alpha-compute.glsl

diff --git a/assets/voxygen/shaders/premultiply-alpha-compute.glsl b/assets/voxygen/shaders/premultiply-alpha-compute.glsl
new file mode 100644
index 0000000000..36d11c2f30
--- /dev/null
+++ b/assets/voxygen/shaders/premultiply-alpha-compute.glsl
@@ -0,0 +1,36 @@
+#version 420 core
+
+// TODO: should we modify this based on the current device?
+// TODO: would it be better to have 2D workgroup for writing to a local area in the target image? 
+layout(local_size_x = 256) in;
+
+// TODO: writing all images into a single buffer?
+layout(set = 0, binding = 0) readonly buffer InputImage {
+    uint input_pixels[];
+};
+
+layout (std140, set = 0, binding = 1)
+uniform u_locals {
+    // Size of the input image.
+    uvec2 image_size;
+    // Offset to place the transformed input image at in the target
+    // image.
+    uvec2 target_offset;
+};
+
+layout(rgba8, set = 0, binding = 2) uniform writeonly image2D target_image;
+
+void main() {
+    uint global_id = gl_GlobalInvocationId.x;
+    uvec2 src_pixel_pos = uvec2(global_id % image_size.x, global_id / image_size.x);
+    // Otherwise this is is an out of bounds compute instance.
+    if (src_pixel_pos < image_size.y) {
+        uint pixel = input_pixels[global_id]; 
+        vec4 nonlinear = vec4((pixel >> 16) & 0xFFu, (pixel >> 8) & 0xFFu, (pixel >> 8) & 0xFFu, pixel & 0xFFu);
+        vec4 linear;
+        vec4 premultiplied_linear;
+        vec4 premultiplied_nonlinear;
+        // No free srgb with image store operations https://www.khronos.org/opengl/wiki/Image_Load_Store#Format_compatibility
+        imageStore(target_image, src_pixel_pos + target_offset, premultiplied_nonlinear);
+    }
+}
diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index a741020103..1bc3c7589a 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -394,3 +394,44 @@ pub fn create_tri(
         v([tri[2][0], tri[2][1]], [uv_tri[2][0], uv_tri[2][1]]),
     )
 }
+
+// Steps:
+// 1. Upload new image via `Device::create_buffer_init`, with `MAP_WRITE` flag
+//    to avoid staging buffer.
+// 2. Run compute pipeline to multiply by alpha reading from this buffer and
+//    writing to the final texture (this may be in an atlas or an independent
+//    texture if the image is over a certain size threshold).
+//
+// Info needed in compute shader:
+// * source buffer
+// * target texture
+// * image dimensions
+// * position in the target texture
+// (what is the overhead of compute call? at some point we may be better off
+// converting small images on the cpu)
+pub struct PremultiplyAlphaPipeline {
+    pub pipeline: wgpu::RenderPipeline,
+}
+
+impl PremultiplyAlphaPipeline {
+    pub fn new(
+        device: &wgpu::Device,
+        module: &wgpu::ShaderModule,
+        layout: &PremultiplAlphaLayout,
+    ) -> Self {
+        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: Some("Premultiply alpha pipeline layout"),
+            push_constant_ranges: &[],
+            bind_group_layouts: &[layout],
+        });
+
+        let pipeline = device.create_compute_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: Some("Premultiply alpha pipeline"),
+            layout: Some(&pipeline_layout),
+            module,
+            entry_point: "main",
+        });
+
+        Self { pipeline }
+    }
+}

From efd932c71e6ac0d4a2483dbffc34e24051b15fd0 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Tue, 25 Oct 2022 00:31:35 -0400
Subject: [PATCH 07/14] Progress on GPU premultiplication.

* General progress in setting up code paths to support GPU
  premultiplication.
* Created `PremultiplyUpload` type to represent an initiated image
  upload where the premultiply pass needs to be ran to complete it.
* Converted from compute pass to render pass since current limitations
  make it difficult to write directly to a srgb image from a compute
  shader.
* Replace `CachedDetails::Immutable` with keeping track of the
  parameters used to create the texture (i.e. the border color).
* Create `TextureRequirements`, `TextureParamters`, and `CacheKey` types
  to encode parameters that go into texture creation and image caching
  and to determine when the space in texture memory should be reused
  when replacing a graphic.
* Add custom texture creation logic for the UI textures since those need
  certain usage combinations.
---
 assets/voxygen/shaders/include/srgb.glsl      |  10 +
 .../shaders/premultiply-alpha-compute.glsl    |  36 --
 .../shaders/premultiply-alpha-frag.glsl       |  16 +
 .../shaders/premultiply-alpha-vert.glsl       |  48 ++
 voxygen/src/render/mod.rs                     |   3 +-
 voxygen/src/render/pipelines/ui.rs            | 255 +++++++-
 voxygen/src/render/renderer.rs                |  17 +
 voxygen/src/render/renderer/drawer.rs         |  39 ++
 .../src/render/renderer/pipeline_creation.rs  |  35 +-
 voxygen/src/render/renderer/shaders.rs        |   2 +
 voxygen/src/render/texture.rs                 |   1 +
 voxygen/src/ui/cache.rs                       |   4 +-
 voxygen/src/ui/graphic/mod.rs                 | 573 ++++++++++++------
 voxygen/src/ui/ice/cache.rs                   |   4 +-
 voxygen/src/ui/ice/renderer/mod.rs            |   2 +-
 voxygen/src/ui/mod.rs                         |   2 +-
 16 files changed, 789 insertions(+), 258 deletions(-)
 delete mode 100644 assets/voxygen/shaders/premultiply-alpha-compute.glsl
 create mode 100644 assets/voxygen/shaders/premultiply-alpha-frag.glsl
 create mode 100644 assets/voxygen/shaders/premultiply-alpha-vert.glsl

diff --git a/assets/voxygen/shaders/include/srgb.glsl b/assets/voxygen/shaders/include/srgb.glsl
index b1abd93811..5aa033c126 100644
--- a/assets/voxygen/shaders/include/srgb.glsl
+++ b/assets/voxygen/shaders/include/srgb.glsl
@@ -43,6 +43,16 @@ vec3 linear_to_srgb(vec3 col) {
     );
 }
 
+vec4 srgba8_to_linear(uint srgba8) {
+    uvec4 nonlinear = vec4(uvec4(
+        (srgba8 >> 24) & 0xFFu,
+        (srgba8 >> 16) & 0xFFu,
+        (srgba8 >>  8) & 0xFFu,
+         srgba8        & 0xFFu
+    )) / 255.0;
+    return vec4(srgb_to_linear(nonlinear.rgb), nonlinear.a);
+}
+
 float pow5(float x) {
     float x2 = x * x;
     return x2 * x2 * x;
diff --git a/assets/voxygen/shaders/premultiply-alpha-compute.glsl b/assets/voxygen/shaders/premultiply-alpha-compute.glsl
deleted file mode 100644
index 36d11c2f30..0000000000
--- a/assets/voxygen/shaders/premultiply-alpha-compute.glsl
+++ /dev/null
@@ -1,36 +0,0 @@
-#version 420 core
-
-// TODO: should we modify this based on the current device?
-// TODO: would it be better to have 2D workgroup for writing to a local area in the target image? 
-layout(local_size_x = 256) in;
-
-// TODO: writing all images into a single buffer?
-layout(set = 0, binding = 0) readonly buffer InputImage {
-    uint input_pixels[];
-};
-
-layout (std140, set = 0, binding = 1)
-uniform u_locals {
-    // Size of the input image.
-    uvec2 image_size;
-    // Offset to place the transformed input image at in the target
-    // image.
-    uvec2 target_offset;
-};
-
-layout(rgba8, set = 0, binding = 2) uniform writeonly image2D target_image;
-
-void main() {
-    uint global_id = gl_GlobalInvocationId.x;
-    uvec2 src_pixel_pos = uvec2(global_id % image_size.x, global_id / image_size.x);
-    // Otherwise this is is an out of bounds compute instance.
-    if (src_pixel_pos < image_size.y) {
-        uint pixel = input_pixels[global_id]; 
-        vec4 nonlinear = vec4((pixel >> 16) & 0xFFu, (pixel >> 8) & 0xFFu, (pixel >> 8) & 0xFFu, pixel & 0xFFu);
-        vec4 linear;
-        vec4 premultiplied_linear;
-        vec4 premultiplied_nonlinear;
-        // No free srgb with image store operations https://www.khronos.org/opengl/wiki/Image_Load_Store#Format_compatibility
-        imageStore(target_image, src_pixel_pos + target_offset, premultiplied_nonlinear);
-    }
-}
diff --git a/assets/voxygen/shaders/premultiply-alpha-frag.glsl b/assets/voxygen/shaders/premultiply-alpha-frag.glsl
new file mode 100644
index 0000000000..b8a1aaee10
--- /dev/null
+++ b/assets/voxygen/shaders/premultiply-alpha-frag.glsl
@@ -0,0 +1,16 @@
+#version 420 core
+
+layout(set = 0, binding = 0)
+uniform texture2D source_texture;
+
+layout(location = 0) in vec2 source_coords;
+
+layout(location = 0) out vec4 target_color;
+
+void main() {
+    // We get free nonlinear -> linear conversion when sampling from srgb texture;
+    vec4 linear = texelFetch(source_texture, ivec2(source_coords), 0);
+    vec4 premultiplied_linear = vec4(linear.rgb * linear.a, linear.a);
+    // We get free linear -> nonlinear conversion rendering to srgb texture.
+    target_color = premultiplied_linear;
+}
diff --git a/assets/voxygen/shaders/premultiply-alpha-vert.glsl b/assets/voxygen/shaders/premultiply-alpha-vert.glsl
new file mode 100644
index 0000000000..c88977e6fb
--- /dev/null
+++ b/assets/voxygen/shaders/premultiply-alpha-vert.glsl
@@ -0,0 +1,48 @@
+#version 420 core
+
+layout(push_constant) uniform Params {
+    // Size of the source image.
+    uint source_size_xy;
+    // Offset to place the image at in the target texture.
+    //
+    // Origin is the top-left.
+    uint target_offset_xy;
+    // Size of the target texture.
+    uint target_size_xy;
+};
+
+layout(location = 0) out vec2 source_coords;
+
+uvec2 unpack(uint xy) {
+    return uvec2(
+        bitfieldExtract(xy, 0, 16), 
+        bitfieldExtract(xy, 16, 16), 
+    );
+}
+
+void main() {
+    vec2 source_size = vec2(unpack(source_size_xy));
+    vec2 target_offset = vec2(unpack(target_offset_size_xy));
+    vec2 target_size = vec2(unpack(target_size_xy));
+
+    // Generate rectangle (counter clockwise triangles)
+    //
+    // 0 0 1 1 1 0
+    float x_select = float(((uint(gl_VertexIndex) + 1u) / 3u) % 2u);
+    // 1 0 0 0 1 1
+    float y_select = float(((uint(gl_VertexIndex) + 5u) / 3u) % 2u);
+
+    source_coords = vec2(
+        // left -> right (on screen)
+        mix(0.0, 1.0, x_select),
+        // bottom -> top (on screen)
+        mix(1.0, 0.0, y_select),
+    );
+
+    vec2 target_coords_normalized = (target_offset + source_coords * source_size) / target_size;
+
+    // Flip y and transform [0.0, 1.0] -> [-1.0, 1.0] to get NDC coordinates.
+    vec2 v_pos = ((target_coords_normalized * 2.0) - vec2(1.0)) * vec2(1.0, -1.0); 
+
+    gl_Position = vec4(v_pos, 0.0, 1.0);
+}
diff --git a/voxygen/src/render/mod.rs b/voxygen/src/render/mod.rs
index 601200128f..4f3408497c 100644
--- a/voxygen/src/render/mod.rs
+++ b/voxygen/src/render/mod.rs
@@ -43,7 +43,8 @@ pub use self::{
             create_quad as create_ui_quad,
             create_quad_vert_gradient as create_ui_quad_vert_gradient, create_tri as create_ui_tri,
             BoundLocals as UiBoundLocals, Locals as UiLocals, Mode as UiMode,
-            TextureBindGroup as UiTextureBindGroup, Vertex as UiVertex,
+            PremultiplyUpload as UiPremultiplyUpload, TextureBindGroup as UiTextureBindGroup,
+            Vertex as UiVertex,
         },
         GlobalModel, Globals, GlobalsBindGroup, GlobalsLayouts, Light, Shadow,
     },
diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index 1bc3c7589a..82d89a5a7d 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -1,8 +1,21 @@
 use super::super::{Bound, Consts, GlobalsLayouts, Quad, Texture, Tri, Vertex as VertexTrait};
 use bytemuck::{Pod, Zeroable};
+use core::num::NonZeroU32;
 use std::mem;
 use vek::*;
 
+// TODO: profile UI rendering before and after on laptop.
+
+/// The format of textures that the UI sources image data from.
+///
+/// Note, the is not directly used in all relevant locations, but still helps to
+/// more clearly document the that this is the format being used. Notably,
+/// textures are created via `renderer.create_dynamic_texture(...)` and
+/// `renderer.create_texture(&DynamicImage::ImageRgba(image), ...)` (TODO:
+/// update if we have to refactor when implementing the RENDER_ATTACHMENT
+/// usage).
+const UI_IMAGE_FORMAT: wgpu::TextureFormat = wgpu::TextureFormat::Rgba8UnormSrgb;
+
 #[repr(C)]
 #[derive(Copy, Clone, Debug, Zeroable, Pod)]
 pub struct Vertex {
@@ -132,8 +145,8 @@ pub struct TextureBindGroup {
 }
 
 pub struct UiLayout {
-    pub locals: wgpu::BindGroupLayout,
-    pub texture: wgpu::BindGroupLayout,
+    locals: wgpu::BindGroupLayout,
+    texture: wgpu::BindGroupLayout,
 }
 
 impl UiLayout {
@@ -395,20 +408,77 @@ pub fn create_tri(
     )
 }
 
-// Steps:
-// 1. Upload new image via `Device::create_buffer_init`, with `MAP_WRITE` flag
-//    to avoid staging buffer.
-// 2. Run compute pipeline to multiply by alpha reading from this buffer and
-//    writing to the final texture (this may be in an atlas or an independent
-//    texture if the image is over a certain size threshold).
+// Premultiplying alpha on the GPU before placing images into the textures that
+// will be sampled from in the UI pipeline.
 //
-// Info needed in compute shader:
-// * source buffer
-// * target texture
-// * image dimensions
-// * position in the target texture
-// (what is the overhead of compute call? at some point we may be better off
-// converting small images on the cpu)
+// Steps:
+//
+// 1. Upload new image via `Device::create_texture_with_data`.
+//
+//    (NOTE: Initially considered: Creating a storage buffer to read from in the
+// shader via    `Device::create_buffer_init`, with `MAP_WRITE` flag to avoid
+// staging buffer. However, with    dedicated GPUs combining usages other than
+// `COPY_SRC` with `MAP_WRITE` may be less ideal.    Plus, by copying into a
+// texture first we can get free srgb conversion when fetching colors
+//    from the texture. In the future, we may want to branch based on the
+// whether the GPU is    integrated and avoid this extra copy.)
+//
+// 2. Run render pipeline to multiply by alpha reading from this texture and
+// writing to the final    texture (this can either be in an atlas or in an
+// independent texture if the image is over a    certain size threshold).
+//
+//    (NOTE: Initially considered: using a compute pipeline and writing to the
+// final texture as a    storage texture. However, the srgb format can't be used
+// with storage texture and there is not    yet the capability to create
+// non-srgb views of srgb textures.)
+//
+// Info needed:
+//
+// * source texture (texture binding)
+// * target texture (render attachment)
+// * source image dimensions (push constant)
+// * target texture dimensions (push constant)
+// * position in the target texture (push constant)
+//
+// TODO: potential optimizations
+// * what is the overhead of this draw call call? at some point we may be better
+//   off converting very small images on the cpu and/or batching these into a
+//   single draw call
+// * what is the overhead of creating new small textures? for processing many
+//   small images would it be useful to create a single texture the same size as
+//   our cache texture and use Queue::write_texture?
+// * is using create_buffer_init and reading directly from that (with manual
+//   srgb conversion) worth avoiding staging buffer/copy-to-texture for
+//   integrated GPUs?
+// * premultipying alpha in a release asset preparation step
+
+pub struct PremultiplyAlphaLayout {
+    source_texture: wgpu::BindGroupLayout,
+}
+
+impl PremultiplyAlphaLayout {
+    pub fn new(device: &wgpu::Device) -> Self {
+        Self {
+            source_texture: device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: None,
+                entries: &[
+                    // source_texture
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStage::FRAGMENT,
+                        ty: wgpu::BindingType::Texture {
+                            sample_type: wgpu::TextureSampleType::Float { filterable: false },
+                            view_dimension: wgpu::TextureViewDimension::D2,
+                            multisampled: false,
+                        },
+                        count: None,
+                    },
+                ],
+            }),
+        }
+    }
+}
+
 pub struct PremultiplyAlphaPipeline {
     pub pipeline: wgpu::RenderPipeline,
 }
@@ -416,22 +486,163 @@ pub struct PremultiplyAlphaPipeline {
 impl PremultiplyAlphaPipeline {
     pub fn new(
         device: &wgpu::Device,
-        module: &wgpu::ShaderModule,
-        layout: &PremultiplAlphaLayout,
+        vs_module: &wgpu::ShaderModule,
+        fs_module: &wgpu::ShaderModule,
+        layout: &PremultiplyAlphaLayout,
     ) -> Self {
         let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
             label: Some("Premultiply alpha pipeline layout"),
-            push_constant_ranges: &[],
-            bind_group_layouts: &[layout],
+            bind_group_layouts: &[&layout.source_texture],
+            push_constant_ranges: &[wgpu::PushConstantRange {
+                stages: wgpu::ShaderStage::VERTEX,
+                range: 0..core::mem::size_of::<PremultiplyAlphaParams>() as u32,
+            }],
         });
 
-        let pipeline = device.create_compute_pipeline(&wgpu::RenderPipelineDescriptor {
+        let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
             label: Some("Premultiply alpha pipeline"),
             layout: Some(&pipeline_layout),
-            module,
-            entry_point: "main",
+            vertex: wgpu::VertexState {
+                module: vs_module,
+                entry_point: "main",
+                buffers: &[],
+            },
+            primitive: wgpu::PrimitiveState {
+                topology: wgpu::PrimitiveTopology::TriangleList,
+                strip_index_format: None,
+                front_face: wgpu::FrontFace::Ccw,
+                cull_mode: Some(wgpu::Face::Back),
+                clamp_depth: false,
+                polygon_mode: wgpu::PolygonMode::Fill,
+                conservative: false,
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            fragment: Some(wgpu::FragmentState {
+                module: fs_module,
+                entry_point: "main",
+                targets: &[wgpu::ColorTargetState {
+                    format: UI_IMAGE_FORMAT,
+                    blend: None,
+                    write_mask: wgpu::ColorWrite::ALL,
+                }],
+            }),
         });
 
         Self { pipeline }
     }
 }
+
+/// Uploaded as push constant.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, Zeroable, Pod)]
+pub struct PremultiplyAlphaParams {
+    /// Size of the source image.
+    source_size_xy: u32,
+    /// Offset to place the image at in the target texture.
+    ///
+    /// Origin is the top-left.
+    target_offset_xy: u32,
+    /// Size of the target texture.
+    target_size_xy: u32,
+}
+
+/// An image upload that needs alpha premultiplication and which is in a pending
+/// state.
+///
+/// From here we will use the `PremultiplyAlpha` pipeline to premultiply the
+/// alpha while transfering the image to its destination texture.
+pub struct PremultiplyUpload {
+    source_bg: wgpu::BindGroup,
+    source_size_xy: u32,
+    /// The location in the final texture this will be placed at. Technically,
+    /// we don't need this information at this point but it is convenient to
+    /// store it here.
+    offset: Vec2<u16>,
+}
+
+impl PremultiplyUpload {
+    pub fn prepare(
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        layout: &PremultiplyAlphaLayout,
+        image: &image::RgbaImage,
+        offset: Vec2<u16>,
+    ) -> Self {
+        // TODO: duplicating some code from `Texture` since:
+        // 1. We don't need to create a sampler.
+        // 2. Texture::new accepts &DynamicImage which isn't possible to create from
+        //    &RgbaImage without cloning.
+        let image_size = wgpu::Extent3d {
+            width: image.width(),
+            height: image.height(),
+            depth_or_array_layers: 1,
+        };
+        let source_tex = device.create_texture(&wgpu::TextureDescriptor {
+            label: None,
+            size: image_size,
+            mip_level_count: 1,
+            sample_count: 1,
+            dimension: wgpu::TextureDimension::D2,
+            format: wgpu::TextureFormat::Rgba8UnormSrgb,
+            usage: wgpu::TextureUsage::SAMPLED | wgpu::TextureUsage::COPY_DST,
+        });
+        queue.write_texture(
+            wgpu::ImageCopyTexture {
+                texture: &source_tex,
+                mip_level: 0,
+                origin: wgpu::Origin3d::ZERO,
+            },
+            &(&**image)[..(image.width() as usize * image.height() as usize)],
+            wgpu::ImageDataLayout {
+                offset: 0,
+                bytes_per_row: NonZeroU32::new(image.width() * 4),
+                rows_per_image: NonZeroU32::new(image.height()),
+            },
+            image_size,
+        );
+        // Create view to use to create bind group
+        let view = source_tex.create_view(&wgpu::TextureViewDescriptor {
+            label: None,
+            format: Some(wgpu::TextureFormat::Rgba8UnormSrgb),
+            dimension: Some(wgpu::TextureViewDimension::D2),
+            aspect: wgpu::TextureAspect::All,
+            base_mip_level: 0,
+            mip_level_count: None,
+            base_array_layer: 0,
+            array_layer_count: None,
+        });
+        let source_bg = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &layout.source_texture,
+            entries: &[wgpu::BindGroupEntry {
+                binding: 0,
+                resource: wgpu::BindingResource::TextureView(&view),
+            }],
+        });
+
+        // NOTE: We assume the max texture size is less than u16::MAX.
+        let source_size_xy = image_size.width + image_size.height << 16;
+
+        Self {
+            source_bg,
+            source_size_xy,
+            offset,
+        }
+    }
+
+    /// Semantically, this consumes the `PremultiplyUpload` but we need to keep
+    /// the bind group alive to the end of the render pass and don't want to
+    /// bother storing it somewhere else.
+    pub fn draw_data(&self, target: &Texture) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
+        let target_offset_xy = u32::from(self.offset.x) + u32::from(self.offset.y) << 16;
+        let target_dims = target.get_dimensions();
+        // NOTE: We assume the max texture size is less than u16::MAX.
+        let target_size_xy = target_dims.x + target_dims.y << 16;
+        (&self.source_bg, PremultiplyAlphaParams {
+            source_size_xy: self.source_size_xy,
+            target_offset_xy,
+            target_size_xy,
+        })
+    }
+}
diff --git a/voxygen/src/render/renderer.rs b/voxygen/src/render/renderer.rs
index cf0d703211..084e65fe0a 100644
--- a/voxygen/src/render/renderer.rs
+++ b/voxygen/src/render/renderer.rs
@@ -63,6 +63,7 @@ struct ImmutableLayouts {
     clouds: clouds::CloudsLayout,
     bloom: bloom::BloomLayout,
     ui: ui::UiLayout,
+    premultiply_alpha: ui::PremultiplyAlphaLayout,
     blit: blit::BlitLayout,
 }
 
@@ -393,6 +394,7 @@ impl Renderer {
                 &pipeline_modes,
             ));
             let ui = ui::UiLayout::new(&device);
+            let premultiply_alpha = ui::PremultiplyAlphaLayout::new(&device);
             let blit = blit::BlitLayout::new(&device);
 
             let immutable = Arc::new(ImmutableLayouts {
@@ -407,6 +409,7 @@ impl Renderer {
                 clouds,
                 bloom,
                 ui,
+                premultiply_alpha,
                 blit,
             });
 
@@ -1434,6 +1437,20 @@ impl Renderer {
         texture.update(&self.queue, offset, size, bytemuck::cast_slice(data))
     }
 
+    pub fn prepare_premultiply_upload(
+        &self,
+        image: &image::RgbaImage,
+        offset: Vec2<u16>,
+    ) -> ui::PremultiplyUpload {
+        ui::PremultiplyUpload::prepare(
+            &self.device,
+            &self.queue,
+            &self.layouts.premultiply_alpha,
+            image,
+            offset,
+        )
+    }
+
     /// Queue to obtain a screenshot on the next frame render
     pub fn create_screenshot(
         &mut self,
diff --git a/voxygen/src/render/renderer/drawer.rs b/voxygen/src/render/renderer/drawer.rs
index 18827b51fe..a1ceb29750 100644
--- a/voxygen/src/render/renderer/drawer.rs
+++ b/voxygen/src/render/renderer/drawer.rs
@@ -12,6 +12,7 @@ use super::{
     rain_occlusion_map::{RainOcclusionMap, RainOcclusionMapRenderer},
     Renderer, ShadowMap, ShadowMapRenderer,
 };
+use common_base::prof_span;
 use core::{num::NonZeroU32, ops::Range};
 use std::sync::Arc;
 use vek::Aabr;
@@ -424,6 +425,44 @@ impl<'frame> Drawer<'frame> {
         });
     }
 
+    pub fn run_ui_premultiply_passes<'a>(
+        &mut self,
+        targets: impl Iterator<Item = (&'a super::super::Texture, Vec<ui::PremultiplyUpload>)>,
+    ) {
+        let encoder = self.encoder.as_mut().unwrap();
+        let device = self.borrow.device;
+
+        // TODO: What is the CPU overhead of each renderpass?
+        for (i, (target_texture, uploads)) in targets.enumerate() {
+            prof_span!("ui premultiply pass");
+            tracing::info!("{} uploads", uploads.len());
+            let profile_name = format!("ui_premultiply_pass {}", i);
+            let label = format!("ui premultiply pass {}", i);
+            // TODO: a GPU profile scope on each of the passes here may be a bit too fine
+            // grained.
+            let mut render_pass =
+                encoder.scoped_render_pass(&profile_name, device, &wgpu::RenderPassDescriptor {
+                    label: Some(&label),
+                    color_attachments: &[wgpu::RenderPassColorAttachment {
+                        view: &target_texture.view,
+                        resolve_target: None,
+                        ops: wgpu::Operations {
+                            load: wgpu::LoadOp::Clear(wgpu::Color::TRANSPARENT),
+                            store: true,
+                        },
+                    }],
+                    depth_stencil_attachment: None,
+                });
+            for upload in &uploads {
+                let (source_bind_group, push_constant_data) = upload.draw_data(target_texture);
+                let bytes = bytemuck::bytes_of(&push_constant_data);
+                render_pass.set_bind_group(0, source_bind_group, &[]);
+                render_pass.set_push_constants(wgpu::ShaderStage::VERTEX, 0, bytes);
+                render_pass.draw_indexed(0..6, 0, 0..1);
+            }
+        }
+    }
+
     pub fn third_pass(&mut self) -> ThirdPassDrawer {
         let encoder = self.encoder.as_mut().unwrap();
         let device = self.borrow.device;
diff --git a/voxygen/src/render/renderer/pipeline_creation.rs b/voxygen/src/render/renderer/pipeline_creation.rs
index d14d494367..82eb152d3a 100644
--- a/voxygen/src/render/renderer/pipeline_creation.rs
+++ b/voxygen/src/render/renderer/pipeline_creation.rs
@@ -33,6 +33,7 @@ pub struct Pipelines {
     pub lod_object: lod_object::LodObjectPipeline,
     pub terrain: terrain::TerrainPipeline,
     pub ui: ui::UiPipeline,
+    pub premultiply_alpha: ui::PremultiplyAlphaPipeline,
     pub blit: blit::BlitPipeline,
 }
 
@@ -79,6 +80,7 @@ pub struct IngameAndShadowPipelines {
 /// Use to decouple interface pipeline creation when initializing the renderer
 pub struct InterfacePipelines {
     pub ui: ui::UiPipeline,
+    pub premultiply_alpha: ui::PremultiplyAlphaPipeline,
     pub blit: blit::BlitPipeline,
 }
 
@@ -100,6 +102,7 @@ impl Pipelines {
             lod_object: ingame.lod_object,
             terrain: ingame.terrain,
             ui: interface.ui,
+            premultiply_alpha: interface.premultiply_alpha,
             blit: interface.blit,
         }
     }
@@ -127,6 +130,8 @@ struct ShaderModules {
     trail_frag: wgpu::ShaderModule,
     ui_vert: wgpu::ShaderModule,
     ui_frag: wgpu::ShaderModule,
+    premultiply_alpha_vert: wgpu::ShaderModule,
+    premultiply_alpha_frag: wgpu::ShaderModule,
     lod_terrain_vert: wgpu::ShaderModule,
     lod_terrain_frag: wgpu::ShaderModule,
     clouds_vert: wgpu::ShaderModule,
@@ -336,6 +341,8 @@ impl ShaderModules {
             trail_frag: create_shader("trail-frag", ShaderKind::Fragment)?,
             ui_vert: create_shader("ui-vert", ShaderKind::Vertex)?,
             ui_frag: create_shader("ui-frag", ShaderKind::Fragment)?,
+            premultiply_alpha_vert: create_shader("premultiply-alpha-vert", ShaderKind::Vertex)?,
+            premultiply_alpha_frag: create_shader("premultiply-alpha-frag", ShaderKind::Fragment)?,
             lod_terrain_vert: create_shader("lod-terrain-vert", ShaderKind::Vertex)?,
             lod_terrain_frag: create_shader("lod-terrain-frag", ShaderKind::Fragment)?,
             clouds_vert: create_shader("clouds-vert", ShaderKind::Vertex)?,
@@ -416,11 +423,11 @@ struct PipelineNeeds<'a> {
 fn create_interface_pipelines(
     needs: PipelineNeeds,
     pool: &rayon::ThreadPool,
-    tasks: [Task; 2],
+    tasks: [Task; 3],
 ) -> InterfacePipelines {
     prof_span!(_guard, "create_interface_pipelines");
 
-    let [ui_task, blit_task] = tasks;
+    let [ui_task, premultiply_alpha_task, blit_task] = tasks;
     // Construct a pipeline for rendering UI elements
     let create_ui = || {
         ui_task.run(
@@ -438,6 +445,20 @@ fn create_interface_pipelines(
         )
     };
 
+    let create_premultiply_alpha = || {
+        premultiply_alpha_task.run(
+            || {
+                ui::PremultiplyAlphaPipeline::new(
+                    needs.device,
+                    &needs.shaders.premultiply_alpha_vert,
+                    &needs.shaders.premultiply_alpha_frag,
+                    &needs.layouts.premultiply_alpha,
+                )
+            },
+            "premultiply alpha pipeline creation",
+        )
+    };
+
     // Construct a pipeline for blitting, used during screenshotting
     let create_blit = || {
         blit_task.run(
@@ -454,9 +475,15 @@ fn create_interface_pipelines(
         )
     };
 
-    let (ui, blit) = pool.join(create_ui, create_blit);
+    let (ui, (premultiply_alpha, blit)) = pool.join(create_ui, || {
+        pool.join(create_premultiply_alpha, create_blit)
+    });
 
-    InterfacePipelines { ui, blit }
+    InterfacePipelines {
+        ui,
+        premultiply_alpha,
+        blit,
+    }
 }
 
 /// Create IngamePipelines and shadow pipelines in parallel
diff --git a/voxygen/src/render/renderer/shaders.rs b/voxygen/src/render/renderer/shaders.rs
index c2d4b645a2..26072469cf 100644
--- a/voxygen/src/render/renderer/shaders.rs
+++ b/voxygen/src/render/renderer/shaders.rs
@@ -73,6 +73,8 @@ impl assets::Compound for Shaders {
             "trail-frag",
             "ui-vert",
             "ui-frag",
+            "premultiply-alpha-vert",
+            "premultiply_alpha-frag",
             "lod-terrain-vert",
             "lod-terrain-frag",
             "clouds-vert",
diff --git a/voxygen/src/render/texture.rs b/voxygen/src/render/texture.rs
index fe07cb1921..75b777785f 100644
--- a/voxygen/src/render/texture.rs
+++ b/voxygen/src/render/texture.rs
@@ -224,6 +224,7 @@ impl Texture {
         );
     }
 
+    // TODO: remove `get` from this name
     /// Get dimensions of the represented image.
     pub fn get_dimensions(&self) -> vek::Vec3<u32> {
         vek::Vec3::new(
diff --git a/voxygen/src/ui/cache.rs b/voxygen/src/ui/cache.rs
index 774d24fcde..a09ce10069 100644
--- a/voxygen/src/ui/cache.rs
+++ b/voxygen/src/ui/cache.rs
@@ -51,7 +51,9 @@ impl Cache {
         })
     }
 
-    pub fn glyph_cache_tex(&self) -> &(Texture, UiTextureBindGroup) { &self.glyph_cache_tex }
+    pub fn glyph_cache_tex(&self) -> (&Texture, &UiTextureBindGroup) {
+        (&self.glyph_cache_tex.0, &self.glyph_cache_tex.1)
+    }
 
     pub fn cache_mut_and_tex(
         &mut self,
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 06db5fc6a0..1ffee4413d 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -4,7 +4,7 @@ pub mod renderer;
 pub use renderer::{SampleStrat, Transform};
 
 use crate::{
-    render::{Renderer, Texture, UiTextureBindGroup},
+    render::{Renderer, Texture, UiPremultiplyUpload, UiTextureBindGroup},
     ui::KeyedJobs,
 };
 use common::{figure::Segment, slowjob::SlowJobPool};
@@ -12,7 +12,7 @@ use guillotiere::{size2, SimpleAtlasAllocator};
 use hashbrown::{hash_map::Entry, HashMap};
 use image::{DynamicImage, RgbaImage};
 use slab::Slab;
-use std::{hash::Hash, sync::Arc};
+use std::{borrow::Cow, hash::Hash, sync::Arc};
 use tracing::{error, warn};
 use vek::*;
 
@@ -29,6 +29,7 @@ pub enum Graphic {
     Image(Arc<DynamicImage>, Option<Rgba<f32>>),
     // Note: none of the users keep this Arc currently
     Voxel(Arc<Segment>, Transform, SampleStrat),
+    // TODO: Re-evaluate whether we need this (especially outside conrod context)
     Blank,
 }
 
@@ -63,11 +64,11 @@ pub struct TexId(usize);
 
 enum CachedDetails {
     Atlas {
-        // Index of the atlas this is cached in
+        // Index of the atlas this is cached in.
         atlas_idx: usize,
         // Whether this texture is valid.
         valid: bool,
-        // Where in the cache texture this is
+        // Where in the cache texture this is.
         aabr: Aabr<u16>,
     },
     Texture {
@@ -76,10 +77,6 @@ enum CachedDetails {
         // Whether this texture is valid.
         valid: bool,
     },
-    Immutable {
-        // Index of the (unique, immutable, non-atlas) texture this is cached in.
-        index: usize,
-    },
 }
 
 impl CachedDetails {
@@ -89,10 +86,8 @@ impl CachedDetails {
     fn info(
         &self,
         atlases: &[(SimpleAtlasAllocator, usize)],
-        textures: &Slab<(Texture, UiTextureBindGroup)>,
+        textures: &Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
     ) -> (usize, bool, Aabr<u16>) {
-        // NOTE: We don't accept images larger than u16::MAX (rejected in `cache_res`)
-        // (and probably would not be able to create a texture this large).
         match *self {
             CachedDetails::Atlas {
                 atlas_idx,
@@ -102,38 +97,136 @@ impl CachedDetails {
             CachedDetails::Texture { index, valid } => {
                 (index, valid, Aabr {
                     min: Vec2::zero(),
-                    // Note texture should always match the cached dimensions
-                    max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
-                })
-            },
-            CachedDetails::Immutable { index } => {
-                (index, true, Aabr {
-                    min: Vec2::zero(),
-                    // Note texture should always match the cached dimensions
+                    // NOTE (as cast): We don't accept images larger than u16::MAX (rejected in
+                    // `cache_res`) (and probably would not be able to create a texture this
+                    // large).
+                    //
+                    // Note texture should always match the cached dimensions.
                     max: textures[index].0.get_dimensions().xy().map(|e| e as u16),
                 })
             },
         }
     }
 
-    /// Attempt to invalidate this cache entry.
-    /// If invalidation is not possible this returns the index of the texture to
-    /// deallocate
-    fn invalidate(&mut self) -> Result<(), usize> {
+    /// Invalidate this cache entry.
+    fn invalidate(&mut self) {
         match self {
             Self::Atlas { ref mut valid, .. } => {
                 *valid = false;
-                Ok(())
             },
             Self::Texture { ref mut valid, .. } => {
                 *valid = false;
-                Ok(())
             },
-            Self::Immutable { index } => Err(*index),
         }
     }
 }
 
+/// Requirements that a particular graphic has with respect to the atlas
+/// allocation or independent texture it will be stored in.
+///
+/// If this matches between an old graphic and a new one which is replacing it,
+/// we can reuse any of the corresponding locations where it is cached in
+/// textures on the GPU. That is we can invalidate such textures and upload the
+/// new graphic there, rather than needing to allocate a new texture (or new
+/// location in an atlas).
+#[derive(PartialEq)]
+enum TextureRequirements {
+    /// These are uploaded to the GPU in the original resolution of the image
+    /// supplied by the `Graphic` and any scaling is done during sampling in
+    /// the UI fragment shader.
+    Fixed {
+        size: Vec2<u16>,
+        /// Graphics with a border color specified are placed into their own
+        /// individual textures so that the border color can be set
+        /// there. (Note: this is partially a theoretical description as
+        /// border color options are limited in the current graphics API).
+        border_color: Option<Rgba<f32>>,
+    },
+    /// These are rasterized to the exact resolution that they will be displayed
+    /// at and then uploaded to the GPU. This corresponds to
+    /// `Graphic::Voxel`. There may be multiple copies on the GPU if
+    /// different resolutions are requested.
+    ///
+    /// It is expected that the requested sizes will generally not differ when
+    /// switching out a graphic. Thus, dependent cached depdendent should
+    /// always be invalidated since those cached locations will be reusable
+    /// if the requested size is the same.
+    Dependent,
+}
+
+/// These solely determine how a place in an atlas will be found or how a
+/// texture will be created to place the image for a graphic.
+struct TextureParameters {
+    size: Vec2<u16>,
+    border_color: Option<Rgba<f32>>,
+}
+
+/// Key used to refer to an instance of a graphic that has been uploaded to the
+/// GPU.
+#[derive(Clone, Copy, PartialEq, Eq, Hash)]
+struct CacheKey {
+    graphic_id: Id,
+    /// This is `Some` for `TextureRequirements::Dependent`.
+    size: Option<Vec2<u16>>,
+}
+
+impl TextureRequirements {
+    fn from_graphic(graphic: &Graphic) -> Option<Self> {
+        match graphic {
+            Graphic::Image(image, border_color) => {
+                // Image sizes over u16::MAX are not supported (and we would probably not be
+                // able to create a texture large enough to hold them on the GPU anyway)!
+                let image_dims = match (u16::try_from(image.width()), u16::try_from(image.height()))
+                {
+                    (Ok(x), Ok(y)) if x != 0 && y != 0 => Vec2::new(x, y),
+                    _ => {
+                        error!(
+                            "Image dimensions greater than u16::MAX are not supported! Supplied \
+                             image size: ({}, {}).",
+                            image.width(),
+                            image.height(),
+                        );
+                        // TODO: reasonable to return None on this error case? We could potentially
+                        // validate images sizes on add_graphic/replace_graphic?
+                        return None;
+                    },
+                };
+
+                Some(Self::Fixed {
+                    size: image_dims,
+                    border_color: *border_color,
+                })
+            },
+            Graphic::Voxel(_, _, _) => Some(Self::Dependent),
+            Graphic::Blank => None,
+        }
+    }
+
+    // TODO: what if requested size is 0? Do we currently panic on this case and
+    // expect caller not to ask for 0 size? (if so document that)
+    fn to_key_and_tex_parameters(
+        self,
+        graphic_id: Id,
+        requested_size: Vec2<u16>,
+    ) -> (CacheKey, TextureParameters) {
+        // NOTE: Any external parameters which influence the value of the returned
+        // `TextureParameters` must be included in the `CacheKey`. Otherwise,
+        // invalidation and subsequent re-use of cache locations based on the
+        // value of `self` would be wrong.
+        let (size, border_color, key_size) = match self {
+            Self::Fixed { size, border_color } => (size, border_color, None),
+            Self::Dependent => (requested_size, None, Some(requested_size)),
+        };
+        (
+            CacheKey {
+                graphic_id,
+                size: key_size,
+            },
+            TextureParameters { size, border_color },
+        )
+    }
+}
+
 // Caches graphics, only deallocates when changing screen resolution (completely
 // cleared)
 pub struct GraphicCache {
@@ -142,27 +235,35 @@ pub struct GraphicCache {
     /// Next id to use when a new graphic is added
     next_id: u32,
 
-    /// Atlases with the index of their texture in the textures vec
+    /// Atlases with the index of their texture in the textures slab.
     atlases: Vec<(SimpleAtlasAllocator, usize)>,
-    textures: Slab<(Texture, UiTextureBindGroup)>,
+    /// Third tuple element is a list of pending premultiply + upload operations
+    /// for this frame. The purpose of this is to collect all the operations
+    /// together so that a single renderpass is performed for each target
+    /// texture.
+    textures: Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
     /// The location and details of graphics cached on the GPU.
     ///
     /// Graphic::Voxel images include the dimensions they were rasterized at in
     /// the key. Other images are scaled as part of sampling them on the
     /// GPU.
-    cache_map: HashMap<(Id, Option<Vec2<u16>>), CachedDetails>,
+    cache_map: HashMap<CacheKey, CachedDetails>,
 
-    keyed_jobs: KeyedJobs<(Id, Option<Vec2<u16>>), (RgbaImage, Option<Rgba<f32>>)>,
+    keyed_jobs: KeyedJobs<CacheKey, RgbaImage>,
 }
+
 impl GraphicCache {
     pub fn new(renderer: &mut Renderer) -> Self {
-        let (atlas, texture) = create_atlas_texture(renderer);
+        let (atlas, (tex, bind)) = create_atlas_texture(renderer);
+
+        let mut textures = Slab::new();
+        let tex_id = textures.insert((tex, bind, Vec::new()));
 
         Self {
             graphic_map: HashMap::default(),
             next_id: 0,
-            atlases: vec![(atlas, 0)],
-            textures: core::iter::once((0, texture)).collect(),
+            atlases: vec![(atlas, tex_id)],
+            textures,
             cache_map: HashMap::default(),
             keyed_jobs: KeyedJobs::new("IMAGE_PROCESSING"),
         }
@@ -179,29 +280,64 @@ impl GraphicCache {
     }
 
     pub fn replace_graphic(&mut self, id: Id, graphic: Graphic) {
-        if self.graphic_map.insert(id, graphic).is_none() {
-            // This was not an update, so no need to search for keys.
-            return;
-        }
+        let (old, new) = match self.graphic_map.entry(id) {
+            Entry::Occupied(o) => {
+                let slot_mut = o.into_mut();
+                let old = core::mem::replace(slot_mut, graphic);
+                (old, slot_mut)
+            },
+            Entry::Vacant(v) => {
+                // This was not an update, so no need to cleanup caches.
+                v.insert(graphic);
+                return;
+            },
+        };
 
-        // Remove from caches
+        let old_requirements = TextureRequirements::from_graphic(&old);
+        let new_requirements = TextureRequirements::from_graphic(&new);
+        let should_invalidate = old_requirements == new_requirements && old_requirements.is_some();
+
+        // Invalidate if possible or remove from caches.
         // Maybe make this more efficient if replace graphic is used more often
-        self.cache_map.retain(|&(key_id, _), details| {
-            // If the entry does not reference id, or it does but we can successfully
-            // invalidate, retain the entry; otherwise, discard this entry completely.
-            key_id != id
-                || details
-                    .invalidate()
-                    .map_err(|index| self.textures.remove(index))
-                    .is_ok()
-        });
+        // (especially since we should know the exact key for non-voxel
+        // graphics).
+        //
+        // NOTE: at the time of writing, replace_graphic is only used for voxel minimap
+        // updates and item image reloading.
+        if should_invalidate {
+            self.cache_map.iter_mut().for_each(|(key, details)| {
+                if key.graphic_id == id {
+                    details.invalidate();
+                }
+            });
+        } else {
+            self.cache_map.drain_filter(|key, details| {
+                if key.graphic_id == id {
+                    match details {
+                        // TODO: if replace_graphic is used continously for small images (i.e.
+                        // images placed into an atlas) of different sizes, that can use up our
+                        // atlas space since spots in the atlas can't be reused. (this scenario is
+                        // now possible with scaling being done during sampling rather than placing
+                        // resized version into the atlas)
+                        CachedDetails::Atlas { .. } => {},
+                        CachedDetails::Texture { index, .. } => {
+                            self.textures.remove(*index);
+                        },
+                    };
+                    true
+                } else {
+                    false
+                }
+            });
+        }
     }
 
     pub fn get_graphic(&self, id: Id) -> Option<&Graphic> { self.graphic_map.get(&id) }
 
     /// Used to acquire textures for rendering
-    pub fn get_tex(&self, id: TexId) -> &(Texture, UiTextureBindGroup) {
-        self.textures.get(id.0).expect("Invalid TexId used")
+    pub fn get_tex(&self, id: TexId) -> (&Texture, &UiTextureBindGroup) {
+        let (tex, bind, _uploads) = self.textures.get(id.0).expect("Invalid TexId used");
+        (tex, bind)
     }
 
     pub fn get_graphic_dims(&self, (id, rot): (Id, Rotation)) -> Option<(u32, u32)> {
@@ -230,20 +366,28 @@ impl GraphicCache {
     pub fn clear_cache(&mut self, renderer: &mut Renderer) {
         self.cache_map.clear();
 
-        let (atlas, texture) = create_atlas_texture(renderer);
-        self.atlases = vec![(atlas, 0)];
-        self.textures = core::iter::once((0, texture)).collect();
+        let (atlas, (tex, bind)) = create_atlas_texture(renderer);
+        let mut textures = Slab::new();
+        let tex_id = textures.insert((tex, bind, Vec::new()));
+        self.atlases = vec![(atlas, tex_id)];
+        self.textures = textures;
     }
 
     /// Source rectangle should be from 0 to 1, and represents a bounding box
     /// for the source image of the graphic.
+    ///
+    /// [`complete_premultiply_uploads`](Self::complete_premultiply_uploads)
+    /// needs to be called to finalize updates on the GPU that are initiated
+    /// here. Thus, ideally that would be called before drawing UI elements
+    /// using the images cached here.
     pub fn cache_res(
         &mut self,
         renderer: &mut Renderer,
         pool: Option<&SlowJobPool>,
         graphic_id: Id,
-        // TODO: if we aren't resizing here we can upload image earlier... (as long as this doesn't
-        // lead to uploading too much unused stuff).
+        // TODO: if we aren't resizing here we can potentially upload the image earlier... (as long
+        // as this doesn't lead to uploading too much unused stuff). (currently not sure whether it
+        // would be an overall gain to pursue this.)
         requested_dims: Vec2<u16>,
         source: Aabr<f64>,
         rotation: Rotation,
@@ -290,6 +434,7 @@ impl GraphicCache {
             // S-TODO: A bit hacky inserting this here, just to get things working initially
             let scale = requested_dims_upright.map2(
                 Vec2::from(scaled.size()),
+                // S-TODO div by zero potential? If so, is NaN an issue in that case?
                 |screen_pixels, sample_pixels: f64| screen_pixels as f32 / sample_pixels as f32,
             );
             let transformed = rotated_aabr(scaled);
@@ -315,13 +460,9 @@ impl GraphicCache {
             },
         };
 
-        let key = (
-            graphic_id,
-            // Dimensions only included in the key for voxel graphics which we rasterize at the
-            // size that they will be displayed at (other images are scaled when sampling them on
-            // the GPU).
-            matches!(graphic, Graphic::Voxel { .. }).then(|| requested_dims_upright),
-        );
+        let requirements = TextureRequirements::from_graphic(&graphic)?;
+        let (key, texture_parameters) =
+            requirements.to_key_and_tex_parameters(graphic_id, requested_dims_upright);
 
         let details = match cache_map.entry(key) {
             Entry::Occupied(details) => {
@@ -332,20 +473,23 @@ impl GraphicCache {
                 // graphic
                 if !valid {
                     // Create image
-                    let (image, border) = prepare_graphic(
+                    let image = prepare_graphic(
                         graphic,
-                        graphic_id,
+                        key,
                         requested_dims_upright,
+                        false,
                         &mut self.keyed_jobs,
                         pool,
                     )?;
-                    // If the cache location is invalid, we know the underlying texture is mutable,
-                    // so we should be able to replace the graphic.  However, we still want to make
-                    // sure that we are not reusing textures for images that specify a border
-                    // color.
-                    assert!(border.is_none());
+                    // Ensure we don't have any bugs causing the size used to determine if the
+                    // cached version is reusable to not match the size of the image produced by
+                    // prepare_graphic.
+                    assert_eq!(
+                        image.dimensions(),
+                        texture_parameters.size.map(u32::from).into_tuple()
+                    );
                     // Transfer to the gpu
-                    upload_image(renderer, aabr, &textures[idx].0, &image);
+                    upload_image(renderer, aabr, &mut textures[idx].2, &image);
                 }
 
                 return Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)));
@@ -354,62 +498,49 @@ impl GraphicCache {
         };
 
         // Construct image in an optional threadpool.
-        let (image, border_color) = prepare_graphic(
+        let image = prepare_graphic(
             graphic,
-            graphic_id,
+            key,
             requested_dims_upright,
+            false,
             &mut self.keyed_jobs,
             pool,
         )?;
+        // Assert dimensions of image from `prepare_graphic` are as expected!
+        assert_eq!(
+            image.dimensions(),
+            texture_parameters.size.map(u32::from).into_tuple()
+        );
+        // Image dimensions in the format used by the allocator crate.
+        let image_dims_size2d = size2(
+            i32::from(texture_parameters.size.x),
+            i32::from(texture_parameters.size.y),
+        );
 
-        // Image sizes over u16::MAX are not supported (and we would probably not be
-        // able to create a texture large enough to hold them on the GPU anyway)!
-        let image_dims = match {
-            let (x, y) = image.dimensions();
-            (u16::try_from(x), u16::try_from(y))
-        } {
-            (Ok(x), Ok(y)) => Vec2::new(x, y),
-            _ => {
-                error!(
-                    "Image dimensions greater than u16::MAX are not supported! Supplied image \
-                     size: {:?}.",
-                    image.dimensions()
-                );
-                return None;
-            },
-        };
+        // Now we allocate space on the gpu (either in an atlas or an independent
+        // texture) and upload the image to that location.
 
-        // Upload
         let atlas_size = atlas_size(renderer);
-
-        // Allocate space on the gpu.
-        //
-        // Graphics with a border color.
-        let location = if let Some(border_color) = border_color {
-            // Create a new immutable texture.
-            let texture = create_image(renderer, image, border_color);
-            // NOTE: All mutations happen only after the upload succeeds!
-            let index = textures.insert(texture);
-            CachedDetails::Immutable { index }
-        // Graphics over a particular size compared to the atlas size are sent
-        // to their own textures. Here we check for ones under that
-        // size.
-        } else if atlas_size
-            .map2(image_dims, |a, d| a as f32 * ATLAS_CUTOFF_FRAC >= d as f32)
-            .reduce_and()
-        {
+        // Graphics that request a border color or which are over a particular size
+        // compared to the atlas size are sent to their own textures.
+        let can_place_in_atlas = texture_parameters.border_color.is_none()
+            && atlas_size
+                .map2(texture_parameters.size, |a, d| {
+                    a as f32 * ATLAS_CUTOFF_FRAC >= d as f32
+                })
+                .reduce_and();
+        let location = if can_place_in_atlas {
             // Fit into an atlas
             let mut loc = None;
             for (atlas_idx, &mut (ref mut atlas, texture_idx)) in atlases.iter_mut().enumerate() {
-                let clamped_dims = image_dims.map(|e| i32::from(e.max(1)));
-                if let Some(rectangle) = atlas.allocate(size2(clamped_dims.x, clamped_dims.y)) {
+                if let Some(rectangle) = atlas.allocate(image_dims_size2d) {
                     let aabr = aabr_from_alloc_rect(rectangle);
                     loc = Some(CachedDetails::Atlas {
                         atlas_idx,
                         valid: true,
                         aabr,
                     });
-                    upload_image(renderer, aabr, &textures[texture_idx].0, &image);
+                    upload_image(renderer, aabr, &mut textures[texture_idx].2, &image);
                     break;
                 }
             }
@@ -418,17 +549,16 @@ impl GraphicCache {
                 Some(loc) => loc,
                 // Create a new atlas
                 None => {
-                    let (mut atlas, texture) = create_atlas_texture(renderer);
-                    let clamped_dims = image_dims.map(|e| i32::from(e.max(1)));
+                    let (mut atlas, (tex, bind)) = create_atlas_texture(renderer);
                     let aabr = atlas
-                        .allocate(size2(clamped_dims.x, clamped_dims.y))
+                        .allocate(image_dims_size2d)
                         .map(aabr_from_alloc_rect)
                         .unwrap();
                     // NOTE: All mutations happen only after the texture creation succeeds!
-                    let tex_idx = textures.insert(texture);
+                    let tex_idx = textures.insert((tex, bind, Vec::new()));
                     let atlas_idx = atlases.len();
                     atlases.push((atlas, tex_idx));
-                    upload_image(renderer, aabr, &textures[tex_idx].0, &image);
+                    upload_image(renderer, aabr, &mut textures[tex_idx].2, &image);
                     CachedDetails::Atlas {
                         atlas_idx,
                         valid: true,
@@ -438,23 +568,11 @@ impl GraphicCache {
             }
         } else {
             // Create a texture just for this
-            let texture = {
-                let tex = renderer.create_dynamic_texture(image_dims.map(u32::from));
-                let bind = renderer.ui_bind_texture(&tex);
-                (tex, bind)
-            };
-            // NOTE: All mutations happen only after the texture creation succeeds!
-            let index = textures.insert(texture);
-            upload_image(
-                renderer,
-                Aabr {
-                    min: Vec2::zero(),
-                    // Note texture should always match the cached dimensions
-                    max: image_dims,
-                },
-                &textures[index].0,
-                &image,
-            );
+            let (tex, bind, uploads) = create_image(renderer, &image, texture_parameters);
+            // NOTE: All mutations happen only after the texture creation and upload
+            // initiation succeeds! (completing the upload does not have any failure cases
+            // afaik)
+            let index = textures.insert((tex, bind, uploads));
             CachedDetails::Texture { index, valid: true }
         };
 
@@ -466,54 +584,77 @@ impl GraphicCache {
 
         Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)))
     }
+
+    /// Runs render passes with alpha premultiplication pipeline to complete any
+    /// pending uploads.
+    ///
+    /// This should be called before starting the pass where the ui is rendered.
+    pub fn complete_premultiply_uploads(&mut self, drawer: &mut crate::render::Drawer<'_>) {
+        drawer.run_ui_premultiply_passes(
+            self.textures
+                .iter_mut()
+                .map(|(_tex_id, (texture, _, uploads))| (&*texture, core::mem::take(uploads))),
+        );
+    }
 }
 
 /// Prepare the graphic into the form that will be uploaded to the GPU.
 ///
 /// For voxel graphics, draws the graphic at the specified dimensions.
 ///
-/// Also pre-multiplies alpha in images so they can be linearly filtered on the
-/// GPU.
-fn prepare_graphic(
-    graphic: &Graphic,
-    graphic_id: Id,
+/// Also can pre-multiplies alpha in images so they can be linearly filtered on
+/// the GPU (this is optional since we also have a path to do this
+/// premultiplication on the GPU).
+fn prepare_graphic<'graphic>(
+    graphic: &'graphic Graphic,
+    cache_key: CacheKey,
     dims: Vec2<u16>,
-    keyed_jobs: &mut KeyedJobs<(Id, Option<Vec2<u16>>), (RgbaImage, Option<Rgba<f32>>)>,
+    premultiply_on_cpu: bool, // TODO: currently unused
+    keyed_jobs: &mut KeyedJobs<CacheKey, RgbaImage>,
     pool: Option<&SlowJobPool>,
-) -> Option<(RgbaImage, Option<Rgba<f32>>)> {
+) -> Option<Cow<'graphic, RgbaImage>> {
     match graphic {
         // Short-circuit spawning a job on the threadpool for blank graphics
         Graphic::Blank => None,
-        // Dimensions are only included in the key for Graphic::Voxel since otherwise we will
-        // resize on the GPU.
-        Graphic::Image(image, border_color) => keyed_jobs
-            .spawn(pool, (graphic_id, None), || {
-                let image = Arc::clone(image);
-                let border_color = *border_color;
-                move |_| {
-                    // Image will be rescaled when sampling from it on the GPU so we don't
-                    // need to resize it here.
-                    let mut image = image.to_rgba8();
-                    // TODO: could potentially do this when loading the image and for voxel
-                    // images maybe at some point in the `draw_vox` processing. Or we could
-                    // push it in the other direction and do conversion on the GPU.
-                    premultiply_alpha(&mut image);
-                    (image, border_color)
-                }
-            })
-            .map(|(_, v)| v),
+        Graphic::Image(image, _border_color) => {
+            if premultiply_on_cpu {
+                keyed_jobs
+                    .spawn(pool, cache_key, || {
+                        let image = Arc::clone(image);
+                        move |_| {
+                            // Image will be rescaled when sampling from it on the GPU so we don't
+                            // need to resize it here.
+                            let mut image = image.to_rgba8();
+                            // TODO: could potentially do this when loading the image and for voxel
+                            // images maybe at some point in the `draw_vox` processing. Or we could
+                            // push it in the other direction and do conversion on the GPU.
+                            premultiply_alpha(&mut image);
+                            image
+                        }
+                    })
+                    .map(|(_, v)| Cow::Owned(v))
+            } else if let Some(rgba) = image.as_rgba8() {
+                Some(Cow::Borrowed(rgba))
+            } else {
+                // TODO: we should require rgba8 format
+                warn!("Non-rgba8 image in UI used this may be deprecated.");
+                Some(Cow::Owned(image.to_rgba8()))
+            }
+        },
         Graphic::Voxel(segment, trans, sample_strat) => keyed_jobs
-            .spawn(pool, (graphic_id, Some(dims)), || {
+            .spawn(pool, cache_key, || {
                 let segment = Arc::clone(segment);
                 let (trans, sample_strat) = (*trans, *sample_strat);
                 move |_| {
                     // Render voxel model at requested resolution
                     let mut image = renderer::draw_vox(&segment, dims, trans, sample_strat);
-                    premultiply_alpha(&mut image);
-                    (image, None)
+                    if premultiply_on_cpu {
+                        premultiply_alpha(&mut image);
+                    }
+                    image
                 }
             })
-            .map(|(_, v)| v),
+            .map(|(_, v)| Cow::Owned(v)),
     }
 }
 
@@ -525,19 +666,52 @@ fn atlas_size(renderer: &Renderer) -> Vec2<u32> {
         .map(|e| (e * GRAPHIC_CACHE_RELATIVE_SIZE).clamp(512, max_texture_size))
 }
 
+/// This creates a texture suitable for sampling from during the UI pass and
+/// rendering too during alpha premultiplication upload passes.
+fn create_image_texture(
+    renderer: &mut Renderer,
+    size: Vec2<u32>,
+    address_mode: Option<wgpu::AddressMode>,
+) -> (Texture, UiTextureBindGroup) {
+    let tex_info = wgpu::TextureDescriptor {
+        label: None,
+        size: wgpu::Extent3d {
+            width: size.x,
+            height: size.y,
+            depth_or_array_layers: 1,
+        },
+        mip_level_count: 1,
+        sample_count: 1,
+        dimension: wgpu::TextureDimension::D2,
+        format: wgpu::TextureFormat::Rgba8UnormSrgb,
+        usage: wgpu::TextureUsage::RENDER_ATTACHMENT | wgpu::TextureUsage::SAMPLED,
+    };
+    let view_info = wgpu::TextureViewDescriptor {
+        format: Some(tex_info.format),
+        dimension: Some(wgpu::TextureViewDimension::D2),
+        ..Default::default()
+    };
+    let address_mode = address_mode.unwrap_or(wgpu::AddressMode::ClampToEdge);
+    let sampler_info = wgpu::SamplerDescriptor {
+        address_mode_u: address_mode,
+        address_mode_v: address_mode,
+        mag_filter: wgpu::FilterMode::Linear,
+        min_filter: wgpu::FilterMode::Linear,
+        ..Default::default()
+    };
+    let tex = renderer.create_texture_raw(&tex_info, &view_info, &sampler_info);
+    let bind = renderer.ui_bind_texture(&tex);
+    (tex, bind)
+}
+
 fn create_atlas_texture(
     renderer: &mut Renderer,
 ) -> (SimpleAtlasAllocator, (Texture, UiTextureBindGroup)) {
     let size = atlas_size(renderer);
     // Note: here we assume the max texture size is under i32::MAX.
     let atlas = SimpleAtlasAllocator::new(size2(size.x as i32, size.y as i32));
-    let texture = {
-        let tex = renderer.create_dynamic_texture(size);
-        let bind = renderer.ui_bind_texture(&tex);
-        (tex, bind)
-    };
-
-    (atlas, texture)
+    let (tex, bind) = create_image_texture(renderer, size, None);
+    (atlas, (tex, bind))
 }
 
 fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
@@ -550,37 +724,49 @@ fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
     }
 }
 
-fn upload_image(renderer: &mut Renderer, aabr: Aabr<u16>, tex: &Texture, image: &RgbaImage) {
+fn upload_image(
+    renderer: &mut Renderer,
+    aabr: Aabr<u16>,
+    target_texture_uploads: &mut Vec<UiPremultiplyUpload>,
+    image: &RgbaImage,
+) {
     let aabr = aabr.map(u32::from);
+    // Check that this image and the target aabr are the same size (otherwise there
+    // is a bug in this module).
+    debug_assert_eq!(aabr.size().into_tuple(), image.dimensions());
     let offset = aabr.min.into_array();
-    let size = aabr.size().into_array();
-    renderer.update_texture(
-        tex,
-        offset,
-        size,
-        // NOTE: Rgba texture, so each pixel is 4 bytes, ergo this cannot fail.
-        // We make the cast parameters explicit for clarity.
-        bytemuck::cast_slice::<u8, [u8; 4]>(image),
-    );
+
+    // TODO: can we transparently have cpu based version behind this (actually this
+    // would introduce more complexity to be able to do it in the background,
+    // but we could to it not in the background here especially for smaller
+    // things this would work well)
+    let upload = UiPremultiplyUpload::prepare(renderer, image, offset);
+    target_texture_uploads.push(upload);
+    //todo!()
 }
 
+// This is used for border_color.is_some() images (ie the map image).
 fn create_image(
     renderer: &mut Renderer,
-    image: RgbaImage,
-    _border_color: Rgba<f32>, // See TODO below
-) -> (Texture, UiTextureBindGroup) {
-    let tex = renderer
-        .create_texture(
-            &DynamicImage::ImageRgba8(image),
-            Some(wgpu::FilterMode::Linear),
+    image: &RgbaImage,
+    texture_parameters: TextureParameters,
+) -> (Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>) {
+    let (tex, bind) = create_image_texture(
+        renderer,
+        texture_parameters.size.map(u32::from),
+        texture_parameters
+            .border_color
             // TODO: either use the desktop only border color or just emulate this
-            // Some(border_color.into_array().into()),
-            Some(wgpu::AddressMode::ClampToBorder),
-        )
-        .expect("create_texture only panics if non ImageRbga8 is passed");
-    let bind = renderer.ui_bind_texture(&tex);
-
-    (tex, bind)
+            //.map(|c| c.into_array().into()),
+            .map(|_| wgpu::AddressMode::ClampToBorder),
+    );
+    let mut uploads = Vec::new();
+    let aabr = Aabr {
+        min: Vec2::zero(),
+        max: texture_parameters.size,
+    };
+    upload_image(renderer, aabr, &mut uploads, image);
+    (tex, bind, uploads)
 }
 
 fn premultiply_alpha(image: &mut RgbaImage) {
@@ -592,7 +778,7 @@ fn premultiply_alpha(image: &mut RgbaImage) {
     // https://github.com/image-rs/image/blob/a1ce569afd476e881acafdf9e7a5bce294d0db9a/src/buffer.rs#L664
     let dims = image.dimensions();
     let image_buffer_len = dims.0 as usize * dims.1 as usize * 4;
-    let (arrays, end) = image[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
+    let (arrays, end) = (&mut **image)[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
     // Rgba8 has 4 bytes per pixel they should be no remainder when dividing by 4.
     let (end, _) = end.as_chunks_mut::<4>();
     end.iter_mut().for_each(|pixel| {
@@ -637,3 +823,8 @@ fn premultiply_alpha(image: &mut RgbaImage) {
         }
     })
 }
+
+// Next step: Handling invalidation / removal of old textures when
+// replace_graphic is used under new resizing scheme.
+//
+// TODO: does screenshot texture have COPY_DST? I don't think it needs this.
diff --git a/voxygen/src/ui/ice/cache.rs b/voxygen/src/ui/ice/cache.rs
index 8ac2d529d5..984369b49a 100644
--- a/voxygen/src/ui/ice/cache.rs
+++ b/voxygen/src/ui/ice/cache.rs
@@ -61,7 +61,9 @@ impl Cache {
         })
     }
 
-    pub fn glyph_cache_tex(&self) -> &(Texture, UiTextureBindGroup) { &self.glyph_cache_tex }
+    pub fn glyph_cache_tex(&self) -> (&Texture, &UiTextureBindGroup) {
+        (&self.glyph_cache_tex.0, &self.glyph_cache_tex.1)
+    }
 
     pub fn glyph_cache_mut_and_tex(&mut self) -> (&mut GlyphBrush, &(Texture, UiTextureBindGroup)) {
         (self.glyph_brush.get_mut(), &self.glyph_cache_tex)
diff --git a/voxygen/src/ui/ice/renderer/mod.rs b/voxygen/src/ui/ice/renderer/mod.rs
index eb7ea61237..cafa592aaa 100644
--- a/voxygen/src/ui/ice/renderer/mod.rs
+++ b/voxygen/src/ui/ice/renderer/mod.rs
@@ -791,7 +791,7 @@ impl IcedRenderer {
                         DrawKind::Image(tex_id) => self.cache.graphic_cache().get_tex(*tex_id),
                         DrawKind::Plain => self.cache.glyph_cache_tex(),
                     };
-                    drawer.draw(&tex.1, verts.clone()); // Note: trivial clone
+                    drawer.draw(tex.1, verts.clone()); // Note: trivial clone
                 },
             }
         }
diff --git a/voxygen/src/ui/mod.rs b/voxygen/src/ui/mod.rs
index c48726c025..8f9daef7ec 100644
--- a/voxygen/src/ui/mod.rs
+++ b/voxygen/src/ui/mod.rs
@@ -1073,7 +1073,7 @@ impl Ui {
                         DrawKind::Image(tex_id) => self.cache.graphic_cache().get_tex(*tex_id),
                         DrawKind::Plain => self.cache.glyph_cache_tex(),
                     };
-                    drawer.draw(&tex.1, verts.clone()); // Note: trivial clone
+                    drawer.draw(tex.1, verts.clone()); // Note: trivial clone
                 },
             }
         }

From 63096b20424f6d3d7de6063f7b5b1f38bed61312 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Sun, 13 Nov 2022 18:52:13 -0500
Subject: [PATCH 08/14] Complete GPU based alpha premultiplication impl and
 make the CPU version even faster.

* GPU based version started in previous commit, but this fixes errors
  and bugs and gets it actually compiling and running.
* Add a way to batch together images to use the same render pass for GPU
  premultiplication if they all target the same texture.
* Pending premultiplication uploads are automatically done when calling
  `Drawer::third_pass`.
* `fast-srgb8` dep removed, we no longer convert to `f32`s to do the
  premultiplication. Two `[u16; 256]` tables are combined to compute the
  alpa premultiplied color within the same error bounds used by the
  `fast-srgb8` crate. We also no longer use explicit simd.
* Remove explicit lifetimes from `PlayState::render` since `&self` and
  `Drawer<'_>` don't need to have the same lifetime.
* Fix existing bug where invalidated cache entries were never set to
  valid when reusing them.
* `prepare_graphic` now runs some heuristics to determine whether
  premultiplication should be executed CPU side or GPU side and then
  returns a bool indicating if GPU premultiplication is needed.
---
 Cargo.lock                                    |   7 -
 assets/voxygen/shaders/include/srgb.glsl      |   2 +-
 .../shaders/premultiply-alpha-frag.glsl       |   1 +
 .../shaders/premultiply-alpha-vert.glsl       |  12 +-
 voxygen/Cargo.toml                            |   1 -
 voxygen/src/lib.rs                            |   4 +-
 voxygen/src/menu/char_selection/mod.rs        |   2 +-
 voxygen/src/menu/main/mod.rs                  |   2 +-
 voxygen/src/render/mod.rs                     |   2 +-
 voxygen/src/render/pipelines/ui.rs            |  73 ++-
 voxygen/src/render/renderer.rs                |  19 +-
 voxygen/src/render/renderer/drawer.rs         |  40 +-
 voxygen/src/render/renderer/shaders.rs        |   2 +-
 voxygen/src/scene/mod.rs                      |   6 +-
 voxygen/src/session/mod.rs                    |   2 +-
 voxygen/src/ui/cache.rs                       |   3 +
 voxygen/src/ui/graphic/mod.rs                 | 501 +++++++++++++-----
 voxygen/src/ui/ice/cache.rs                   |   3 +
 18 files changed, 505 insertions(+), 177 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 59d38d0088..ae40df69e3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1959,12 +1959,6 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
 
-[[package]]
-name = "fast-srgb8"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd2e7510819d6fbf51a5545c8f922716ecfb14df168a3242f7d33e0239efe6a1"
-
 [[package]]
 name = "fastrand"
 version = "1.8.0"
@@ -7014,7 +7008,6 @@ dependencies = [
  "enum-iterator 1.1.3",
  "etagere",
  "euc",
- "fast-srgb8",
  "gilrs",
  "glyph_brush",
  "guillotiere",
diff --git a/assets/voxygen/shaders/include/srgb.glsl b/assets/voxygen/shaders/include/srgb.glsl
index 5aa033c126..6f2fbca6b9 100644
--- a/assets/voxygen/shaders/include/srgb.glsl
+++ b/assets/voxygen/shaders/include/srgb.glsl
@@ -44,7 +44,7 @@ vec3 linear_to_srgb(vec3 col) {
 }
 
 vec4 srgba8_to_linear(uint srgba8) {
-    uvec4 nonlinear = vec4(uvec4(
+    vec4 nonlinear = vec4(uvec4(
         (srgba8 >> 24) & 0xFFu,
         (srgba8 >> 16) & 0xFFu,
         (srgba8 >>  8) & 0xFFu,
diff --git a/assets/voxygen/shaders/premultiply-alpha-frag.glsl b/assets/voxygen/shaders/premultiply-alpha-frag.glsl
index b8a1aaee10..53eba6a8a7 100644
--- a/assets/voxygen/shaders/premultiply-alpha-frag.glsl
+++ b/assets/voxygen/shaders/premultiply-alpha-frag.glsl
@@ -1,4 +1,5 @@
 #version 420 core
+#extension GL_EXT_samplerless_texture_functions : enable
 
 layout(set = 0, binding = 0)
 uniform texture2D source_texture;
diff --git a/assets/voxygen/shaders/premultiply-alpha-vert.glsl b/assets/voxygen/shaders/premultiply-alpha-vert.glsl
index c88977e6fb..82107fc0eb 100644
--- a/assets/voxygen/shaders/premultiply-alpha-vert.glsl
+++ b/assets/voxygen/shaders/premultiply-alpha-vert.glsl
@@ -15,14 +15,14 @@ layout(location = 0) out vec2 source_coords;
 
 uvec2 unpack(uint xy) {
     return uvec2(
-        bitfieldExtract(xy, 0, 16), 
-        bitfieldExtract(xy, 16, 16), 
+        bitfieldExtract(xy,  0, 16),
+        bitfieldExtract(xy, 16, 16)
     );
 }
 
 void main() {
     vec2 source_size = vec2(unpack(source_size_xy));
-    vec2 target_offset = vec2(unpack(target_offset_size_xy));
+    vec2 target_offset = vec2(unpack(target_offset_xy));
     vec2 target_size = vec2(unpack(target_size_xy));
 
     // Generate rectangle (counter clockwise triangles)
@@ -36,10 +36,10 @@ void main() {
         // left -> right (on screen)
         mix(0.0, 1.0, x_select),
         // bottom -> top (on screen)
-        mix(1.0, 0.0, y_select),
-    );
+        mix(1.0, 0.0, y_select)
+    ) * source_size;
 
-    vec2 target_coords_normalized = (target_offset + source_coords * source_size) / target_size;
+    vec2 target_coords_normalized = (target_offset + source_coords) / target_size;
 
     // Flip y and transform [0.0, 1.0] -> [-1.0, 1.0] to get NDC coordinates.
     vec2 v_pos = ((target_coords_normalized * 2.0) - vec2(1.0)) * vec2(1.0, -1.0); 
diff --git a/voxygen/Cargo.toml b/voxygen/Cargo.toml
index 0b3d7cd63a..c630cfc619 100644
--- a/voxygen/Cargo.toml
+++ b/voxygen/Cargo.toml
@@ -134,7 +134,6 @@ num_cpus = "1.0"
 # vec_map = { version = "0.8.2" }
 inline_tweak = "1.0.2"
 itertools = "0.10.0"
-fast-srgb8 = "1.0.0"
 
 # Tracy
 tracing = "0.1"
diff --git a/voxygen/src/lib.rs b/voxygen/src/lib.rs
index 4c0a525091..ccb5eb34d3 100644
--- a/voxygen/src/lib.rs
+++ b/voxygen/src/lib.rs
@@ -5,7 +5,6 @@
 #![feature(
     array_methods,
     array_zip,
-    array_from_fn,
     drain_filter,
     once_cell,
     trait_alias,
@@ -13,7 +12,6 @@
     map_try_insert,
     slice_as_chunks,
     let_chains
-    portable_simd
 )]
 #![recursion_limit = "2048"]
 
@@ -157,7 +155,7 @@ pub trait PlayState {
     fn globals_bind_group(&self) -> &GlobalsBindGroup;
 
     /// Draw the play state.
-    fn render<'a>(&'a self, drawer: &mut Drawer<'a>, settings: &Settings);
+    fn render(&self, drawer: &mut Drawer<'_>, settings: &Settings);
 
     /// Determines whether egui will be rendered for this play state
     fn egui_enabled(&self) -> bool;
diff --git a/voxygen/src/menu/char_selection/mod.rs b/voxygen/src/menu/char_selection/mod.rs
index 777965066c..784adc249b 100644
--- a/voxygen/src/menu/char_selection/mod.rs
+++ b/voxygen/src/menu/char_selection/mod.rs
@@ -275,7 +275,7 @@ impl PlayState for CharSelectionState {
 
     fn globals_bind_group(&self) -> &GlobalsBindGroup { self.scene.global_bind_group() }
 
-    fn render<'a>(&'a self, drawer: &mut Drawer<'a>, _: &Settings) {
+    fn render(&self, drawer: &mut Drawer<'_>, _: &Settings) {
         let client = self.client.borrow();
         let (humanoid_body, loadout) =
             Self::get_humanoid_body_inventory(&self.char_selection_ui, &client);
diff --git a/voxygen/src/menu/main/mod.rs b/voxygen/src/menu/main/mod.rs
index 5cd3201780..21e76f0f9e 100644
--- a/voxygen/src/menu/main/mod.rs
+++ b/voxygen/src/menu/main/mod.rs
@@ -394,7 +394,7 @@ impl PlayState for MainMenuState {
 
     fn globals_bind_group(&self) -> &GlobalsBindGroup { self.scene.global_bind_group() }
 
-    fn render<'a>(&'a self, drawer: &mut Drawer<'a>, _: &Settings) {
+    fn render(&self, drawer: &mut Drawer<'_>, _: &Settings) {
         // Draw the UI to the screen.
         let mut third_pass = drawer.third_pass();
         if let Some(mut ui_drawer) = third_pass.draw_ui() {
diff --git a/voxygen/src/render/mod.rs b/voxygen/src/render/mod.rs
index 4f3408497c..c607f633f0 100644
--- a/voxygen/src/render/mod.rs
+++ b/voxygen/src/render/mod.rs
@@ -43,7 +43,7 @@ pub use self::{
             create_quad as create_ui_quad,
             create_quad_vert_gradient as create_ui_quad_vert_gradient, create_tri as create_ui_tri,
             BoundLocals as UiBoundLocals, Locals as UiLocals, Mode as UiMode,
-            PremultiplyUpload as UiPremultiplyUpload, TextureBindGroup as UiTextureBindGroup,
+            TextureBindGroup as UiTextureBindGroup, UploadBatchId as UiUploadBatchId,
             Vertex as UiVertex,
         },
         GlobalModel, Globals, GlobalsBindGroup, GlobalsLayouts, Light, Shadow,
diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index 82d89a5a7d..062fcd5ea3 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -552,7 +552,7 @@ pub struct PremultiplyAlphaParams {
 ///
 /// From here we will use the `PremultiplyAlpha` pipeline to premultiply the
 /// alpha while transfering the image to its destination texture.
-pub struct PremultiplyUpload {
+pub(in super::super) struct PremultiplyUpload {
     source_bg: wgpu::BindGroup,
     source_size_xy: u32,
     /// The location in the final texture this will be placed at. Technically,
@@ -562,7 +562,7 @@ pub struct PremultiplyUpload {
 }
 
 impl PremultiplyUpload {
-    pub fn prepare(
+    pub(in super::super) fn prepare(
         device: &wgpu::Device,
         queue: &wgpu::Queue,
         layout: &PremultiplyAlphaLayout,
@@ -593,7 +593,7 @@ impl PremultiplyUpload {
                 mip_level: 0,
                 origin: wgpu::Origin3d::ZERO,
             },
-            &(&**image)[..(image.width() as usize * image.height() as usize)],
+            &(&**image)[..(image.width() as usize * image.height() as usize * 4)],
             wgpu::ImageDataLayout {
                 offset: 0,
                 bytes_per_row: NonZeroU32::new(image.width() * 4),
@@ -622,7 +622,7 @@ impl PremultiplyUpload {
         });
 
         // NOTE: We assume the max texture size is less than u16::MAX.
-        let source_size_xy = image_size.width + image_size.height << 16;
+        let source_size_xy = image_size.width + (image_size.height << 16);
 
         Self {
             source_bg,
@@ -634,15 +634,74 @@ impl PremultiplyUpload {
     /// Semantically, this consumes the `PremultiplyUpload` but we need to keep
     /// the bind group alive to the end of the render pass and don't want to
     /// bother storing it somewhere else.
-    pub fn draw_data(&self, target: &Texture) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
-        let target_offset_xy = u32::from(self.offset.x) + u32::from(self.offset.y) << 16;
+    pub(in super::super) fn draw_data(
+        &self,
+        target: &Texture,
+    ) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
+        let target_offset_xy = u32::from(self.offset.x) + (u32::from(self.offset.y) << 16);
         let target_dims = target.get_dimensions();
         // NOTE: We assume the max texture size is less than u16::MAX.
-        let target_size_xy = target_dims.x + target_dims.y << 16;
+        let target_size_xy = target_dims.x + (target_dims.y << 16);
         (&self.source_bg, PremultiplyAlphaParams {
             source_size_xy: self.source_size_xy,
             target_offset_xy,
             target_size_xy,
         })
     }
+
+    pub fn area_dbg(&self) -> f32 {
+        (self.source_size_xy & 0xFFFF) as f32 * (self.source_size_xy >> 16) as f32
+    }
+}
+
+use std::sync::Arc;
+/// Per-target texture batched uploads
+#[derive(Default)]
+pub(in super::super) struct BatchedUploads {
+    batches: Vec<(Arc<Texture>, Vec<PremultiplyUpload>)>,
+}
+#[derive(Default, Clone, Copy)]
+pub struct UploadBatchId(usize);
+
+impl BatchedUploads {
+    /// Adds the provided upload to the batch indicated by the provided target
+    /// texture and optional batch id. A new batch will be created if the batch
+    /// id is invalid (doesn't refer to an existing batch) or the provided
+    /// target texture isn't the same as the one associated with the
+    /// provided batch id. Creating a new batch involves cloning the
+    /// provided texture `Arc`.
+    ///
+    /// The id of the batch where the upload is ultimately submitted will be
+    /// returned. This id can be used in subsequent calls to add items to
+    /// the same batch (i.e. uploads for the same texture).
+    ///
+    /// Batch ids will reset every frame, however since we check that the
+    /// texture matches, it is perfectly fine to use a stale id (just keep
+    /// in mind that this will create a new batch). This also means that it is
+    /// sufficient to use `UploadBatchId::default()` when calling this with
+    /// new textures.
+    pub(in super::super) fn submit(
+        &mut self,
+        target_texture: &Arc<Texture>,
+        batch_id: UploadBatchId,
+        upload: PremultiplyUpload,
+    ) -> UploadBatchId {
+        if let Some(batch) = self
+            .batches
+            .get_mut(batch_id.0)
+            .filter(|b| Arc::ptr_eq(&b.0, target_texture))
+        {
+            batch.1.push(upload);
+            batch_id
+        } else {
+            let new_batch_id = UploadBatchId(self.batches.len());
+            self.batches
+                .push((Arc::clone(target_texture), vec![upload]));
+            new_batch_id
+        }
+    }
+
+    pub(in super::super) fn take(&mut self) -> Vec<(Arc<Texture>, Vec<PremultiplyUpload>)> {
+        core::mem::take(&mut self.batches)
+    }
 }
diff --git a/voxygen/src/render/renderer.rs b/voxygen/src/render/renderer.rs
index 084e65fe0a..cdc80e140b 100644
--- a/voxygen/src/render/renderer.rs
+++ b/voxygen/src/render/renderer.rs
@@ -178,6 +178,8 @@ pub struct Renderer {
     profile_times: Vec<wgpu_profiler::GpuTimerScopeResult>,
     profiler_features_enabled: bool,
 
+    ui_premultiply_uploads: ui::BatchedUploads,
+
     #[cfg(feature = "egui-ui")]
     egui_renderpass: egui_wgpu_backend::RenderPass,
 
@@ -545,6 +547,8 @@ impl Renderer {
             profile_times: Vec::new(),
             profiler_features_enabled,
 
+            ui_premultiply_uploads: Default::default(),
+
             #[cfg(feature = "egui-ui")]
             egui_renderpass,
 
@@ -1437,18 +1441,23 @@ impl Renderer {
         texture.update(&self.queue, offset, size, bytemuck::cast_slice(data))
     }
 
-    pub fn prepare_premultiply_upload(
-        &self,
+    /// See docs on [`ui::BatchedUploads::submit`].
+    pub fn ui_premultiply_upload(
+        &mut self,
+        target_texture: &Arc<Texture>,
+        batch: ui::UploadBatchId,
         image: &image::RgbaImage,
         offset: Vec2<u16>,
-    ) -> ui::PremultiplyUpload {
-        ui::PremultiplyUpload::prepare(
+    ) -> ui::UploadBatchId {
+        let upload = ui::PremultiplyUpload::prepare(
             &self.device,
             &self.queue,
             &self.layouts.premultiply_alpha,
             image,
             offset,
-        )
+        );
+        self.ui_premultiply_uploads
+            .submit(target_texture, batch, upload)
     }
 
     /// Queue to obtain a screenshot on the next frame render
diff --git a/voxygen/src/render/renderer/drawer.rs b/voxygen/src/render/renderer/drawer.rs
index a1ceb29750..798ac4f70b 100644
--- a/voxygen/src/render/renderer/drawer.rs
+++ b/voxygen/src/render/renderer/drawer.rs
@@ -37,6 +37,14 @@ impl<'frame> Pipelines<'frame> {
         }
     }
 
+    fn premultiply_alpha(&self) -> Option<&ui::PremultiplyAlphaPipeline> {
+        match self {
+            Pipelines::Interface(pipelines) => Some(&pipelines.premultiply_alpha),
+            Pipelines::All(pipelines) => Some(&pipelines.premultiply_alpha),
+            Pipelines::None => None,
+        }
+    }
+
     fn blit(&self) -> Option<&blit::BlitPipeline> {
         match self {
             Pipelines::Interface(pipelines) => Some(&pipelines.blit),
@@ -67,6 +75,7 @@ struct RendererBorrow<'frame> {
     pipeline_modes: &'frame super::PipelineModes,
     quad_index_buffer_u16: &'frame Buffer<u16>,
     quad_index_buffer_u32: &'frame Buffer<u32>,
+    ui_premultiply_uploads: &'frame mut ui::BatchedUploads,
     #[cfg(feature = "egui-ui")]
     egui_render_pass: &'frame mut egui_wgpu_backend::RenderPass,
 }
@@ -118,6 +127,7 @@ impl<'frame> Drawer<'frame> {
             pipeline_modes: &renderer.pipeline_modes,
             quad_index_buffer_u16: &renderer.quad_index_buffer_u16,
             quad_index_buffer_u32: &renderer.quad_index_buffer_u32,
+            ui_premultiply_uploads: &mut renderer.ui_premultiply_uploads,
             #[cfg(feature = "egui-ui")]
             egui_render_pass: &mut renderer.egui_renderpass,
         };
@@ -425,15 +435,19 @@ impl<'frame> Drawer<'frame> {
         });
     }
 
-    pub fn run_ui_premultiply_passes<'a>(
-        &mut self,
-        targets: impl Iterator<Item = (&'a super::super::Texture, Vec<ui::PremultiplyUpload>)>,
-    ) {
+    /// Runs render passes with alpha premultiplication pipeline to complete any
+    /// pending uploads.
+    fn run_ui_premultiply_passes<'a>(&mut self) {
+        prof_span!("run_ui_premultiply_passes");
+        let Some(premultiply_alpha) = self.borrow.pipelines.premultiply_alpha() else { return };
         let encoder = self.encoder.as_mut().unwrap();
         let device = self.borrow.device;
 
+        let targets = self.borrow.ui_premultiply_uploads.take();
+
         // TODO: What is the CPU overhead of each renderpass?
-        for (i, (target_texture, uploads)) in targets.enumerate() {
+        for (i, (target_texture, uploads)) in targets.into_iter().enumerate() {
+            let mut area = 0.0;
             prof_span!("ui premultiply pass");
             tracing::info!("{} uploads", uploads.len());
             let profile_name = format!("ui_premultiply_pass {}", i);
@@ -447,23 +461,31 @@ impl<'frame> Drawer<'frame> {
                         view: &target_texture.view,
                         resolve_target: None,
                         ops: wgpu::Operations {
-                            load: wgpu::LoadOp::Clear(wgpu::Color::TRANSPARENT),
+                            load: wgpu::LoadOp::Load,
                             store: true,
                         },
                     }],
                     depth_stencil_attachment: None,
                 });
+            render_pass.set_pipeline(&premultiply_alpha.pipeline);
             for upload in &uploads {
-                let (source_bind_group, push_constant_data) = upload.draw_data(target_texture);
+                area += upload.area_dbg();
+                let (source_bind_group, push_constant_data) = upload.draw_data(&target_texture);
                 let bytes = bytemuck::bytes_of(&push_constant_data);
                 render_pass.set_bind_group(0, source_bind_group, &[]);
                 render_pass.set_push_constants(wgpu::ShaderStage::VERTEX, 0, bytes);
-                render_pass.draw_indexed(0..6, 0, 0..1);
+                render_pass.draw(0..6, 0..1);
             }
+            let avg_area = area as f32 / uploads.len() as f32;
+            tracing::info!("avg area sqrt {}", f32::sqrt(avg_area));
         }
     }
 
+    /// Note, this automatically calls the internal `run_ui_premultiply_passes`
+    /// to complete any pending image uploads for the UI.
     pub fn third_pass(&mut self) -> ThirdPassDrawer {
+        self.run_ui_premultiply_passes();
+
         let encoder = self.encoder.as_mut().unwrap();
         let device = self.borrow.device;
         let mut render_pass =
@@ -537,7 +559,7 @@ impl<'frame> Drawer<'frame> {
 
     /// Does nothing if the shadow pipelines are not available or shadow map
     /// rendering is disabled
-    pub fn draw_point_shadows<'data: 'frame>(
+    pub fn draw_point_shadows<'data>(
         &mut self,
         matrices: &[shadow::PointLightMatrix; 126],
         chunks: impl Clone
diff --git a/voxygen/src/render/renderer/shaders.rs b/voxygen/src/render/renderer/shaders.rs
index 26072469cf..f2468fc0b0 100644
--- a/voxygen/src/render/renderer/shaders.rs
+++ b/voxygen/src/render/renderer/shaders.rs
@@ -74,7 +74,7 @@ impl assets::Compound for Shaders {
             "ui-vert",
             "ui-frag",
             "premultiply-alpha-vert",
-            "premultiply_alpha-frag",
+            "premultiply-alpha-frag",
             "lod-terrain-vert",
             "lod-terrain-frag",
             "clouds-vert",
diff --git a/voxygen/src/scene/mod.rs b/voxygen/src/scene/mod.rs
index ff9fbf3c47..d5d81f10a7 100644
--- a/voxygen/src/scene/mod.rs
+++ b/voxygen/src/scene/mod.rs
@@ -1232,9 +1232,9 @@ impl Scene {
     pub fn global_bind_group(&self) -> &GlobalsBindGroup { &self.globals_bind_group }
 
     /// Render the scene using the provided `Drawer`.
-    pub fn render<'a>(
-        &'a self,
-        drawer: &mut Drawer<'a>,
+    pub fn render(
+        &self,
+        drawer: &mut Drawer<'_>,
         state: &State,
         viewpoint_entity: EcsEntity,
         tick: u64,
diff --git a/voxygen/src/session/mod.rs b/voxygen/src/session/mod.rs
index 4aef117d82..4d2fccd532 100644
--- a/voxygen/src/session/mod.rs
+++ b/voxygen/src/session/mod.rs
@@ -1896,7 +1896,7 @@ impl PlayState for SessionState {
     /// Render the session to the screen.
     ///
     /// This method should be called once per frame.
-    fn render<'a>(&'a self, drawer: &mut Drawer<'a>, settings: &Settings) {
+    fn render(&self, drawer: &mut Drawer<'_>, settings: &Settings) {
         span!(_guard, "render", "<Session as PlayState>::render");
 
         let client = self.client.borrow();
diff --git a/voxygen/src/ui/cache.rs b/voxygen/src/ui/cache.rs
index a09ce10069..8cd357bf88 100644
--- a/voxygen/src/ui/cache.rs
+++ b/voxygen/src/ui/cache.rs
@@ -7,6 +7,9 @@ use conrod_core::{text::GlyphCache, widget::Id};
 use hashbrown::HashMap;
 use vek::*;
 
+// TODO: probably make cache fields where we have mut getters into just public
+// fields
+
 // Multiplied by current window size
 const GLYPH_CACHE_SIZE: u32 = 1;
 // Glyph cache tolerances
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 1ffee4413d..6540f3c461 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -4,10 +4,11 @@ pub mod renderer;
 pub use renderer::{SampleStrat, Transform};
 
 use crate::{
-    render::{Renderer, Texture, UiPremultiplyUpload, UiTextureBindGroup},
+    render::{Renderer, Texture, UiTextureBindGroup, UiUploadBatchId},
     ui::KeyedJobs,
 };
 use common::{figure::Segment, slowjob::SlowJobPool};
+use common_base::prof_span;
 use guillotiere::{size2, SimpleAtlasAllocator};
 use hashbrown::{hash_map::Entry, HashMap};
 use image::{DynamicImage, RgbaImage};
@@ -86,7 +87,7 @@ impl CachedDetails {
     fn info(
         &self,
         atlases: &[(SimpleAtlasAllocator, usize)],
-        textures: &Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
+        textures: &Slab<(Arc<Texture>, UiTextureBindGroup, UiUploadBatchId)>,
     ) -> (usize, bool, Aabr<u16>) {
         match *self {
             CachedDetails::Atlas {
@@ -119,6 +120,17 @@ impl CachedDetails {
             },
         }
     }
+
+    fn set_valid(&mut self) {
+        match self {
+            Self::Atlas { ref mut valid, .. } => {
+                *valid = true;
+            },
+            Self::Texture { ref mut valid, .. } => {
+                *valid = true;
+            },
+        }
+    }
 }
 
 /// Requirements that a particular graphic has with respect to the atlas
@@ -241,7 +253,7 @@ pub struct GraphicCache {
     /// for this frame. The purpose of this is to collect all the operations
     /// together so that a single renderpass is performed for each target
     /// texture.
-    textures: Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
+    textures: Slab<(Arc<Texture>, UiTextureBindGroup, UiUploadBatchId)>,
     /// The location and details of graphics cached on the GPU.
     ///
     /// Graphic::Voxel images include the dimensions they were rasterized at in
@@ -257,7 +269,7 @@ impl GraphicCache {
         let (atlas, (tex, bind)) = create_atlas_texture(renderer);
 
         let mut textures = Slab::new();
-        let tex_id = textures.insert((tex, bind, Vec::new()));
+        let tex_id = textures.insert((tex, bind, UiUploadBatchId::default()));
 
         Self {
             graphic_map: HashMap::default(),
@@ -336,7 +348,7 @@ impl GraphicCache {
 
     /// Used to acquire textures for rendering
     pub fn get_tex(&self, id: TexId) -> (&Texture, &UiTextureBindGroup) {
-        let (tex, bind, _uploads) = self.textures.get(id.0).expect("Invalid TexId used");
+        let (tex, bind, _upload_batch) = self.textures.get(id.0).expect("Invalid TexId used");
         (tex, bind)
     }
 
@@ -368,18 +380,13 @@ impl GraphicCache {
 
         let (atlas, (tex, bind)) = create_atlas_texture(renderer);
         let mut textures = Slab::new();
-        let tex_id = textures.insert((tex, bind, Vec::new()));
+        let tex_id = textures.insert((tex, bind, UiUploadBatchId::default()));
         self.atlases = vec![(atlas, tex_id)];
         self.textures = textures;
     }
 
     /// Source rectangle should be from 0 to 1, and represents a bounding box
     /// for the source image of the graphic.
-    ///
-    /// [`complete_premultiply_uploads`](Self::complete_premultiply_uploads)
-    /// needs to be called to finalize updates on the GPU that are initiated
-    /// here. Thus, ideally that would be called before drawing UI elements
-    /// using the images cached here.
     pub fn cache_res(
         &mut self,
         renderer: &mut Renderer,
@@ -465,19 +472,18 @@ impl GraphicCache {
             requirements.to_key_and_tex_parameters(graphic_id, requested_dims_upright);
 
         let details = match cache_map.entry(key) {
-            Entry::Occupied(details) => {
-                let details = details.get();
+            Entry::Occupied(mut details) => {
+                let details = details.get_mut();
                 let (idx, valid, aabr) = details.info(atlases, textures);
 
                 // Check if the cached version has been invalidated by replacing the underlying
                 // graphic
                 if !valid {
                     // Create image
-                    let image = prepare_graphic(
+                    let (image, gpu_premul) = prepare_graphic(
                         graphic,
                         key,
                         requested_dims_upright,
-                        false,
                         &mut self.keyed_jobs,
                         pool,
                     )?;
@@ -489,7 +495,9 @@ impl GraphicCache {
                         texture_parameters.size.map(u32::from).into_tuple()
                     );
                     // Transfer to the gpu
-                    upload_image(renderer, aabr, &mut textures[idx].2, &image);
+                    let (ref texture, _, ref mut upload_batch) = &mut textures[idx];
+                    upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
+                    details.set_valid();
                 }
 
                 return Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)));
@@ -498,11 +506,10 @@ impl GraphicCache {
         };
 
         // Construct image in an optional threadpool.
-        let image = prepare_graphic(
+        let (image, gpu_premul) = prepare_graphic(
             graphic,
             key,
             requested_dims_upright,
-            false,
             &mut self.keyed_jobs,
             pool,
         )?;
@@ -540,7 +547,8 @@ impl GraphicCache {
                         valid: true,
                         aabr,
                     });
-                    upload_image(renderer, aabr, &mut textures[texture_idx].2, &image);
+                    let (ref texture, _, ref mut upload_batch) = &mut textures[texture_idx];
+                    upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
                     break;
                 }
             }
@@ -555,10 +563,11 @@ impl GraphicCache {
                         .map(aabr_from_alloc_rect)
                         .unwrap();
                     // NOTE: All mutations happen only after the texture creation succeeds!
-                    let tex_idx = textures.insert((tex, bind, Vec::new()));
+                    let tex_idx = textures.insert((tex, bind, UiUploadBatchId::default()));
                     let atlas_idx = atlases.len();
                     atlases.push((atlas, tex_idx));
-                    upload_image(renderer, aabr, &mut textures[tex_idx].2, &image);
+                    let (ref texture, _, ref mut upload_batch) = &mut textures[tex_idx];
+                    upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
                     CachedDetails::Atlas {
                         atlas_idx,
                         valid: true,
@@ -568,11 +577,12 @@ impl GraphicCache {
             }
         } else {
             // Create a texture just for this
-            let (tex, bind, uploads) = create_image(renderer, &image, texture_parameters);
+            let (tex, bind, upload_batch) =
+                create_image(renderer, &image, texture_parameters, gpu_premul);
             // NOTE: All mutations happen only after the texture creation and upload
-            // initiation succeeds! (completing the upload does not have any failure cases
-            // afaik)
-            let index = textures.insert((tex, bind, uploads));
+            // initiation succeeds! (completing the upload does not have any
+            // failure cases afaik)
+            let index = textures.insert((tex, bind, upload_batch));
             CachedDetails::Texture { index, valid: true }
         };
 
@@ -584,77 +594,76 @@ impl GraphicCache {
 
         Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)))
     }
-
-    /// Runs render passes with alpha premultiplication pipeline to complete any
-    /// pending uploads.
-    ///
-    /// This should be called before starting the pass where the ui is rendered.
-    pub fn complete_premultiply_uploads(&mut self, drawer: &mut crate::render::Drawer<'_>) {
-        drawer.run_ui_premultiply_passes(
-            self.textures
-                .iter_mut()
-                .map(|(_tex_id, (texture, _, uploads))| (&*texture, core::mem::take(uploads))),
-        );
-    }
 }
 
 /// Prepare the graphic into the form that will be uploaded to the GPU.
 ///
 /// For voxel graphics, draws the graphic at the specified dimensions.
 ///
-/// Also can pre-multiplies alpha in images so they can be linearly filtered on
-/// the GPU (this is optional since we also have a path to do this
-/// premultiplication on the GPU).
+/// Alpha premultiplication is necessary so that  images so they can be linearly
+/// filtered on the GPU. Premultiplication can either occur here or on the GPU
+/// depending on the size of the image and other factors. If premultiplication
+/// on the GPU is needed the returned bool will be `true`.
 fn prepare_graphic<'graphic>(
     graphic: &'graphic Graphic,
     cache_key: CacheKey,
     dims: Vec2<u16>,
-    premultiply_on_cpu: bool, // TODO: currently unused
     keyed_jobs: &mut KeyedJobs<CacheKey, RgbaImage>,
     pool: Option<&SlowJobPool>,
-) -> Option<Cow<'graphic, RgbaImage>> {
+) -> Option<(Cow<'graphic, RgbaImage>, bool)> {
+    prof_span!("prepare_graphic");
     match graphic {
-        // Short-circuit spawning a job on the threadpool for blank graphics
         Graphic::Blank => None,
         Graphic::Image(image, _border_color) => {
-            if premultiply_on_cpu {
-                keyed_jobs
-                    .spawn(pool, cache_key, || {
-                        let image = Arc::clone(image);
-                        move |_| {
-                            // Image will be rescaled when sampling from it on the GPU so we don't
-                            // need to resize it here.
-                            let mut image = image.to_rgba8();
-                            // TODO: could potentially do this when loading the image and for voxel
-                            // images maybe at some point in the `draw_vox` processing. Or we could
-                            // push it in the other direction and do conversion on the GPU.
-                            premultiply_alpha(&mut image);
-                            image
-                        }
-                    })
-                    .map(|(_, v)| Cow::Owned(v))
-            } else if let Some(rgba) = image.as_rgba8() {
-                Some(Cow::Borrowed(rgba))
-            } else {
-                // TODO: we should require rgba8 format
-                warn!("Non-rgba8 image in UI used this may be deprecated.");
-                Some(Cow::Owned(image.to_rgba8()))
-            }
+            // Image will be rescaled when sampling from it on the GPU so we don't
+            // need to resize it here.
+            //
+            // TODO: We could potentially push premultiplication even earlier (e.g. to the
+            // time of loading images or packaging veloren for distribution).
+            let mut rgba_cow = image.as_rgba8().map_or_else(
+                || {
+                    // TODO: we may want to require loading in as the rgba8 format so we don't have
+                    // to perform conversion here. On the other hand, we can take advantage of
+                    // certain formats to know that alpha premultiplication doesn't need to be
+                    // performed (but we would probably just want to store that with the loaded
+                    // rgba8 format).
+                    Cow::Owned(image.to_rgba8())
+                },
+                Cow::Borrowed,
+            );
+            // NOTE: We do premultiplication on the main thread since if it would be
+            // expensive enough to do in the background we would just do it on
+            // the GPU. Could still use `rayon` to parallelize this work, if
+            // needed.
+            let premultiply_strategy = PremultiplyStrategy::determine(&*rgba_cow);
+            let needs_gpu_premultiply = match premultiply_strategy {
+                PremultiplyStrategy::UseGpu => true,
+                PremultiplyStrategy::NotNeeded => false,
+                PremultiplyStrategy::UseCpu => {
+                    // NOTE: to_mut will clone the image if it was Cow::Borrowed
+                    premultiply_alpha(rgba_cow.to_mut());
+                    false
+                },
+            };
+
+            Some((rgba_cow, needs_gpu_premultiply))
         },
         Graphic::Voxel(segment, trans, sample_strat) => keyed_jobs
             .spawn(pool, cache_key, || {
                 let segment = Arc::clone(segment);
                 let (trans, sample_strat) = (*trans, *sample_strat);
                 move |_| {
+                    // TODO: for now we always use CPU premultiplication for these, may want to
+                    // re-evaluate this after zoomy worldgen branch is merged (and it is more clear
+                    // when these jobs go to the background thread pool or not).
+
                     // Render voxel model at requested resolution
                     let mut image = renderer::draw_vox(&segment, dims, trans, sample_strat);
-                    if premultiply_on_cpu {
-                        premultiply_alpha(&mut image);
-                    }
+                    premultiply_alpha(&mut image);
                     image
                 }
             })
-            .map(|(_, v)| Cow::Owned(v)),
+            .map(|(_, v)| (Cow::Owned(v), false)),
     }
 }
 
@@ -672,7 +681,11 @@ fn create_image_texture(
     renderer: &mut Renderer,
     size: Vec2<u32>,
     address_mode: Option<wgpu::AddressMode>,
-) -> (Texture, UiTextureBindGroup) {
+) -> (Arc<Texture>, UiTextureBindGroup) {
+    // TODO: Right now we have to manually clear images to workaround AMD DX bug,
+    // for this we use Queue::write_texture which needs this usage. I think this
+    // may be fixed in newer wgpu versions that auto-clear the texture.
+    let workaround_usage = wgpu::TextureUsage::COPY_DST;
     let tex_info = wgpu::TextureDescriptor {
         label: None,
         size: wgpu::Extent3d {
@@ -684,7 +697,10 @@ fn create_image_texture(
         sample_count: 1,
         dimension: wgpu::TextureDimension::D2,
         format: wgpu::TextureFormat::Rgba8UnormSrgb,
-        usage: wgpu::TextureUsage::RENDER_ATTACHMENT | wgpu::TextureUsage::SAMPLED,
+        usage: wgpu::TextureUsage::RENDER_ATTACHMENT // GPU premultiply
+            | wgpu::TextureUsage::COPY_DST // CPU premultiply
+            | wgpu::TextureUsage::SAMPLED // using image in ui rendering
+            | workaround_usage,
     };
     let view_info = wgpu::TextureViewDescriptor {
         format: Some(tex_info.format),
@@ -701,12 +717,12 @@ fn create_image_texture(
     };
     let tex = renderer.create_texture_raw(&tex_info, &view_info, &sampler_info);
     let bind = renderer.ui_bind_texture(&tex);
-    (tex, bind)
+    (Arc::new(tex), bind)
 }
 
 fn create_atlas_texture(
     renderer: &mut Renderer,
-) -> (SimpleAtlasAllocator, (Texture, UiTextureBindGroup)) {
+) -> (SimpleAtlasAllocator, (Arc<Texture>, UiTextureBindGroup)) {
     let size = atlas_size(renderer);
     // Note: here we assume the max texture size is under i32::MAX.
     let atlas = SimpleAtlasAllocator::new(size2(size.x as i32, size.y as i32));
@@ -726,23 +742,34 @@ fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
 
 fn upload_image(
     renderer: &mut Renderer,
-    aabr: Aabr<u16>,
-    target_texture_uploads: &mut Vec<UiPremultiplyUpload>,
+    target_texture: &Arc<Texture>,
+    upload_batch: &mut UiUploadBatchId,
     image: &RgbaImage,
+    aabr: Aabr<u16>,
+    premultiply_on_gpu: bool,
 ) {
-    let aabr = aabr.map(u32::from);
     // Check that this image and the target aabr are the same size (otherwise there
     // is a bug in this module).
-    debug_assert_eq!(aabr.size().into_tuple(), image.dimensions());
-    let offset = aabr.min.into_array();
-
-    // TODO: can we transparently have cpu based version behind this (actually this
-    // would introduce more complexity to be able to do it in the background,
-    // but we could to it not in the background here especially for smaller
-    // things this would work well)
-    let upload = UiPremultiplyUpload::prepare(renderer, image, offset);
-    target_texture_uploads.push(upload);
-    //todo!()
+    debug_assert_eq!(aabr.map(u32::from).size().into_tuple(), image.dimensions());
+    if premultiply_on_gpu {
+        *upload_batch =
+            renderer.ui_premultiply_upload(target_texture, *upload_batch, image, aabr.min);
+    } else {
+        let aabr = aabr.map(u32::from);
+        let offset = aabr.min.into_array();
+        let size = aabr.size().into_array();
+        // upload directly
+        renderer.update_texture(
+            &*target_texture,
+            offset,
+            size,
+            // NOTE: Rgba texture, so each pixel is 4 bytes, ergo this cannot fail.
+            // We make the cast parameters explicit for clarity.
+            bytemuck::cast_slice::<u8, [u8; 4]>(
+                &(&**image)[..size[0] as usize * size[1] as usize * 4],
+            ),
+        )
+    }
 }
 
 // This is used for border_color.is_some() images (ie the map image).
@@ -750,7 +777,8 @@ fn create_image(
     renderer: &mut Renderer,
     image: &RgbaImage,
     texture_parameters: TextureParameters,
-) -> (Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>) {
+    premultiply_on_gpu: bool,
+) -> (Arc<Texture>, UiTextureBindGroup, UiUploadBatchId) {
     let (tex, bind) = create_image_texture(
         renderer,
         texture_parameters.size.map(u32::from),
@@ -760,17 +788,82 @@ fn create_image(
             //.map(|c| c.into_array().into()),
             .map(|_| wgpu::AddressMode::ClampToBorder),
     );
-    let mut uploads = Vec::new();
+    let mut upload_batch = UiUploadBatchId::default();
     let aabr = Aabr {
         min: Vec2::zero(),
         max: texture_parameters.size,
     };
-    upload_image(renderer, aabr, &mut uploads, image);
-    (tex, bind, uploads)
+    upload_image(
+        renderer,
+        &tex,
+        &mut upload_batch,
+        image,
+        aabr,
+        premultiply_on_gpu,
+    );
+    (tex, bind, upload_batch)
+}
+
+// CPU-side alpha premultiplication implementation.
+
+pub struct PremultiplyLookupTable {
+    alpha: [u16; 256],
+    // This is for both colors that are always below the linear transform threshold (of the
+    // transform between linear/non-linear srgb) and colors that start above the threshold when
+    // transforming into linear srgb and then fall below it after being multiplied by alpha (before
+    // being transformed out of linear srgb).
+    color: [u16; 256],
+}
+
+impl Default for PremultiplyLookupTable {
+    fn default() -> Self {
+        #[rustfmt::skip]
+        fn accurate_to_linear(c: u8) -> f32 {
+            let c = c as f32 / 255.0;
+            // https://en.wikipedia.org/wiki/SRGB#Transformation
+            if c <= 0.04045 {
+                c / 12.92
+            } else {
+                // 0.055 ~= 14
+                ((c + 0.055) / 1.055).powf(2.4)
+            }
+        }
+
+        use core::array;
+        let alpha = array::from_fn(|alpha| {
+            // NOTE: u16::MAX + 1 here relies on the max alpha being short-circuited (and
+            // not using this table). We multiply by this factor since it is a
+            // power of 2, which means later demultiplying it will optimize to a
+            // bitshift.
+            (((alpha as f32 / 255.0).powf(1.0 / 2.4) * (u16::MAX as f32 + 1.0)) + 0.5) as u16
+        });
+        let color = array::from_fn(|color| {
+            (if color <= 10 {
+                //  <= 10 means the transform is linear!
+                color as f32 / 255.0
+            } else {
+                // Here the transform into linear srgb isn't linear but the transform out of it is. 
+                //
+                // This is transform into and out of linear srgb with the theoretical alpha
+                // multiplication factored out.
+                accurate_to_linear(color as u8) * 12.92
+            }
+            // take advantage of the precision offered by u16
+            * (1 << 13) as f32
+            // round to the nearest integer when the cast truncates
+            + 0.5) as u16
+        });
+        Self { alpha, color }
+    }
 }
 
 fn premultiply_alpha(image: &mut RgbaImage) {
-    use fast_srgb8::{f32x4_to_srgb8, srgb8_to_f32};
+    prof_span!("premultiply alpha");
+
+    lazy_static::lazy_static! {
+        static ref LOOKUP: PremultiplyLookupTable = Default::default();
+    }
+    let lookup = &*LOOKUP;
     // TODO: Apparently it is possible for ImageBuffer raw vec to have more pixels
     // than the dimensions of the actual image (I don't think we actually have
     // this occuring but we should probably fix other spots that use the raw
@@ -779,52 +872,200 @@ fn premultiply_alpha(image: &mut RgbaImage) {
     let dims = image.dimensions();
     let image_buffer_len = dims.0 as usize * dims.1 as usize * 4;
     let (arrays, end) = (&mut **image)[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
-    // Rgba8 has 4 bytes per pixel they should be no remainder when dividing by 4.
+    // Rgba8 has 4 bytes per pixel there should be no remainder when dividing by 4.
     let (end, _) = end.as_chunks_mut::<4>();
     end.iter_mut().for_each(|pixel| {
         let alpha = pixel[3];
         if alpha == 0 {
             *pixel = [0; 4];
-        } else if alpha != 255 {
-            let linear_alpha = alpha as f32 / 255.0;
-            let [r, g, b] = core::array::from_fn(|i| srgb8_to_f32(pixel[i]) * linear_alpha);
-            let srgb8 = f32x4_to_srgb8([r, g, b, 0.0]);
-            (pixel[0], pixel[1], pixel[3]) = (srgb8[0], srgb8[1], srgb8[3]);
+            return;
+        } else if alpha == 255 {
+            return;
+        };
+
+        for color in &mut pixel[..3] {
+            let predicted = ((lookup.alpha[alpha as usize] as u32) * (*color as u32 + 14) + 32433)
+                / (u16::MAX as u32 + 1);
+            let multiplied_color = (if predicted < 9 + 14 {
+                (lookup.color[*color as usize] as u32 * alpha as u32 + 4096) >> 13
+            } else {
+                predicted - 14
+            }) as u8;
+            *color = multiplied_color;
         }
     });
     arrays.iter_mut().for_each(|pixelx4| {
-        use core::simd::{f32x4, u8x4, Simd};
-        let alpha = Simd::from_array([pixelx4[3], pixelx4[7], pixelx4[11], pixelx4[15]]);
-        if alpha == Simd::splat(0) {
-            *pixelx4 = [0; 16];
-        } else if alpha != Simd::splat(255) {
-            let linear_simd = |array: [u8; 4]| Simd::from_array(array.map(srgb8_to_f32));
-            // Pack rgb components from the 4th pixel into the the last position for each of
-            // the other 3 pixels.
-            let a = linear_simd([pixelx4[0], pixelx4[1], pixelx4[2], pixelx4[12]]);
-            let b = linear_simd([pixelx4[4], pixelx4[5], pixelx4[6], pixelx4[13]]);
-            let c = linear_simd([pixelx4[8], pixelx4[9], pixelx4[10], pixelx4[14]]);
-            let linear_alpha = alpha.cast::<f32>() * Simd::splat(1.0 / 255.0);
-
-            // Multiply by alpha and then convert back into srgb8.
-            let premultiply = |x: f32x4, i| {
-                let mut a = f32x4::splat(linear_alpha[i]);
-                a[3] = linear_alpha[3];
-                u8x4::from_array(f32x4_to_srgb8((x * a).to_array()))
-            };
-            let pa = premultiply(a, 0);
-            let pb = premultiply(b, 1);
-            let pc = premultiply(c, 2);
-
-            (pixelx4[0], pixelx4[1], pixelx4[2]) = (pa[0], pa[1], pa[2]);
-            (pixelx4[4], pixelx4[5], pixelx4[6]) = (pb[0], pb[1], pb[2]);
-            (pixelx4[8], pixelx4[9], pixelx4[10]) = (pc[0], pc[1], pc[2]);
-            (pixelx4[12], pixelx4[13], pixelx4[14]) = (pa[3], pb[3], pc[3]);
+        // Short-circuit for alpha == 0 or 255
+        // This adds ~7 us (worst case) for a 256x256 image.
+        // Best case is decreased to 20 us total time.
+        if pixelx4[3] == pixelx4[7] && pixelx4[3] == pixelx4[11] && pixelx4[3] == pixelx4[15] {
+            if pixelx4[3] == 0 {
+                *pixelx4 = [0; 16];
+                return;
+            } else if pixelx4[3] == u8::MAX {
+                return;
+            }
         }
-    })
+
+        // Lookup transformed alpha values for each pixel first.
+        // Putting this here seems to make things slightly faster.
+        let factors = [
+            lookup.alpha[pixelx4[3] as usize],
+            lookup.alpha[pixelx4[7] as usize],
+            lookup.alpha[pixelx4[11] as usize],
+            lookup.alpha[pixelx4[15] as usize],
+        ];
+        for pixel_index in 0..4 {
+            let alpha_factor = factors[pixel_index];
+            let alpha = pixelx4[pixel_index * 4 + 3];
+            // Putting this code outside the loop makes things take ~25% less time.
+            let color_factors = [
+                lookup.color[pixelx4[pixel_index * 4 + 0] as usize] as u32 * alpha as u32 + 4096,
+                lookup.color[pixelx4[pixel_index * 4 + 1] as usize] as u32 * alpha as u32 + 4096,
+                lookup.color[pixelx4[pixel_index * 4 + 2] as usize] as u32 * alpha as u32 + 4096,
+            ];
+            for i in 0..3 {
+                let color = &mut pixelx4[pixel_index * 4 + i];
+                // Loosely based on transform to linear and back (above threshold) (this is
+                // where use of 14 comes from).
+                // `32433` selected via trial and error to reduce the number of mismatches.
+                // `/ (u16::MAX as u32 + 1)` transforms back to `u8` precision (we add 1 so it
+                // will be a division by a power of 2 which optimizes well).
+                let predicted =
+                    ((alpha_factor as u32) * (*color as u32 + 14) + 32328) / (u16::MAX as u32 + 1);
+                let multiplied_color = (if predicted < 9 + 14 {
+                    // Here we handle two cases:
+                    // 1. When the transform starts and ends as linear.
+                    // 2. When the color is over the linear threshold for the transform into linear
+                    //    space but below this threshold when transforming back out (due to being
+                    //    multiplied with a small alpha).
+                    // (in both cases the result is linearly related to alpha and we can encode how
+                    // it is related to the color in a lookup table)
+                    // NOTE: 212 is the largest color value used here (when alpha isn't 0)
+                    color_factors[i] >> 13
+                } else {
+                    predicted - 14
+                }) as u8;
+                *color = multiplied_color;
+            }
+        }
+    });
 }
 
-// Next step: Handling invalidation / removal of old textures when
-// replace_graphic is used under new resizing scheme.
-//
-// TODO: does screenshot texture have COPY_DST? I don't think it needs this.
+/// Strategy for how alpha premultiplication will be applied to an image.
+enum PremultiplyStrategy {
+    UseCpu,
+    UseGpu,
+    // Image is fully opaque.
+    NotNeeded,
+}
+
+impl PremultiplyStrategy {
+    #[rustfmt::skip] // please don't format comment with 'ns/pixel' to a separate line from the value
+    fn determine(image: &RgbaImage) -> Self {
+        // TODO: Would be useful to re-time this after a wgpu update.
+        //
+        // Thresholds below are based on the timing measurements of the CPU based premultiplication
+        // vs ovehead of interacting with the GPU API to perform premultiplication on the GPU.
+        // These timings are quite circumstantial and could vary between machines, wgpu updates,
+        // and changes to the structure of the GPU based path.  
+        //
+        // GPU path costs (For calculations I used `57.6 us` as a roughly reasonable estimate of
+        // total time here but that can vary lower and higher. Everything is a bit imprecise here
+        // so I won't list individual timings. The key takeaway is that this can be made more
+        // efficient by avoidiing the create/drop of a texture, texture view, and bind group for
+        // each image. Also, if we didn't need a separate render pass for each target image that
+        // would be helpful as well. Using compute passes and passing data in as a raw buffer may
+        // help with both of these but initial attempts with that ran into issues (e.g. when we get
+        // the ability to have non-srgb views of srgb textures that will be useful)):
+        // * create/drop texture
+        // * create/drop texture view
+        // * create/drop bind group
+        // * run render pass (NOTE: if many images are processed at once with the same target
+        //   texture this portion of the cost can be split between them)
+        //
+        // CPU path costs:
+        // * clone image (0.17 ns/pixel (benchmark) - 0.73 ns/pixel (in voxygen))
+        // * run premultiplication (0.305 ns/pixel (when shortcircuits are always hit) -
+        //   3.81 ns/pixel (with random alpha))
+        //
+        // Shared costs include:
+        // * write_texture
+        // * (optional) check for fraction of shortcircuit blocks in image (0.223 ns/pixel)
+        //
+        // `ALWAYS_CPU_THRESHOLD` is roughly:
+        // ("cost of GPU path" + "shortcircuit count cost") / "worst case cost of CPU path per pixel"
+        //
+        // `ALWAYS_GPU_THRESHOLD` is NOT: "cost of GPU path" / "best case cost of CPU path per pixel"
+        // since the cost of checking for whether the CPU path is better at this quantity of pixels
+        // becomes more than the on the amount of overhead we are willing to add to the worst case
+        // scenario where we run the short-circuit count check and end up using the GPU path. The
+        // currently selected value of 200x200 adds at most about ~20% of the cost of the GPU path.
+        // (TODO: maybe we could have the check bail out early if the results aren't looking
+        // favorable for the CPU path and/or sample a random subset of the pixels).
+        //
+        // `CHECKED_THRESHOLD` is roughly: "cost of GPU path / "best case cost of CPU path per pixel"
+        const ALWAYS_CPU_THRESHOLD: usize = 120 * 120;
+        const ALWAYS_GPU_THRESHOLD: usize = 200 * 200;
+        const CHECKED_THRESHOLD: usize = 240 * 240;
+
+        let dims = image.dimensions();
+        let pixel_count = dims.0 as usize * dims.1 as usize;
+        if pixel_count <= ALWAYS_CPU_THRESHOLD {
+            Self::UseCpu
+        } else if pixel_count > ALWAYS_GPU_THRESHOLD {
+            Self::UseGpu
+        } else if let Some(fraction) = fraction_shortcircuit_blocks(image) {
+            // This seems correct...?
+            // TODO: I think we technically can exit the fraction checking early if we know the
+            // total fraction value will be over: (threshold - ALWAYS_CPU_THRESHOLD) /
+            // (CHECKED_THRESHOLD - ALWAYS_CPU_THRESHOLD).
+            let threshold = fraction * CHECKED_THRESHOLD as f32
+                + (1.0 - fraction) * ALWAYS_CPU_THRESHOLD as f32;
+            if pixel_count as f32 <= threshold {
+                Self::UseCpu
+            } else {
+                Self::UseGpu
+            }
+        } else {
+            Self::NotNeeded
+        }
+    }
+}
+
+/// Useful to estimates cost of premultiplying alpha in the provided image via
+/// the CPU method.
+///
+/// Computes the fraction of 4 pixel chunks that are fully translucent or
+/// opaque. Returns `None` if no premultiplication is needed (i.e. all alpha
+/// values are 255).
+fn fraction_shortcircuit_blocks(image: &RgbaImage) -> Option<f32> {
+    let dims = image.dimensions();
+    let pixel_count = dims.0 as usize * dims.1 as usize;
+    let (arrays, end) = (&**image)[..pixel_count * 4].as_chunks::<{ 4 * 4 }>();
+
+    // Rgba8 has 4 bytes per pixel there should be no remainder when dividing by 4.
+    let (end, _) = end.as_chunks::<4>();
+    let end_is_opaque = end.iter().all(|pixel| pixel[3] == 255);
+
+    // 14.6 us for 256x256 image
+    let num_chunks = arrays.len();
+    let mut num_translucent = 0;
+    let mut num_opaque = 0;
+    arrays.iter().for_each(|pixelx4| {
+        let v = u128::from_ne_bytes(*pixelx4);
+        let alpha_mask = 0x000000FF_000000FF_000000FF_000000FF;
+        let masked = v & alpha_mask;
+        if masked == 0 {
+            num_translucent += 1;
+        } else if masked == alpha_mask {
+            num_opaque += 1;
+        }
+    });
+
+    if num_chunks == num_opaque && num_translucent == 0 && end_is_opaque {
+        None
+    } else {
+        Some((num_translucent as f32 + num_opaque as f32) / num_chunks as f32)
+    }
+}
diff --git a/voxygen/src/ui/ice/cache.rs b/voxygen/src/ui/ice/cache.rs
index 984369b49a..316e05ea70 100644
--- a/voxygen/src/ui/ice/cache.rs
+++ b/voxygen/src/ui/ice/cache.rs
@@ -8,6 +8,9 @@ use glyph_brush::GlyphBrushBuilder;
 use std::cell::{RefCell, RefMut};
 use vek::*;
 
+// TODO: probably make cache fields where we have mut getters into just public
+// fields
+
 // Multiplied by current window size
 const GLYPH_CACHE_SIZE: u32 = 1;
 // Glyph cache tolerances

From 7205d4c275a8bb9af1ad50c8f7b686892fd335cc Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Sun, 13 Nov 2022 21:29:50 -0500
Subject: [PATCH 09/14] Remove extra performance debugging code

---
 voxygen/src/render/pipelines/ui.rs    | 4 ----
 voxygen/src/render/renderer/drawer.rs | 8 +-------
 voxygen/src/ui/graphic/mod.rs         | 2 --
 3 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index 062fcd5ea3..7a7f0332a6 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -648,10 +648,6 @@ impl PremultiplyUpload {
             target_size_xy,
         })
     }
-
-    pub fn area_dbg(&self) -> f32 {
-        (self.source_size_xy & 0xFFFF) as f32 * (self.source_size_xy >> 16) as f32
-    }
 }
 
 use std::sync::Arc;
diff --git a/voxygen/src/render/renderer/drawer.rs b/voxygen/src/render/renderer/drawer.rs
index 798ac4f70b..2dc8a86a41 100644
--- a/voxygen/src/render/renderer/drawer.rs
+++ b/voxygen/src/render/renderer/drawer.rs
@@ -445,14 +445,11 @@ impl<'frame> Drawer<'frame> {
 
         let targets = self.borrow.ui_premultiply_uploads.take();
 
-        // TODO: What is the CPU overhead of each renderpass?
         for (i, (target_texture, uploads)) in targets.into_iter().enumerate() {
-            let mut area = 0.0;
             prof_span!("ui premultiply pass");
-            tracing::info!("{} uploads", uploads.len());
             let profile_name = format!("ui_premultiply_pass {}", i);
             let label = format!("ui premultiply pass {}", i);
-            // TODO: a GPU profile scope on each of the passes here may be a bit too fine
+            // S-TODO: a GPU profile scope on each of the passes here may be a bit too fine
             // grained.
             let mut render_pass =
                 encoder.scoped_render_pass(&profile_name, device, &wgpu::RenderPassDescriptor {
@@ -469,15 +466,12 @@ impl<'frame> Drawer<'frame> {
                 });
             render_pass.set_pipeline(&premultiply_alpha.pipeline);
             for upload in &uploads {
-                area += upload.area_dbg();
                 let (source_bind_group, push_constant_data) = upload.draw_data(&target_texture);
                 let bytes = bytemuck::bytes_of(&push_constant_data);
                 render_pass.set_bind_group(0, source_bind_group, &[]);
                 render_pass.set_push_constants(wgpu::ShaderStage::VERTEX, 0, bytes);
                 render_pass.draw(0..6, 0..1);
             }
-            let avg_area = area as f32 / uploads.len() as f32;
-            tracing::info!("avg area sqrt {}", f32::sqrt(avg_area));
         }
     }
 
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 6540f3c461..9bed1fad55 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -858,8 +858,6 @@ impl Default for PremultiplyLookupTable {
 }
 
 fn premultiply_alpha(image: &mut RgbaImage) {
-    prof_span!("premultiply alpha");
-
     lazy_static::lazy_static! {
         static ref LOOKUP: PremultiplyLookupTable = Default::default();
     }

From 9cdfb6a4aadb7ee6e6e7cdabfe887affcd1eeb4b Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Mon, 14 Nov 2022 21:30:28 -0500
Subject: [PATCH 10/14] Resolve some remaining TODOs for ui image processing
 changes

* Explicitly assert that neither of the requested dimensions for an
  image are 0. (I think this used to fail later on anyway)
* Don't show the UI alpha premultiply pass in GPU timings in the HUD
  debug info display since it only very transiently appears (since this
  doesn't run every frame).
---
 voxygen/src/hud/mod.rs                | 12 +++++++-----
 voxygen/src/render/mod.rs             |  2 +-
 voxygen/src/render/renderer/drawer.rs |  9 +++++----
 voxygen/src/ui/graphic/mod.rs         | 21 ++++++++++++++-------
 4 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/voxygen/src/hud/mod.rs b/voxygen/src/hud/mod.rs
index 375a8f0a73..8bced4a057 100644
--- a/voxygen/src/hud/mod.rs
+++ b/voxygen/src/hud/mod.rs
@@ -2769,11 +2769,13 @@ impl Hud {
                 }
 
                 for (i, timing) in gpu_timings.iter().enumerate() {
-                    let timings_text = &format!(
-                        "{:16}{:.3} ms",
-                        &format!("{}:", timing.1),
-                        timing.2 * 1000.0,
-                    );
+                    let label = timing.1;
+                    // We skip displaying these since they aren't present every frame.
+                    if label.starts_with(crate::render::UI_PREMULTIPLY_PASS) {
+                        continue;
+                    }
+                    let timings_text =
+                        &format!("{:16}{:.3} ms", &format!("{label}:"), timing.2 * 1000.0,);
                     let timings_widget = Text::new(timings_text)
                         .color(TEXT_COLOR)
                         .down(V_PAD)
diff --git a/voxygen/src/render/mod.rs b/voxygen/src/render/mod.rs
index c607f633f0..d77cf8806a 100644
--- a/voxygen/src/render/mod.rs
+++ b/voxygen/src/render/mod.rs
@@ -53,7 +53,7 @@ pub use self::{
             DebugDrawer, DebugShadowDrawer, Drawer, FigureDrawer, FigureShadowDrawer,
             FirstPassDrawer, ParticleDrawer, PreparedUiDrawer, ShadowPassDrawer, SpriteDrawer,
             TerrainDrawer, TerrainShadowDrawer, ThirdPassDrawer, TrailDrawer,
-            TransparentPassDrawer, UiDrawer, VolumetricPassDrawer,
+            TransparentPassDrawer, UiDrawer, VolumetricPassDrawer, UI_PREMULTIPLY_PASS,
         },
         AltIndices, ColLightInfo, CullingMode, Renderer,
     },
diff --git a/voxygen/src/render/renderer/drawer.rs b/voxygen/src/render/renderer/drawer.rs
index 2dc8a86a41..c350e99235 100644
--- a/voxygen/src/render/renderer/drawer.rs
+++ b/voxygen/src/render/renderer/drawer.rs
@@ -20,6 +20,9 @@ use wgpu_profiler::scope::{ManualOwningScope, OwningScope, Scope};
 #[cfg(feature = "egui-ui")]
 use {common_base::span, egui_wgpu_backend::ScreenDescriptor, egui_winit_platform::Platform};
 
+/// Gpu timing label prefix associated with the UI alpha premultiplication pass.
+pub const UI_PREMULTIPLY_PASS: &str = "ui_premultiply_pass";
+
 // Currently available pipelines
 enum Pipelines<'frame> {
     Interface(&'frame super::InterfacePipelines),
@@ -447,10 +450,8 @@ impl<'frame> Drawer<'frame> {
 
         for (i, (target_texture, uploads)) in targets.into_iter().enumerate() {
             prof_span!("ui premultiply pass");
-            let profile_name = format!("ui_premultiply_pass {}", i);
-            let label = format!("ui premultiply pass {}", i);
-            // S-TODO: a GPU profile scope on each of the passes here may be a bit too fine
-            // grained.
+            let profile_name = format!("{UI_PREMULTIPLY_PASS} {i}");
+            let label = format!("ui premultiply pass {i}");
             let mut render_pass =
                 encoder.scoped_render_pass(&profile_name, device, &wgpu::RenderPassDescriptor {
                     label: Some(&label),
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 9bed1fad55..53fd4b48f7 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -214,8 +214,6 @@ impl TextureRequirements {
         }
     }
 
-    // TODO: what if requested size is 0? Do we currently panic on this case and
-    // expect caller not to ask for 0 size? (if so document that)
     fn to_key_and_tex_parameters(
         self,
         graphic_id: Id,
@@ -387,6 +385,10 @@ impl GraphicCache {
 
     /// Source rectangle should be from 0 to 1, and represents a bounding box
     /// for the source image of the graphic.
+    ///
+    /// # Panics
+    ///
+    /// Panics if one of the lengths in requested_dims is zero.
     pub fn cache_res(
         &mut self,
         renderer: &mut Renderer,
@@ -399,6 +401,7 @@ impl GraphicCache {
         source: Aabr<f64>,
         rotation: Rotation,
     ) -> Option<((Aabr<f64>, Vec2<f32>), TexId)> {
+        assert!(requested_dims.map(|e| e != 0).reduce_and());
         let requested_dims_upright = match rotation {
             // The image is stored on the GPU with no rotation, so we need to swap the dimensions
             // here to get the resolution that the image will be displayed at but re-oriented into
@@ -433,15 +436,13 @@ impl GraphicCache {
         };
         // Apply all transformations.
         // TODO: Verify rotation is being applied correctly.
-        let transformed_aabr = |aabr| {
+        let transformed_aabr_and_scale = |aabr| {
             let scaled = scaled_aabr(aabr);
             // Calculate how many displayed pixels there are for each pixel in the source
             // image. We need this to calculate where to sample in the shader to
             // retain crisp pixel borders when scaling the image.
-            // S-TODO: A bit hacky inserting this here, just to get things working initially
             let scale = requested_dims_upright.map2(
                 Vec2::from(scaled.size()),
-                // S-TODO div by zero potential? If so, is NaN an issue in that case?
                 |screen_pixels, sample_pixels: f64| screen_pixels as f32 / sample_pixels as f32,
             );
             let transformed = rotated_aabr(scaled);
@@ -500,7 +501,10 @@ impl GraphicCache {
                     details.set_valid();
                 }
 
-                return Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)));
+                return Some((
+                    transformed_aabr_and_scale(aabr.map(|e| e as f64)),
+                    TexId(idx),
+                ));
             },
             Entry::Vacant(details) => details,
         };
@@ -592,7 +596,10 @@ impl GraphicCache {
         // Insert into cached map
         details.insert(location);
 
-        Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)))
+        Some((
+            transformed_aabr_and_scale(aabr.map(|e| e as f64)),
+            TexId(idx),
+        ))
     }
 }
 

From 07d3242b2619c576aeecd588fa29828983f07367 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Mon, 14 Nov 2022 22:18:38 -0500
Subject: [PATCH 11/14] Update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ccc82d68a..608f9b1cda 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,6 +31,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Climbing no longer requires having 10 energy
 - Castles will now be placed close to towns
 - Sword
+- Rescaling of images for the UI is now done when sampling from them on the GPU. Improvements are
+  particularily noticeable when opening the map screen (which involves rescaling a few large
+  images) and also when using the voxel minimap view (where a medium size image is updated often). 
 
 ### Removed
 

From 132ce72246bc204cb72997750472a812b71e0976 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Mon, 14 Nov 2022 22:21:45 -0500
Subject: [PATCH 12/14] Fix some comments that were mis-formatted by rustfmt

and remove TODO that is now documented as a review comment
---
 voxygen/src/render/pipelines/ui.rs | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index 7a7f0332a6..986bd534c9 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -416,21 +416,21 @@ pub fn create_tri(
 // 1. Upload new image via `Device::create_texture_with_data`.
 //
 //    (NOTE: Initially considered: Creating a storage buffer to read from in the
-// shader via    `Device::create_buffer_init`, with `MAP_WRITE` flag to avoid
-// staging buffer. However, with    dedicated GPUs combining usages other than
-// `COPY_SRC` with `MAP_WRITE` may be less ideal.    Plus, by copying into a
-// texture first we can get free srgb conversion when fetching colors
-//    from the texture. In the future, we may want to branch based on the
-// whether the GPU is    integrated and avoid this extra copy.)
+//    shader via `Device::create_buffer_init`, with `MAP_WRITE` flag to avoid
+//    staging buffer. However, with GPUs combining usages other than `COPY_SRC`
+//    with `MAP_WRITE` may be less ideal. Plus, by copying into a texture first
+//    we can get free srgb conversion when fetching colors from the texture. In
+//    the future, we may want to branch based on the whether the GPU is
+//    integrated and avoid this extra copy.)
 //
 // 2. Run render pipeline to multiply by alpha reading from this texture and
-// writing to the final    texture (this can either be in an atlas or in an
-// independent texture if the image is over a    certain size threshold).
+//    writing to the final texture (this can either be in an atlas or in an
+//    independent texture if the image is over a certain size threshold).
 //
 //    (NOTE: Initially considered: using a compute pipeline and writing to the
-// final texture as a    storage texture. However, the srgb format can't be used
-// with storage texture and there is not    yet the capability to create
-// non-srgb views of srgb textures.)
+//     final texture as a storage texture. However, the srgb format can't be
+//     used with storage texture and there is not yet the capability to create
+//     non-srgb views of srgb textures.)
 //
 // Info needed:
 //

From 2a9a63a60ed0f8c1be3eaf289dfb016c70444586 Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Mon, 14 Nov 2022 22:43:21 -0500
Subject: [PATCH 13/14] Clippy fixes

---
 assets/voxygen/shaders/ui-frag.glsl   |  4 ----
 voxygen/src/render/pipelines/ui.rs    |  2 --
 voxygen/src/render/renderer/drawer.rs |  2 +-
 voxygen/src/ui/graphic/mod.rs         | 10 ++++++----
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/assets/voxygen/shaders/ui-frag.glsl b/assets/voxygen/shaders/ui-frag.glsl
index ee04285033..a881b2b542 100644
--- a/assets/voxygen/shaders/ui-frag.glsl
+++ b/assets/voxygen/shaders/ui-frag.glsl
@@ -180,10 +180,6 @@ void main() {
         // worry about bleeding in the atlas and/or what the border behavior
         // should be.
 
-        // TODO: benchmark before and after by viewing zoomed out map in the UI
-        // (at particular ui scale and map zoom out, far enough to trigger downscaling)
-        // (seems like ~10% increase if any)
-        
         // Convert to sampled pixel coordinates.
         vec2 uv_pixel = f_uv * texture_size;
         vec4 image_color;
diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index 986bd534c9..ce3e4ef8f4 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -4,8 +4,6 @@ use core::num::NonZeroU32;
 use std::mem;
 use vek::*;
 
-// TODO: profile UI rendering before and after on laptop.
-
 /// The format of textures that the UI sources image data from.
 ///
 /// Note, the is not directly used in all relevant locations, but still helps to
diff --git a/voxygen/src/render/renderer/drawer.rs b/voxygen/src/render/renderer/drawer.rs
index c350e99235..181e52756c 100644
--- a/voxygen/src/render/renderer/drawer.rs
+++ b/voxygen/src/render/renderer/drawer.rs
@@ -440,7 +440,7 @@ impl<'frame> Drawer<'frame> {
 
     /// Runs render passes with alpha premultiplication pipeline to complete any
     /// pending uploads.
-    fn run_ui_premultiply_passes<'a>(&mut self) {
+    fn run_ui_premultiply_passes(&mut self) {
         prof_span!("run_ui_premultiply_passes");
         let Some(premultiply_alpha) = self.borrow.pipelines.premultiply_alpha() else { return };
         let encoder = self.encoder.as_mut().unwrap();
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index 53fd4b48f7..ac2276a45a 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -214,6 +214,7 @@ impl TextureRequirements {
         }
     }
 
+    #[allow(clippy::wrong_self_convention)] // type is spiritually Copy
     fn to_key_and_tex_parameters(
         self,
         graphic_id: Id,
@@ -304,7 +305,7 @@ impl GraphicCache {
         };
 
         let old_requirements = TextureRequirements::from_graphic(&old);
-        let new_requirements = TextureRequirements::from_graphic(&new);
+        let new_requirements = TextureRequirements::from_graphic(new);
         let should_invalidate = old_requirements == new_requirements && old_requirements.is_some();
 
         // Invalidate if possible or remove from caches.
@@ -468,7 +469,7 @@ impl GraphicCache {
             },
         };
 
-        let requirements = TextureRequirements::from_graphic(&graphic)?;
+        let requirements = TextureRequirements::from_graphic(graphic)?;
         let (key, texture_parameters) =
             requirements.to_key_and_tex_parameters(graphic_id, requested_dims_upright);
 
@@ -642,7 +643,7 @@ fn prepare_graphic<'graphic>(
             // expensive enough to do in the background we would just do it on
             // the GPU. Could still use `rayon` to parallelize this work, if
             // needed.
-            let premultiply_strategy = PremultiplyStrategy::determine(&*rgba_cow);
+            let premultiply_strategy = PremultiplyStrategy::determine(&rgba_cow);
             let needs_gpu_premultiply = match premultiply_strategy {
                 PremultiplyStrategy::UseGpu => true,
                 PremultiplyStrategy::NotNeeded => false,
@@ -767,7 +768,7 @@ fn upload_image(
         let size = aabr.size().into_array();
         // upload directly
         renderer.update_texture(
-            &*target_texture,
+            target_texture,
             offset,
             size,
             // NOTE: Rgba texture, so each pixel is 4 bytes, ergo this cannot fail.
@@ -1044,6 +1045,7 @@ impl PremultiplyStrategy {
 /// Computes the fraction of 4 pixel chunks that are fully translucent or
 /// opaque. Returns `None` if no premultiplication is needed (i.e. all alpha
 /// values are 255).
+#[allow(clippy::unusual_byte_groupings)]
 fn fraction_shortcircuit_blocks(image: &RgbaImage) -> Option<f32> {
     let dims = image.dimensions();
     let pixel_count = dims.0 as usize * dims.1 as usize;

From 5881e44e61a549fd6ca27bae3c3a48d22e4b2b9c Mon Sep 17 00:00:00 2001
From: Imbris <imbrisf@gmail.com>
Date: Sat, 8 Apr 2023 02:19:14 -0400
Subject: [PATCH 14/14] Address review on 3573

---
 assets/voxygen/shaders/include/srgb.glsl | 10 ----------
 assets/voxygen/shaders/ui-frag.glsl      |  3 +--
 voxygen/src/render/pipelines/ui.rs       |  3 ++-
 voxygen/src/ui/graphic/mod.rs            |  6 ++++--
 4 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/assets/voxygen/shaders/include/srgb.glsl b/assets/voxygen/shaders/include/srgb.glsl
index 6f2fbca6b9..b1abd93811 100644
--- a/assets/voxygen/shaders/include/srgb.glsl
+++ b/assets/voxygen/shaders/include/srgb.glsl
@@ -43,16 +43,6 @@ vec3 linear_to_srgb(vec3 col) {
     );
 }
 
-vec4 srgba8_to_linear(uint srgba8) {
-    vec4 nonlinear = vec4(uvec4(
-        (srgba8 >> 24) & 0xFFu,
-        (srgba8 >> 16) & 0xFFu,
-        (srgba8 >>  8) & 0xFFu,
-         srgba8        & 0xFFu
-    )) / 255.0;
-    return vec4(srgb_to_linear(nonlinear.rgb), nonlinear.a);
-}
-
 float pow5(float x) {
     float x2 = x * x;
     return x2 * x2 * x;
diff --git a/assets/voxygen/shaders/ui-frag.glsl b/assets/voxygen/shaders/ui-frag.glsl
index a881b2b542..ce2f5d1d02 100644
--- a/assets/voxygen/shaders/ui-frag.glsl
+++ b/assets/voxygen/shaders/ui-frag.glsl
@@ -13,7 +13,6 @@ uniform u_locals {
     vec4 w_pos;
 };
 
-// TODO: swap with u_locals because that may change more frequently?
 layout(set = 2, binding = 0)
 uniform texture2D t_tex;
 layout(set = 2, binding = 1)
@@ -71,7 +70,7 @@ void downscale_params(float pos, float scale, out vec2 weights, out vec2 offsets
     // 4 pixels (within a single dimension) in the sampled texture. So we can't
     // perfectly compute the contribution of each covered pixel in the sampled
     // texture with only 2 samples (along each dimension). Thus, we fallback to
-    // an imperfect technique of just sampling a 1 pixel length from the center
+    // an imperfect technique of just sampling 1 pixel length from the center
     // on each side of the nearest pixel edge. An alternative might be to
     // pre-compute mipmap levels that could be sampled from, although this
     // could interact poorly with the atlas.
diff --git a/voxygen/src/render/pipelines/ui.rs b/voxygen/src/render/pipelines/ui.rs
index ce3e4ef8f4..da00de4555 100644
--- a/voxygen/src/render/pipelines/ui.rs
+++ b/voxygen/src/render/pipelines/ui.rs
@@ -570,7 +570,8 @@ impl PremultiplyUpload {
         // TODO: duplicating some code from `Texture` since:
         // 1. We don't need to create a sampler.
         // 2. Texture::new accepts &DynamicImage which isn't possible to create from
-        //    &RgbaImage without cloning.
+        //    &RgbaImage without cloning. (this might be addressed on zoomy worldgen
+        //    branch)
         let image_size = wgpu::Extent3d {
             width: image.width(),
             height: image.height(),
diff --git a/voxygen/src/ui/graphic/mod.rs b/voxygen/src/ui/graphic/mod.rs
index ac2276a45a..5d7b35bac6 100644
--- a/voxygen/src/ui/graphic/mod.rs
+++ b/voxygen/src/ui/graphic/mod.rs
@@ -325,11 +325,13 @@ impl GraphicCache {
             self.cache_map.drain_filter(|key, details| {
                 if key.graphic_id == id {
                     match details {
-                        // TODO: if replace_graphic is used continously for small images (i.e.
+                        // NOTE: if replace_graphic is used continously for small images (i.e.
                         // images placed into an atlas) of different sizes, that can use up our
                         // atlas space since spots in the atlas can't be reused. (this scenario is
                         // now possible with scaling being done during sampling rather than placing
-                        // resized version into the atlas)
+                        // resized version into the atlas). This is expected to not occur in all
+                        // pratical cases we plan to support here (i.e. the size of the replacement
+                        // image will always be the same).
                         CachedDetails::Atlas { .. } => {},
                         CachedDetails::Texture { index, .. } => {
                             self.textures.remove(*index);