Complete GPU-based alpha premultiplication impl and make the CPU version
even faster.

* The GPU-based version was started in the previous commit; this fixes
  the remaining errors and bugs and gets it actually compiling and
  running.
* Add a way to batch images together so that GPU premultiplication uses
  a single render pass when they all target the same texture.
* Pending premultiplication uploads are automatically done when calling
  `Drawer::third_pass`.
* `fast-srgb8` dep removed; we no longer convert to `f32`s to do the
  premultiplication. Two `[u16; 256]` tables are combined to compute the
  alpha-premultiplied color within the same error bounds used by the
  `fast-srgb8` crate (a reference sketch of the exact computation being
  approximated follows below). We also no longer use explicit SIMD.
* Remove explicit lifetimes from `PlayState::render` since `&self` and
  `Drawer<'_>` don't need to have the same lifetime.
* Fix existing bug where invalidated cache entries were never set to
  valid when reusing them.
* `prepare_graphic` now runs some heuristics to determine whether
  premultiplication should be executed CPU-side or GPU-side, and then
  returns a bool indicating whether GPU premultiplication is needed.
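
For reference, here is a minimal sketch (not part of the diff; assumes
the standard sRGB transfer functions) of the exact per-channel
computation the two lookup tables approximate:

    fn premultiply_reference(color: u8, alpha: u8) -> u8 {
        fn srgb_to_linear(c: f32) -> f32 {
            if c <= 0.04045 { c / 12.92 } else { ((c + 0.055) / 1.055).powf(2.4) }
        }
        fn linear_to_srgb(c: f32) -> f32 {
            if c <= 0.0031308 { c * 12.92 } else { 1.055 * c.powf(1.0 / 2.4) - 0.055 }
        }
        // Decode to linear, multiply by alpha, re-encode to sRGB.
        let linear = srgb_to_linear(color as f32 / 255.0) * (alpha as f32 / 255.0);
        (linear_to_srgb(linear) * 255.0 + 0.5) as u8
    }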
Imbris 2022-11-13 18:52:13 -05:00
parent efd932c71e
commit 63096b2042
18 changed files with 505 additions and 177 deletions

Cargo.lock (generated)
View File

@@ -1959,12 +1959,6 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fast-srgb8"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd2e7510819d6fbf51a5545c8f922716ecfb14df168a3242f7d33e0239efe6a1"
[[package]]
name = "fastrand"
version = "1.8.0"
@@ -7014,7 +7008,6 @@ dependencies = [
"enum-iterator 1.1.3",
"etagere",
"euc",
"fast-srgb8",
"gilrs",
"glyph_brush",
"guillotiere",

View File

@@ -44,7 +44,7 @@ vec3 linear_to_srgb(vec3 col) {
}
vec4 srgba8_to_linear(uint srgba8) {
uvec4 nonlinear = vec4(uvec4(
vec4 nonlinear = vec4(uvec4(
(srgba8 >> 24) & 0xFFu,
(srgba8 >> 16) & 0xFFu,
(srgba8 >> 8) & 0xFFu,

View File

@@ -1,4 +1,5 @@
#version 420 core
#extension GL_EXT_samplerless_texture_functions : enable
layout(set = 0, binding = 0)
uniform texture2D source_texture;

View File

@@ -15,14 +15,14 @@ layout(location = 0) out vec2 source_coords;
uvec2 unpack(uint xy) {
return uvec2(
bitfieldExtract(xy, 0, 16),
bitfieldExtract(xy, 16, 16),
bitfieldExtract(xy, 0, 16),
bitfieldExtract(xy, 16, 16)
);
}
void main() {
vec2 source_size = vec2(unpack(source_size_xy));
vec2 target_offset = vec2(unpack(target_offset_size_xy));
vec2 target_offset = vec2(unpack(target_offset_xy));
vec2 target_size = vec2(unpack(target_size_xy));
// Generate rectangle (counter clockwise triangles)
@@ -36,10 +36,10 @@ void main() {
// left -> right (on screen)
mix(0.0, 1.0, x_select),
// bottom -> top (on screen)
mix(1.0, 0.0, y_select),
);
mix(1.0, 0.0, y_select)
) * source_size;
vec2 target_coords_normalized = (target_offset + source_coords * source_size) / target_size;
vec2 target_coords_normalized = (target_offset + source_coords) / target_size;
// Flip y and transform [0.0, 1.0] -> [-1.0, 1.0] to get NDC coordinates.
vec2 v_pos = ((target_coords_normalized * 2.0) - vec2(1.0)) * vec2(1.0, -1.0);

View File

@@ -134,7 +134,6 @@ num_cpus = "1.0"
# vec_map = { version = "0.8.2" }
inline_tweak = "1.0.2"
itertools = "0.10.0"
fast-srgb8 = "1.0.0"
# Tracy
tracing = "0.1"

View File

@@ -5,7 +5,6 @@
#![feature(
array_methods,
array_zip,
array_from_fn,
drain_filter,
once_cell,
trait_alias,
@@ -13,7 +12,6 @@
map_try_insert,
slice_as_chunks,
let_chains
portable_simd
)]
#![recursion_limit = "2048"]
@@ -157,7 +155,7 @@ pub trait PlayState {
fn globals_bind_group(&self) -> &GlobalsBindGroup;
/// Draw the play state.
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, settings: &Settings);
fn render(&self, drawer: &mut Drawer<'_>, settings: &Settings);
/// Determines whether egui will be rendered for this play state
fn egui_enabled(&self) -> bool;

View File

@@ -275,7 +275,7 @@ impl PlayState for CharSelectionState {
fn globals_bind_group(&self) -> &GlobalsBindGroup { self.scene.global_bind_group() }
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, _: &Settings) {
fn render(&self, drawer: &mut Drawer<'_>, _: &Settings) {
let client = self.client.borrow();
let (humanoid_body, loadout) =
Self::get_humanoid_body_inventory(&self.char_selection_ui, &client);

View File

@@ -394,7 +394,7 @@ impl PlayState for MainMenuState {
fn globals_bind_group(&self) -> &GlobalsBindGroup { self.scene.global_bind_group() }
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, _: &Settings) {
fn render(&self, drawer: &mut Drawer<'_>, _: &Settings) {
// Draw the UI to the screen.
let mut third_pass = drawer.third_pass();
if let Some(mut ui_drawer) = third_pass.draw_ui() {

View File

@@ -43,7 +43,7 @@ pub use self::{
create_quad as create_ui_quad,
create_quad_vert_gradient as create_ui_quad_vert_gradient, create_tri as create_ui_tri,
BoundLocals as UiBoundLocals, Locals as UiLocals, Mode as UiMode,
PremultiplyUpload as UiPremultiplyUpload, TextureBindGroup as UiTextureBindGroup,
TextureBindGroup as UiTextureBindGroup, UploadBatchId as UiUploadBatchId,
Vertex as UiVertex,
},
GlobalModel, Globals, GlobalsBindGroup, GlobalsLayouts, Light, Shadow,

View File

@@ -552,7 +552,7 @@ pub struct PremultiplyAlphaParams {
///
/// From here we will use the `PremultiplyAlpha` pipeline to premultiply the
/// alpha while transferring the image to its destination texture.
pub struct PremultiplyUpload {
pub(in super::super) struct PremultiplyUpload {
source_bg: wgpu::BindGroup,
source_size_xy: u32,
/// The location in the final texture this will be placed at. Technically,
@@ -562,7 +562,7 @@ pub struct PremultiplyUpload {
}
impl PremultiplyUpload {
pub fn prepare(
pub(in super::super) fn prepare(
device: &wgpu::Device,
queue: &wgpu::Queue,
layout: &PremultiplyAlphaLayout,
@@ -593,7 +593,7 @@ impl PremultiplyUpload {
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
},
&(&**image)[..(image.width() as usize * image.height() as usize)],
&(&**image)[..(image.width() as usize * image.height() as usize * 4)],
wgpu::ImageDataLayout {
offset: 0,
bytes_per_row: NonZeroU32::new(image.width() * 4),
@@ -622,7 +622,7 @@ impl PremultiplyUpload {
});
// NOTE: We assume the max texture size is less than u16::MAX.
let source_size_xy = image_size.width + image_size.height << 16;
let source_size_xy = image_size.width + (image_size.height << 16);
Self {
source_bg,
@@ -634,15 +634,74 @@ impl PremultiplyUpload {
/// Semantically, this consumes the `PremultiplyUpload` but we need to keep
/// the bind group alive to the end of the render pass and don't want to
/// bother storing it somewhere else.
pub fn draw_data(&self, target: &Texture) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
let target_offset_xy = u32::from(self.offset.x) + u32::from(self.offset.y) << 16;
pub(in super::super) fn draw_data(
&self,
target: &Texture,
) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
let target_offset_xy = u32::from(self.offset.x) + (u32::from(self.offset.y) << 16);
let target_dims = target.get_dimensions();
// NOTE: We assume the max texture size is less than u16::MAX.
let target_size_xy = target_dims.x + target_dims.y << 16;
let target_size_xy = target_dims.x + (target_dims.y << 16);
(&self.source_bg, PremultiplyAlphaParams {
source_size_xy: self.source_size_xy,
target_offset_xy,
target_size_xy,
})
}
pub fn area_dbg(&self) -> f32 {
(self.source_size_xy & 0xFFFF) as f32 * (self.source_size_xy >> 16) as f32
}
}
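// Packing sketch (illustrative, not from the commit): each u32 push-constant
// word packs two u16 values, x in the low half and y in the high half; the
// vertex shader's bitfieldExtract(xy, 0, 16) / bitfieldExtract(xy, 16, 16)
// reverses this. E.g. for a 640x480 source:
//
//     let packed = 640u32 + (480u32 << 16); // == 0x01E0_0280
//     assert_eq!(packed & 0xFFFF, 640);     // x in the low half
//     assert_eq!(packed >> 16, 480);        // y in the high half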
use std::sync::Arc;
/// Per-target texture batched uploads
#[derive(Default)]
pub(in super::super) struct BatchedUploads {
batches: Vec<(Arc<Texture>, Vec<PremultiplyUpload>)>,
}
#[derive(Default, Clone, Copy)]
pub struct UploadBatchId(usize);
impl BatchedUploads {
/// Adds the provided upload to the batch indicated by the provided target
/// texture and optional batch id. A new batch will be created if the batch
/// id is invalid (doesn't refer to an existing batch) or the provided
/// target texture isn't the same as the one associated with the
/// provided batch id. Creating a new batch involves cloning the
/// provided texture `Arc`.
///
/// The id of the batch where the upload is ultimately submitted will be
/// returned. This id can be used in subsequent calls to add items to
/// the same batch (i.e. uploads for the same texture).
///
/// Batch ids are reset every frame; however, since we check that the
/// texture matches, it is perfectly fine to use a stale id (just keep
/// in mind that this will create a new batch). This also means that it is
/// sufficient to use `UploadBatchId::default()` when calling this with
/// new textures.
pub(in super::super) fn submit(
&mut self,
target_texture: &Arc<Texture>,
batch_id: UploadBatchId,
upload: PremultiplyUpload,
) -> UploadBatchId {
if let Some(batch) = self
.batches
.get_mut(batch_id.0)
.filter(|b| Arc::ptr_eq(&b.0, target_texture))
{
batch.1.push(upload);
batch_id
} else {
let new_batch_id = UploadBatchId(self.batches.len());
self.batches
.push((Arc::clone(target_texture), vec![upload]));
new_batch_id
}
}
pub(in super::super) fn take(&mut self) -> Vec<(Arc<Texture>, Vec<PremultiplyUpload>)> {
core::mem::take(&mut self.batches)
}
}
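// Illustrative use of the batching API (hypothetical caller; `batched_uploads`
// is a `BatchedUploads`, `texture` the target `Arc<Texture>`, and `upload_a`/
// `upload_b` already-prepared `PremultiplyUpload`s):
//
//     let mut batch_id = UploadBatchId::default();
//     // Default/stale id: a new batch is created for `texture`.
//     batch_id = batched_uploads.submit(&texture, batch_id, upload_a);
//     // Fresh id and same texture: pushed onto the same batch, so both
//     // uploads share one render pass when `take()` drains the batches.
//     batch_id = batched_uploads.submit(&texture, batch_id, upload_b);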

View File

@@ -178,6 +178,8 @@ pub struct Renderer {
profile_times: Vec<wgpu_profiler::GpuTimerScopeResult>,
profiler_features_enabled: bool,
ui_premultiply_uploads: ui::BatchedUploads,
#[cfg(feature = "egui-ui")]
egui_renderpass: egui_wgpu_backend::RenderPass,
@@ -545,6 +547,8 @@ impl Renderer {
profile_times: Vec::new(),
profiler_features_enabled,
ui_premultiply_uploads: Default::default(),
#[cfg(feature = "egui-ui")]
egui_renderpass,
@@ -1437,18 +1441,23 @@ impl Renderer {
texture.update(&self.queue, offset, size, bytemuck::cast_slice(data))
}
pub fn prepare_premultiply_upload(
&self,
/// See docs on [`ui::BatchedUploads::submit`].
pub fn ui_premultiply_upload(
&mut self,
target_texture: &Arc<Texture>,
batch: ui::UploadBatchId,
image: &image::RgbaImage,
offset: Vec2<u16>,
) -> ui::PremultiplyUpload {
ui::PremultiplyUpload::prepare(
) -> ui::UploadBatchId {
let upload = ui::PremultiplyUpload::prepare(
&self.device,
&self.queue,
&self.layouts.premultiply_alpha,
image,
offset,
)
);
self.ui_premultiply_uploads
.submit(target_texture, batch, upload)
}
/// Queue to obtain a screenshot on the next frame render

View File

@@ -37,6 +37,14 @@ impl<'frame> Pipelines<'frame> {
}
}
fn premultiply_alpha(&self) -> Option<&ui::PremultiplyAlphaPipeline> {
match self {
Pipelines::Interface(pipelines) => Some(&pipelines.premultiply_alpha),
Pipelines::All(pipelines) => Some(&pipelines.premultiply_alpha),
Pipelines::None => None,
}
}
fn blit(&self) -> Option<&blit::BlitPipeline> {
match self {
Pipelines::Interface(pipelines) => Some(&pipelines.blit),
@@ -67,6 +75,7 @@ struct RendererBorrow<'frame> {
pipeline_modes: &'frame super::PipelineModes,
quad_index_buffer_u16: &'frame Buffer<u16>,
quad_index_buffer_u32: &'frame Buffer<u32>,
ui_premultiply_uploads: &'frame mut ui::BatchedUploads,
#[cfg(feature = "egui-ui")]
egui_render_pass: &'frame mut egui_wgpu_backend::RenderPass,
}
@@ -118,6 +127,7 @@ impl<'frame> Drawer<'frame> {
pipeline_modes: &renderer.pipeline_modes,
quad_index_buffer_u16: &renderer.quad_index_buffer_u16,
quad_index_buffer_u32: &renderer.quad_index_buffer_u32,
ui_premultiply_uploads: &mut renderer.ui_premultiply_uploads,
#[cfg(feature = "egui-ui")]
egui_render_pass: &mut renderer.egui_renderpass,
};
@@ -425,15 +435,19 @@ impl<'frame> Drawer<'frame> {
});
}
pub fn run_ui_premultiply_passes<'a>(
&mut self,
targets: impl Iterator<Item = (&'a super::super::Texture, Vec<ui::PremultiplyUpload>)>,
) {
/// Runs render passes with the alpha premultiplication pipeline to complete
/// any pending uploads.
fn run_ui_premultiply_passes<'a>(&mut self) {
prof_span!("run_ui_premultiply_passes");
let Some(premultiply_alpha) = self.borrow.pipelines.premultiply_alpha() else { return };
let encoder = self.encoder.as_mut().unwrap();
let device = self.borrow.device;
let targets = self.borrow.ui_premultiply_uploads.take();
// TODO: What is the CPU overhead of each renderpass?
for (i, (target_texture, uploads)) in targets.enumerate() {
for (i, (target_texture, uploads)) in targets.into_iter().enumerate() {
let mut area = 0.0;
prof_span!("ui premultiply pass");
tracing::info!("{} uploads", uploads.len());
let profile_name = format!("ui_premultiply_pass {}", i);
@@ -447,23 +461,31 @@ impl<'frame> Drawer<'frame> {
view: &target_texture.view,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Clear(wgpu::Color::TRANSPARENT),
load: wgpu::LoadOp::Load,
store: true,
},
}],
depth_stencil_attachment: None,
});
render_pass.set_pipeline(&premultiply_alpha.pipeline);
for upload in &uploads {
let (source_bind_group, push_constant_data) = upload.draw_data(target_texture);
area += upload.area_dbg();
let (source_bind_group, push_constant_data) = upload.draw_data(&target_texture);
let bytes = bytemuck::bytes_of(&push_constant_data);
render_pass.set_bind_group(0, source_bind_group, &[]);
render_pass.set_push_constants(wgpu::ShaderStage::VERTEX, 0, bytes);
render_pass.draw_indexed(0..6, 0, 0..1);
render_pass.draw(0..6, 0..1);
}
let avg_area = area as f32 / uploads.len() as f32;
tracing::info!("avg area sqrt {}", f32::sqrt(avg_area));
}
}
/// Note: this automatically calls the internal `run_ui_premultiply_passes`
/// to complete any pending image uploads for the UI.
pub fn third_pass(&mut self) -> ThirdPassDrawer {
self.run_ui_premultiply_passes();
let encoder = self.encoder.as_mut().unwrap();
let device = self.borrow.device;
let mut render_pass =
@@ -537,7 +559,7 @@ impl<'frame> Drawer<'frame> {
/// Does nothing if the shadow pipelines are not available or shadow map
/// rendering is disabled
pub fn draw_point_shadows<'data: 'frame>(
pub fn draw_point_shadows<'data>(
&mut self,
matrices: &[shadow::PointLightMatrix; 126],
chunks: impl Clone

View File

@@ -74,7 +74,7 @@ impl assets::Compound for Shaders {
"ui-vert",
"ui-frag",
"premultiply-alpha-vert",
"premultiply_alpha-frag",
"premultiply-alpha-frag",
"lod-terrain-vert",
"lod-terrain-frag",
"clouds-vert",

View File

@@ -1232,9 +1232,9 @@ impl Scene {
pub fn global_bind_group(&self) -> &GlobalsBindGroup { &self.globals_bind_group }
/// Render the scene using the provided `Drawer`.
pub fn render<'a>(
&'a self,
drawer: &mut Drawer<'a>,
pub fn render(
&self,
drawer: &mut Drawer<'_>,
state: &State,
viewpoint_entity: EcsEntity,
tick: u64,

View File

@@ -1896,7 +1896,7 @@ impl PlayState for SessionState {
/// Render the session to the screen.
///
/// This method should be called once per frame.
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, settings: &Settings) {
fn render(&self, drawer: &mut Drawer<'_>, settings: &Settings) {
span!(_guard, "render", "<Session as PlayState>::render");
let client = self.client.borrow();

View File

@@ -7,6 +7,9 @@ use conrod_core::{text::GlyphCache, widget::Id};
use hashbrown::HashMap;
use vek::*;
// TODO: probably make cache fields that have mut getters into just public
// fields
// Multiplied by current window size
const GLYPH_CACHE_SIZE: u32 = 1;
// Glyph cache tolerances

View File

@@ -4,10 +4,11 @@ pub mod renderer;
pub use renderer::{SampleStrat, Transform};
use crate::{
render::{Renderer, Texture, UiPremultiplyUpload, UiTextureBindGroup},
render::{Renderer, Texture, UiTextureBindGroup, UiUploadBatchId},
ui::KeyedJobs,
};
use common::{figure::Segment, slowjob::SlowJobPool};
use common_base::prof_span;
use guillotiere::{size2, SimpleAtlasAllocator};
use hashbrown::{hash_map::Entry, HashMap};
use image::{DynamicImage, RgbaImage};
@@ -86,7 +87,7 @@ impl CachedDetails {
fn info(
&self,
atlases: &[(SimpleAtlasAllocator, usize)],
textures: &Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
textures: &Slab<(Arc<Texture>, UiTextureBindGroup, UiUploadBatchId)>,
) -> (usize, bool, Aabr<u16>) {
match *self {
CachedDetails::Atlas {
@@ -119,6 +120,17 @@ impl CachedDetails {
},
}
}
fn set_valid(&mut self) {
match self {
Self::Atlas { ref mut valid, .. } => {
*valid = true;
},
Self::Texture { ref mut valid, .. } => {
*valid = true;
},
}
}
}
/// Requirements that a particular graphic has with respect to the atlas
@@ -241,7 +253,7 @@ pub struct GraphicCache {
/// for this frame. The purpose of this is to collect all the operations
/// together so that a single renderpass is performed for each target
/// texture.
textures: Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
textures: Slab<(Arc<Texture>, UiTextureBindGroup, UiUploadBatchId)>,
/// The location and details of graphics cached on the GPU.
///
/// Graphic::Voxel images include the dimensions they were rasterized at in
@@ -257,7 +269,7 @@ impl GraphicCache {
let (atlas, (tex, bind)) = create_atlas_texture(renderer);
let mut textures = Slab::new();
let tex_id = textures.insert((tex, bind, Vec::new()));
let tex_id = textures.insert((tex, bind, UiUploadBatchId::default()));
Self {
graphic_map: HashMap::default(),
@@ -336,7 +348,7 @@ impl GraphicCache {
/// Used to acquire textures for rendering
pub fn get_tex(&self, id: TexId) -> (&Texture, &UiTextureBindGroup) {
let (tex, bind, _uploads) = self.textures.get(id.0).expect("Invalid TexId used");
let (tex, bind, _upload_batch) = self.textures.get(id.0).expect("Invalid TexId used");
(tex, bind)
}
@@ -368,18 +380,13 @@ impl GraphicCache {
let (atlas, (tex, bind)) = create_atlas_texture(renderer);
let mut textures = Slab::new();
let tex_id = textures.insert((tex, bind, Vec::new()));
let tex_id = textures.insert((tex, bind, UiUploadBatchId::default()));
self.atlases = vec![(atlas, tex_id)];
self.textures = textures;
}
/// Source rectangle should be from 0 to 1, and represents a bounding box
/// for the source image of the graphic.
///
/// [`complete_premultiply_uploads`](Self::complete_premultiply_uploads)
/// needs to be called to finalize updates on the GPU that are initiated
/// here. Thus, ideally that would be called before drawing UI elements
/// using the images cached here.
pub fn cache_res(
&mut self,
renderer: &mut Renderer,
@@ -465,19 +472,18 @@ impl GraphicCache {
requirements.to_key_and_tex_parameters(graphic_id, requested_dims_upright);
let details = match cache_map.entry(key) {
Entry::Occupied(details) => {
let details = details.get();
Entry::Occupied(mut details) => {
let details = details.get_mut();
let (idx, valid, aabr) = details.info(atlases, textures);
// Check if the cached version has been invalidated by replacing the underlying
// graphic
if !valid {
// Create image
let image = prepare_graphic(
let (image, gpu_premul) = prepare_graphic(
graphic,
key,
requested_dims_upright,
false,
&mut self.keyed_jobs,
pool,
)?;
@@ -489,7 +495,9 @@ impl GraphicCache {
texture_parameters.size.map(u32::from).into_tuple()
);
// Transfer to the gpu
upload_image(renderer, aabr, &mut textures[idx].2, &image);
let (ref texture, _, ref mut upload_batch) = &mut textures[idx];
upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
details.set_valid();
}
return Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)));
@@ -498,11 +506,10 @@ impl GraphicCache {
};
// Construct image in an optional threadpool.
let image = prepare_graphic(
let (image, gpu_premul) = prepare_graphic(
graphic,
key,
requested_dims_upright,
false,
&mut self.keyed_jobs,
pool,
)?;
@@ -540,7 +547,8 @@ impl GraphicCache {
valid: true,
aabr,
});
upload_image(renderer, aabr, &mut textures[texture_idx].2, &image);
let (ref texture, _, ref mut upload_batch) = &mut textures[texture_idx];
upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
break;
}
}
@@ -555,10 +563,11 @@ impl GraphicCache {
.map(aabr_from_alloc_rect)
.unwrap();
// NOTE: All mutations happen only after the texture creation succeeds!
let tex_idx = textures.insert((tex, bind, Vec::new()));
let tex_idx = textures.insert((tex, bind, UiUploadBatchId::default()));
let atlas_idx = atlases.len();
atlases.push((atlas, tex_idx));
upload_image(renderer, aabr, &mut textures[tex_idx].2, &image);
let (ref texture, _, ref mut upload_batch) = &mut textures[tex_idx];
upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
CachedDetails::Atlas {
atlas_idx,
valid: true,
@@ -568,11 +577,12 @@ impl GraphicCache {
}
} else {
// Create a texture just for this
let (tex, bind, uploads) = create_image(renderer, &image, texture_parameters);
let (tex, bind, upload_batch) =
create_image(renderer, &image, texture_parameters, gpu_premul);
// NOTE: All mutations happen only after the texture creation and upload
// initiation succeeds! (completing the upload does not have any failure cases
// afaik)
let index = textures.insert((tex, bind, uploads));
// initiation succeeds! (completing the upload does not have any
// failure cases afaik)
let index = textures.insert((tex, bind, upload_batch));
CachedDetails::Texture { index, valid: true }
};
@@ -584,77 +594,76 @@ impl GraphicCache {
Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)))
}
/// Runs render passes with alpha premultiplication pipeline to complete any
/// pending uploads.
///
/// This should be called before starting the pass where the ui is rendered.
pub fn complete_premultiply_uploads(&mut self, drawer: &mut crate::render::Drawer<'_>) {
drawer.run_ui_premultiply_passes(
self.textures
.iter_mut()
.map(|(_tex_id, (texture, _, uploads))| (&*texture, core::mem::take(uploads))),
);
}
}
/// Prepare the graphic into the form that will be uploaded to the GPU.
///
/// For voxel graphics, draws the graphic at the specified dimensions.
///
/// Also can pre-multiplies alpha in images so they can be linearly filtered on
/// the GPU (this is optional since we also have a path to do this
/// premultiplication on the GPU).
/// Alpha premultiplication is necessary so that images can be linearly
/// filtered on the GPU. Premultiplication can occur either here or on the GPU,
/// depending on the size of the image and other factors. If premultiplication
/// on the GPU is needed, the returned bool will be `true`.
fn prepare_graphic<'graphic>(
graphic: &'graphic Graphic,
cache_key: CacheKey,
dims: Vec2<u16>,
premultiply_on_cpu: bool, // TODO: currently unused
keyed_jobs: &mut KeyedJobs<CacheKey, RgbaImage>,
pool: Option<&SlowJobPool>,
) -> Option<Cow<'graphic, RgbaImage>> {
) -> Option<(Cow<'graphic, RgbaImage>, bool)> {
prof_span!("prepare_graphic");
match graphic {
// Short-circuit spawning a job on the threadpool for blank graphics
Graphic::Blank => None,
Graphic::Image(image, _border_color) => {
if premultiply_on_cpu {
keyed_jobs
.spawn(pool, cache_key, || {
let image = Arc::clone(image);
move |_| {
// Image will be rescaled when sampling from it on the GPU so we don't
// need to resize it here.
let mut image = image.to_rgba8();
// TODO: could potentially do this when loading the image and for voxel
// images maybe at some point in the `draw_vox` processing. Or we could
// push it in the other direction and do conversion on the GPU.
premultiply_alpha(&mut image);
image
}
})
.map(|(_, v)| Cow::Owned(v))
} else if let Some(rgba) = image.as_rgba8() {
Some(Cow::Borrowed(rgba))
} else {
// TODO: we should require rgba8 format
warn!("Non-rgba8 image in UI used this may be deprecated.");
Some(Cow::Owned(image.to_rgba8()))
}
// Image will be rescaled when sampling from it on the GPU so we don't
// need to resize it here.
//
// TODO: We could potentially push premultiplication even earlier (e.g. to the
// time of loading images or packaging veloren for distribution).
let mut rgba_cow = image.as_rgba8().map_or_else(
|| {
// TODO: we may want to require loading in as the rgba8 format so we don't have
// to perform conversion here. On the other hand, we can take advantage of
// certain formats to know that alpha premultiplication doesn't need to be
// performed (but we would probably just want to store that with the loaded
// rgba8 format).
Cow::Owned(image.to_rgba8())
},
Cow::Borrowed,
);
// NOTE: We do premultiplication on the main thread since, if it were
// expensive enough to be worth doing in the background, we would just do
// it on the GPU instead. We could still use `rayon` to parallelize this
// work, if needed.
let premultiply_strategy = PremultiplyStrategy::determine(&*rgba_cow);
let needs_gpu_premultiply = match premultiply_strategy {
PremultiplyStrategy::UseGpu => true,
PremultiplyStrategy::NotNeeded => false,
PremultiplyStrategy::UseCpu => {
// NOTE: to_mut will clone the image if it was Cow::Borrowed
premultiply_alpha(rgba_cow.to_mut());
false
},
};
Some((rgba_cow, needs_gpu_premultiply))
},
Graphic::Voxel(segment, trans, sample_strat) => keyed_jobs
.spawn(pool, cache_key, || {
let segment = Arc::clone(segment);
let (trans, sample_strat) = (*trans, *sample_strat);
move |_| {
// TODO: for now we always use CPU premultiplication for these; we may want
// to re-evaluate this after the zoomy worldgen branch is merged (when it is
// clearer whether these jobs go to the background thread pool or not).
// Render voxel model at requested resolution
let mut image = renderer::draw_vox(&segment, dims, trans, sample_strat);
if premultiply_on_cpu {
premultiply_alpha(&mut image);
}
premultiply_alpha(&mut image);
image
}
})
.map(|(_, v)| Cow::Owned(v)),
.map(|(_, v)| (Cow::Owned(v), false)),
}
}
@@ -672,7 +681,11 @@ fn create_image_texture(
renderer: &mut Renderer,
size: Vec2<u32>,
address_mode: Option<wgpu::AddressMode>,
) -> (Texture, UiTextureBindGroup) {
) -> (Arc<Texture>, UiTextureBindGroup) {
// TODO: Right now we have to manually clear images to work around an AMD DX
// bug; for this we use Queue::write_texture, which needs this usage. I think
// this may be fixed in newer wgpu versions that auto-clear the texture.
let workaround_usage = wgpu::TextureUsage::COPY_DST;
let tex_info = wgpu::TextureDescriptor {
label: None,
size: wgpu::Extent3d {
@@ -684,7 +697,10 @@ fn create_image_texture(
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsage::RENDER_ATTACHMENT | wgpu::TextureUsage::SAMPLED,
usage: wgpu::TextureUsage::RENDER_ATTACHMENT // GPU premultiply
| wgpu::TextureUsage::COPY_DST // CPU premultiply
| wgpu::TextureUsage::SAMPLED // using image in ui rendering
| workaround_usage,
};
let view_info = wgpu::TextureViewDescriptor {
format: Some(tex_info.format),
@@ -701,12 +717,12 @@ fn create_image_texture(
};
let tex = renderer.create_texture_raw(&tex_info, &view_info, &sampler_info);
let bind = renderer.ui_bind_texture(&tex);
(tex, bind)
(Arc::new(tex), bind)
}
fn create_atlas_texture(
renderer: &mut Renderer,
) -> (SimpleAtlasAllocator, (Texture, UiTextureBindGroup)) {
) -> (SimpleAtlasAllocator, (Arc<Texture>, UiTextureBindGroup)) {
let size = atlas_size(renderer);
// Note: here we assume the max texture size is under i32::MAX.
let atlas = SimpleAtlasAllocator::new(size2(size.x as i32, size.y as i32));
@@ -726,23 +742,34 @@ fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
fn upload_image(
renderer: &mut Renderer,
aabr: Aabr<u16>,
target_texture_uploads: &mut Vec<UiPremultiplyUpload>,
target_texture: &Arc<Texture>,
upload_batch: &mut UiUploadBatchId,
image: &RgbaImage,
aabr: Aabr<u16>,
premultiply_on_gpu: bool,
) {
let aabr = aabr.map(u32::from);
// Check that this image and the target aabr are the same size (otherwise there
// is a bug in this module).
debug_assert_eq!(aabr.size().into_tuple(), image.dimensions());
let offset = aabr.min.into_array();
// TODO: can we transparently have cpu based version behind this (actually this
// would introduce more complexity to be able to do it in the background,
// but we could to it not in the background here especially for smaller
// things this would work well)
let upload = UiPremultiplyUpload::prepare(renderer, image, offset);
target_texture_uploads.push(upload);
//todo!()
debug_assert_eq!(aabr.map(u32::from).size().into_tuple(), image.dimensions());
if premultiply_on_gpu {
*upload_batch =
renderer.ui_premultiply_upload(target_texture, *upload_batch, image, aabr.min);
} else {
let aabr = aabr.map(u32::from);
let offset = aabr.min.into_array();
let size = aabr.size().into_array();
// upload directly
renderer.update_texture(
&*target_texture,
offset,
size,
// NOTE: Rgba texture, so each pixel is 4 bytes, ergo this cannot fail.
// We make the cast parameters explicit for clarity.
bytemuck::cast_slice::<u8, [u8; 4]>(
&(&**image)[..size[0] as usize * size[1] as usize * 4],
),
)
}
}
// This is used for border_color.is_some() images (ie the map image).
@@ -750,7 +777,8 @@ fn create_image(
renderer: &mut Renderer,
image: &RgbaImage,
texture_parameters: TextureParameters,
) -> (Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>) {
premultiply_on_gpu: bool,
) -> (Arc<Texture>, UiTextureBindGroup, UiUploadBatchId) {
let (tex, bind) = create_image_texture(
renderer,
texture_parameters.size.map(u32::from),
@@ -760,17 +788,82 @@ fn create_image(
//.map(|c| c.into_array().into()),
.map(|_| wgpu::AddressMode::ClampToBorder),
);
let mut uploads = Vec::new();
let mut upload_batch = UiUploadBatchId::default();
let aabr = Aabr {
min: Vec2::zero(),
max: texture_parameters.size,
};
upload_image(renderer, aabr, &mut uploads, image);
(tex, bind, uploads)
upload_image(
renderer,
&tex,
&mut upload_batch,
image,
aabr,
premultiply_on_gpu,
);
(tex, bind, upload_batch)
}
// CPU-side alpha premultiplication implementation.
pub struct PremultiplyLookupTable {
alpha: [u16; 256],
// This is for both colors that are always below the linear transform threshold (of the
// transform between linear/non-linear srgb) and colors that start above the threshold when
// transforming into linear srgb and then fall below it after being multiplied by alpha (before
// being transformed out of linear srgb).
color: [u16; 256],
}
impl Default for PremultiplyLookupTable {
fn default() -> Self {
#[rustfmt::skip]
fn accurate_to_linear(c: u8) -> f32 {
let c = c as f32 / 255.0;
// https://en.wikipedia.org/wiki/SRGB#Transformation
if c <= 0.04045 {
c / 12.92
} else {
// 0.055 * 255 ~= 14 (the `14` used in `premultiply_alpha` below)
((c + 0.055) / 1.055).powf(2.4)
}
}
use core::array;
let alpha = array::from_fn(|alpha| {
// NOTE: u16::MAX + 1 here relies on the max alpha being short-circuited (and
// not using this table). We multiply by this factor since it is a
// power of 2, which means later demultiplying it will optimize to a
// bitshift.
(((alpha as f32 / 255.0).powf(1.0 / 2.4) * (u16::MAX as f32 + 1.0)) + 0.5) as u16
});
let color = array::from_fn(|color| {
(if color <= 10 {
// <= 10 means the transform is linear!
color as f32 / 255.0
} else {
// Here the transform into linear srgb isn't linear but the transform out of it is.
//
// This is transform into and out of linear srgb with the theoretical alpha
// multiplication factored out.
accurate_to_linear(color as u8) * 12.92
}
// take advantage of the precision offered by u16
* (1 << 13) as f32
// round to the nearest integer when the cast truncates
+ 0.5) as u16
});
Self { alpha, color }
}
}
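// Worked example (illustrative, not from the commit): color = 200, alpha =
// 128. The alpha table yields ((128.0 / 255.0).powf(1.0 / 2.4) * 65536.0 +
// 0.5) as u16 == 49177. Below, `predicted` is then
// (49177 * (200 + 14) + 32433) / 65536 == 161, which is >= 9 + 14, so the
// fast path returns 161 - 14 == 147 -- matching the exact
// srgb -> linear -> *alpha -> srgb result of 147.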
fn premultiply_alpha(image: &mut RgbaImage) {
use fast_srgb8::{f32x4_to_srgb8, srgb8_to_f32};
prof_span!("premultiply alpha");
lazy_static::lazy_static! {
static ref LOOKUP: PremultiplyLookupTable = Default::default();
}
let lookup = &*LOOKUP;
// TODO: Apparently it is possible for ImageBuffer raw vec to have more pixels
// than the dimensions of the actual image (I don't think we actually have
// this occurring but we should probably fix other spots that use the raw
@@ -779,52 +872,200 @@ fn premultiply_alpha(image: &mut RgbaImage) {
let dims = image.dimensions();
let image_buffer_len = dims.0 as usize * dims.1 as usize * 4;
let (arrays, end) = (&mut **image)[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
// Rgba8 has 4 bytes per pixel they should be no remainder when dividing by 4.
// Rgba8 has 4 bytes per pixel there should be no remainder when dividing by 4.
let (end, _) = end.as_chunks_mut::<4>();
end.iter_mut().for_each(|pixel| {
let alpha = pixel[3];
if alpha == 0 {
*pixel = [0; 4];
} else if alpha != 255 {
let linear_alpha = alpha as f32 / 255.0;
let [r, g, b] = core::array::from_fn(|i| srgb8_to_f32(pixel[i]) * linear_alpha);
let srgb8 = f32x4_to_srgb8([r, g, b, 0.0]);
(pixel[0], pixel[1], pixel[3]) = (srgb8[0], srgb8[1], srgb8[3]);
return;
} else if alpha == 255 {
return;
};
for color in &mut pixel[..3] {
let predicted = ((lookup.alpha[alpha as usize] as u32) * (*color as u32 + 14) + 32433)
/ (u16::MAX as u32 + 1);
let multiplied_color = (if predicted < 9 + 14 {
(lookup.color[*color as usize] as u32 * alpha as u32 + 4096) >> 13
} else {
predicted - 14
}) as u8;
*color = multiplied_color;
}
});
arrays.iter_mut().for_each(|pixelx4| {
use core::simd::{f32x4, u8x4, Simd};
let alpha = Simd::from_array([pixelx4[3], pixelx4[7], pixelx4[11], pixelx4[15]]);
if alpha == Simd::splat(0) {
*pixelx4 = [0; 16];
} else if alpha != Simd::splat(255) {
let linear_simd = |array: [u8; 4]| Simd::from_array(array.map(srgb8_to_f32));
// Pack rgb components from the 4th pixel into the last position for each of
// the other 3 pixels.
let a = linear_simd([pixelx4[0], pixelx4[1], pixelx4[2], pixelx4[12]]);
let b = linear_simd([pixelx4[4], pixelx4[5], pixelx4[6], pixelx4[13]]);
let c = linear_simd([pixelx4[8], pixelx4[9], pixelx4[10], pixelx4[14]]);
let linear_alpha = alpha.cast::<f32>() * Simd::splat(1.0 / 255.0);
// Multiply by alpha and then convert back into srgb8.
let premultiply = |x: f32x4, i| {
let mut a = f32x4::splat(linear_alpha[i]);
a[3] = linear_alpha[3];
u8x4::from_array(f32x4_to_srgb8((x * a).to_array()))
};
let pa = premultiply(a, 0);
let pb = premultiply(b, 1);
let pc = premultiply(c, 2);
(pixelx4[0], pixelx4[1], pixelx4[2]) = (pa[0], pa[1], pa[2]);
(pixelx4[4], pixelx4[5], pixelx4[6]) = (pb[0], pb[1], pb[2]);
(pixelx4[8], pixelx4[9], pixelx4[10]) = (pc[0], pc[1], pc[2]);
(pixelx4[12], pixelx4[13], pixelx4[14]) = (pa[3], pb[3], pc[3]);
// Short-circuit for alpha == 0 or 255
// This adds ~7 us (worst case) for a 256x256 image.
// Best case is decreased to 20 us total time.
if pixelx4[3] == pixelx4[7] && pixelx4[3] == pixelx4[11] && pixelx4[3] == pixelx4[15] {
if pixelx4[3] == 0 {
*pixelx4 = [0; 16];
return;
} else if pixelx4[3] == u8::MAX {
return;
}
}
})
// Lookup transformed alpha values for each pixel first.
// Putting this here seems to make things slightly faster.
let factors = [
lookup.alpha[pixelx4[3] as usize],
lookup.alpha[pixelx4[7] as usize],
lookup.alpha[pixelx4[11] as usize],
lookup.alpha[pixelx4[15] as usize],
];
for pixel_index in 0..4 {
let alpha_factor = factors[pixel_index];
let alpha = pixelx4[pixel_index * 4 + 3];
// Putting this code outside the loop makes things take ~25% less time.
let color_factors = [
lookup.color[pixelx4[pixel_index * 4 + 0] as usize] as u32 * alpha as u32 + 4096,
lookup.color[pixelx4[pixel_index * 4 + 1] as usize] as u32 * alpha as u32 + 4096,
lookup.color[pixelx4[pixel_index * 4 + 2] as usize] as u32 * alpha as u32 + 4096,
];
for i in 0..3 {
let color = &mut pixelx4[pixel_index * 4 + i];
// Loosely based on transform to linear and back (above threshold) (this is
// where use of 14 comes from).
// `32433` selected via trial and error to reduce the number of mismatches.
// `/ (u16::MAX as u32 + 1)` transforms back to `u8` precision (we add 1 so it
// will be a division by a power of 2 which optimizes well).
let predicted =
((alpha_factor as u32) * (*color as u32 + 14) + 32433) / (u16::MAX as u32 + 1);
let multiplied_color = (if predicted < 9 + 14 {
// Here we handle two cases:
// 1. When the transform starts and ends as linear.
// 2. When the color is over the linear threshold for the transform into linear
// space but below this threshold when transforming back out (due to being
// multiplied with a small alpha).
// (in both cases the result is linearly related to alpha and we can encode how
// it is related to the color in a lookup table)
// NOTE: 212 is the largest color value used here (when alpha isn't 0)
color_factors[i] >> 13
} else {
predicted - 14
}) as u8;
*color = multiplied_color;
}
}
});
}
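// Illustrative sanity check (not part of this commit): fully transparent
// pixels are zeroed and fully opaque pixels pass through untouched via the
// short-circuits above.
//
//     let mut img = image::RgbaImage::from_pixel(2, 2, image::Rgba([200, 100, 50, 0]));
//     premultiply_alpha(&mut img);
//     assert!(img.pixels().all(|p| p.0 == [0; 4]));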
// Next step: Handling invalidation / removal of old textures when
// replace_graphic is used under new resizing scheme.
//
// TODO: does screenshot texture have COPY_DST? I don't think it needs this.
/// Strategy for how alpha premultiplication will be applied to an image.
enum PremultiplyStrategy {
UseCpu,
UseGpu,
// Image is fully opaque.
NotNeeded,
}
impl PremultiplyStrategy {
#[rustfmt::skip] // please don't format comment with 'ns/pixel' to a separate line from the value
fn determine(image: &RgbaImage) -> Self {
// TODO: Would be useful to re-time this after a wgpu update.
//
// Thresholds below are based on timing measurements of the CPU-based premultiplication
// vs the overhead of interacting with the GPU API to perform premultiplication on the GPU.
// These timings are quite circumstantial and could vary between machines, wgpu updates,
// and changes to the structure of the GPU based path.
//
// GPU path costs (For calculations I used `57.6 us` as a roughly reasonable estimate of
// total time here but that can vary lower and higher. Everything is a bit imprecise here
// so I won't list individual timings. The key takeaway is that this can be made more
// efficient by avoiding the create/drop of a texture, texture view, and bind group for
// each image. Also, if we didn't need a separate render pass for each target image that
// would be helpful as well. Using compute passes and passing data in as a raw buffer may
// help with both of these but initial attempts with that ran into issues (e.g. when we get
// the ability to have non-srgb views of srgb textures that will be useful)):
// * create/drop texture
// * create/drop texture view
// * create/drop bind group
// * run render pass (NOTE: if many images are processed at once with the same target
// texture this portion of the cost can be split between them)
//
// CPU path costs:
// * clone image (0.17 ns/pixel (benchmark) - 0.73 ns/pixel (in voxygen))
// * run premultiplication (0.305 ns/pixel (when shortcircuits are always hit) -
// 3.81 ns/pixel (with random alpha))
//
// Shared costs include:
// * write_texture
// * (optional) check for fraction of shortcircuit blocks in image (0.223 ns/pixel)
//
// `ALWAYS_CPU_THRESHOLD` is roughly:
// ("cost of GPU path" + "shortcircuit count cost") / "worst case cost of CPU path per pixel"
//
// `ALWAYS_GPU_THRESHOLD` is NOT: "cost of GPU path" / "best case cost of CPU path per pixel"
// since the cost of checking for whether the CPU path is better at this quantity of pixels
// becomes more than the amount of overhead we are willing to add to the worst case
// scenario where we run the short-circuit count check and end up using the GPU path. The
// currently selected value of 200x200 adds at most about ~20% of the cost of the GPU path.
// (TODO: maybe we could have the check bail out early if the results aren't looking
// favorable for the CPU path and/or sample a random subset of the pixels).
//
// `CHECKED_THRESHOLD` is roughly: "cost of GPU path" / "best case cost of CPU path per pixel"
const ALWAYS_CPU_THRESHOLD: usize = 120 * 120;
const ALWAYS_GPU_THRESHOLD: usize = 200 * 200;
const CHECKED_THRESHOLD: usize = 240 * 240;
let dims = image.dimensions();
let pixel_count = dims.0 as usize * dims.1 as usize;
if pixel_count <= ALWAYS_CPU_THRESHOLD {
Self::UseCpu
} else if pixel_count > ALWAYS_GPU_THRESHOLD {
Self::UseGpu
} else if let Some(fraction) = fraction_shortcircuit_blocks(image) {
// This seems correct...?
// TODO: I think we technically can exit the fraction checking early if we know the
// total fraction value will be over: (threshold - ALWAYS_CPU_THRESHOLD) /
// (CHECKED_THRESHOLD - ALWAYS_CPU_THRESHOLD).
let threshold = fraction * CHECKED_THRESHOLD as f32
+ (1.0 - fraction) * ALWAYS_CPU_THRESHOLD as f32;
if pixel_count as f32 <= threshold {
Self::UseCpu
} else {
Self::UseGpu
}
} else {
Self::NotNeeded
}
}
}
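// Worked example of the checked middle band (illustrative): if
// `fraction_shortcircuit_blocks` returns 0.5, the cutoff interpolates to
// 0.5 * (240 * 240) + 0.5 * (120 * 120) = 36_000 pixels, so e.g. a 180x180
// image (32_400 pixels, within the checked band) still takes the CPU path.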
/// Useful to estimate the cost of premultiplying alpha in the provided image via
/// the CPU method.
///
/// Computes the fraction of 4-pixel chunks that are fully translucent or
/// opaque. Returns `None` if no premultiplication is needed (i.e. all alpha
/// values are 255).
fn fraction_shortcircuit_blocks(image: &RgbaImage) -> Option<f32> {
let dims = image.dimensions();
let pixel_count = dims.0 as usize * dims.1 as usize;
let (arrays, end) = (&**image)[..pixel_count * 4].as_chunks::<{ 4 * 4 }>();
// Rgba8 has 4 bytes per pixel there should be no remainder when dividing by 4.
let (end, _) = end.as_chunks::<4>();
let end_is_opaque = end.iter().all(|pixel| pixel[3] == 255);
// 14.6 us for 256x256 image
let num_chunks = arrays.len();
let mut num_translucent = 0;
let mut num_opaque = 0;
arrays.iter().for_each(|pixelx4| {
let v = u128::from_ne_bytes(*pixelx4);
// NOTE: with `from_ne_bytes` on a little-endian target, the alpha byte
// (index 3 of each pixel) lands in the high byte of each 32-bit lane.
let alpha_mask = 0xFF000000_FF000000_FF000000_FF000000;
let masked = v & alpha_mask;
if masked == 0 {
num_translucent += 1;
} else if masked == alpha_mask {
num_opaque += 1;
}
});
if num_chunks == num_opaque && num_translucent == 0 && end_is_opaque {
None
} else {
Some((num_translucent as f32 + num_opaque as f32) / num_chunks as f32)
}
}

View File

@@ -8,6 +8,9 @@ use glyph_brush::GlyphBrushBuilder;
use std::cell::{RefCell, RefMut};
use vek::*;
// TODO: probably make cache fields that have mut getters into just public
// fields
// Multiplied by current window size
const GLYPH_CACHE_SIZE: u32 = 1;
// Glyph cache tolerances