Complete GPU-based alpha premultiplication impl and make the CPU version
even faster.

* The GPU-based version was started in the previous commit; this fixes
  the remaining errors and bugs and gets it actually compiling and
  running.
* Add a way to batch images together so that GPU premultiplication uses
  a single render pass when they all target the same texture.
* Pending premultiplication uploads are automatically done when calling
  `Drawer::third_pass`.
* `fast-srgb8` dep removed; we no longer convert to `f32`s to do the
  premultiplication. Two `[u16; 256]` tables are combined to compute the
  alpha-premultiplied color within the same error bounds used by the
  `fast-srgb8` crate (a reference sketch of the exact computation being
  approximated follows below). We also no longer use explicit SIMD.
* Remove explicit lifetimes from `PlayState::render` since `&self` and
  `Drawer<'_>` don't need to have the same lifetime.
* Fix existing bug where invalidated cache entries were never set to
  valid when reusing them.
* `prepare_graphic` now runs some heuristics to determine whether
  premultiplication should be executed CPU-side or GPU-side, and then
  returns a bool indicating whether GPU premultiplication is needed.
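
For reference, here is a minimal sketch (not part of the diff; assumes
the standard sRGB transfer functions) of the exact per-channel
computation the two lookup tables approximate:

    fn premultiply_reference(color: u8, alpha: u8) -> u8 {
        fn srgb_to_linear(c: f32) -> f32 {
            if c <= 0.04045 { c / 12.92 } else { ((c + 0.055) / 1.055).powf(2.4) }
        }
        fn linear_to_srgb(c: f32) -> f32 {
            if c <= 0.0031308 { c * 12.92 } else { 1.055 * c.powf(1.0 / 2.4) - 0.055 }
        }
        // Decode to linear, multiply by alpha, re-encode to sRGB.
        let linear = srgb_to_linear(color as f32 / 255.0) * (alpha as f32 / 255.0);
        (linear_to_srgb(linear) * 255.0 + 0.5) as u8
    }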
Imbris 2022-11-13 18:52:13 -05:00
parent efd932c71e
commit 63096b2042
18 changed files with 505 additions and 177 deletions

Cargo.lock (generated)
View File

@@ -1959,12 +1959,6 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fast-srgb8"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd2e7510819d6fbf51a5545c8f922716ecfb14df168a3242f7d33e0239efe6a1"
[[package]]
name = "fastrand"
version = "1.8.0"
@@ -7014,7 +7008,6 @@ dependencies = [
"enum-iterator 1.1.3",
"etagere",
"euc",
"fast-srgb8",
"gilrs",
"glyph_brush",
"guillotiere",

View File

@@ -44,7 +44,7 @@ vec3 linear_to_srgb(vec3 col) {
}
vec4 srgba8_to_linear(uint srgba8) {
uvec4 nonlinear = vec4(uvec4(
vec4 nonlinear = vec4(uvec4(
(srgba8 >> 24) & 0xFFu,
(srgba8 >> 16) & 0xFFu,
(srgba8 >> 8) & 0xFFu,

View File

@@ -1,4 +1,5 @@
#version 420 core
#extension GL_EXT_samplerless_texture_functions : enable
layout(set = 0, binding = 0)
uniform texture2D source_texture;

View File

@@ -15,14 +15,14 @@ layout(location = 0) out vec2 source_coords;
uvec2 unpack(uint xy) {
return uvec2(
bitfieldExtract(xy, 0, 16),
bitfieldExtract(xy, 16, 16),
bitfieldExtract(xy, 0, 16),
bitfieldExtract(xy, 16, 16)
);
}
void main() {
vec2 source_size = vec2(unpack(source_size_xy));
vec2 target_offset = vec2(unpack(target_offset_size_xy));
vec2 target_offset = vec2(unpack(target_offset_xy));
vec2 target_size = vec2(unpack(target_size_xy));
// Generate rectangle (counter clockwise triangles)
@@ -36,10 +36,10 @@ void main() {
// left -> right (on screen)
mix(0.0, 1.0, x_select),
// bottom -> top (on screen)
mix(1.0, 0.0, y_select),
);
mix(1.0, 0.0, y_select)
) * source_size;
vec2 target_coords_normalized = (target_offset + source_coords * source_size) / target_size;
vec2 target_coords_normalized = (target_offset + source_coords) / target_size;
// Flip y and transform [0.0, 1.0] -> [-1.0, 1.0] to get NDC coordinates.
vec2 v_pos = ((target_coords_normalized * 2.0) - vec2(1.0)) * vec2(1.0, -1.0);

View File

@@ -134,7 +134,6 @@ num_cpus = "1.0"
# vec_map = { version = "0.8.2" }
inline_tweak = "1.0.2"
itertools = "0.10.0"
fast-srgb8 = "1.0.0"
# Tracy
tracing = "0.1"

View File

@@ -5,7 +5,6 @@
#![feature(
array_methods,
array_zip,
array_from_fn,
drain_filter,
once_cell,
trait_alias,
@@ -13,7 +12,6 @@
map_try_insert,
slice_as_chunks,
let_chains
portable_simd
)]
#![recursion_limit = "2048"]
@@ -157,7 +155,7 @@ pub trait PlayState {
fn globals_bind_group(&self) -> &GlobalsBindGroup;
/// Draw the play state.
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, settings: &Settings);
fn render(&self, drawer: &mut Drawer<'_>, settings: &Settings);
/// Determines whether egui will be rendered for this play state
fn egui_enabled(&self) -> bool;

View File

@@ -275,7 +275,7 @@ impl PlayState for CharSelectionState {
fn globals_bind_group(&self) -> &GlobalsBindGroup { self.scene.global_bind_group() }
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, _: &Settings) {
fn render(&self, drawer: &mut Drawer<'_>, _: &Settings) {
let client = self.client.borrow();
let (humanoid_body, loadout) =
Self::get_humanoid_body_inventory(&self.char_selection_ui, &client);

View File

@@ -394,7 +394,7 @@ impl PlayState for MainMenuState {
fn globals_bind_group(&self) -> &GlobalsBindGroup { self.scene.global_bind_group() }
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, _: &Settings) {
fn render(&self, drawer: &mut Drawer<'_>, _: &Settings) {
// Draw the UI to the screen.
let mut third_pass = drawer.third_pass();
if let Some(mut ui_drawer) = third_pass.draw_ui() {

View File

@@ -43,7 +43,7 @@ pub use self::{
create_quad as create_ui_quad,
create_quad_vert_gradient as create_ui_quad_vert_gradient, create_tri as create_ui_tri,
BoundLocals as UiBoundLocals, Locals as UiLocals, Mode as UiMode,
PremultiplyUpload as UiPremultiplyUpload, TextureBindGroup as UiTextureBindGroup,
TextureBindGroup as UiTextureBindGroup, UploadBatchId as UiUploadBatchId,
Vertex as UiVertex,
},
GlobalModel, Globals, GlobalsBindGroup, GlobalsLayouts, Light, Shadow,

View File

@@ -552,7 +552,7 @@ pub struct PremultiplyAlphaParams {
///
/// From here we will use the `PremultiplyAlpha` pipeline to premultiply the
/// alpha while transferring the image to its destination texture.
pub struct PremultiplyUpload {
pub(in super::super) struct PremultiplyUpload {
source_bg: wgpu::BindGroup,
source_size_xy: u32,
/// The location in the final texture this will be placed at. Technically,
@@ -562,7 +562,7 @@ pub struct PremultiplyUpload {
}
impl PremultiplyUpload {
pub fn prepare(
pub(in super::super) fn prepare(
device: &wgpu::Device,
queue: &wgpu::Queue,
layout: &PremultiplyAlphaLayout,
@@ -593,7 +593,7 @@ impl PremultiplyUpload {
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
},
&(&**image)[..(image.width() as usize * image.height() as usize)],
&(&**image)[..(image.width() as usize * image.height() as usize * 4)],
wgpu::ImageDataLayout {
offset: 0,
bytes_per_row: NonZeroU32::new(image.width() * 4),
@@ -622,7 +622,7 @@ impl PremultiplyUpload {
});
// NOTE: We assume the max texture size is less than u16::MAX.
let source_size_xy = image_size.width + image_size.height << 16;
let source_size_xy = image_size.width + (image_size.height << 16);
Self {
source_bg,
@@ -634,15 +634,74 @@ impl PremultiplyUpload {
/// Semantically, this consumes the `PremultiplyUpload` but we need to keep
/// the bind group alive to the end of the render pass and don't want to
/// bother storing it somewhere else.
pub fn draw_data(&self, target: &Texture) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
let target_offset_xy = u32::from(self.offset.x) + u32::from(self.offset.y) << 16;
pub(in super::super) fn draw_data(
&self,
target: &Texture,
) -> (&wgpu::BindGroup, PremultiplyAlphaParams) {
let target_offset_xy = u32::from(self.offset.x) + (u32::from(self.offset.y) << 16);
let target_dims = target.get_dimensions();
// NOTE: We assume the max texture size is less than u16::MAX.
let target_size_xy = target_dims.x + target_dims.y << 16;
let target_size_xy = target_dims.x + (target_dims.y << 16);
(&self.source_bg, PremultiplyAlphaParams {
source_size_xy: self.source_size_xy,
target_offset_xy,
target_size_xy,
})
}
pub fn area_dbg(&self) -> f32 {
(self.source_size_xy & 0xFFFF) as f32 * (self.source_size_xy >> 16) as f32
}
}
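// Packing sketch (illustrative, not from the commit): each u32 push-constant
// word packs two u16 values, x in the low half and y in the high half; the
// vertex shader's bitfieldExtract(xy, 0, 16) / bitfieldExtract(xy, 16, 16)
// reverses this. E.g. for a 640x480 source:
//
//     let packed = 640u32 + (480u32 << 16); // == 0x01E0_0280
//     assert_eq!(packed & 0xFFFF, 640);     // x in the low half
//     assert_eq!(packed >> 16, 480);        // y in the high half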
use std::sync::Arc;
/// Per-target texture batched uploads
#[derive(Default)]
pub(in super::super) struct BatchedUploads {
batches: Vec<(Arc<Texture>, Vec<PremultiplyUpload>)>,
}
#[derive(Default, Clone, Copy)]
pub struct UploadBatchId(usize);
impl BatchedUploads {
/// Adds the provided upload to the batch indicated by the provided target
/// texture and optional batch id. A new batch will be created if the batch
/// id is invalid (doesn't refer to an existing batch) or the provided
/// target texture isn't the same as the one associated with the
/// provided batch id. Creating a new batch involves cloning the
/// provided texture `Arc`.
///
/// The id of the batch where the upload is ultimately submitted will be
/// returned. This id can be used in subsequent calls to add items to
/// the same batch (i.e. uploads for the same texture).
///
/// Batch ids are reset every frame; however, since we check that the
/// texture matches, it is perfectly fine to use a stale id (just keep
/// in mind that this will create a new batch). This also means that it is
/// sufficient to use `UploadBatchId::default()` when calling this with
/// new textures.
pub(in super::super) fn submit(
&mut self,
target_texture: &Arc<Texture>,
batch_id: UploadBatchId,
upload: PremultiplyUpload,
) -> UploadBatchId {
if let Some(batch) = self
.batches
.get_mut(batch_id.0)
.filter(|b| Arc::ptr_eq(&b.0, target_texture))
{
batch.1.push(upload);
batch_id
} else {
let new_batch_id = UploadBatchId(self.batches.len());
self.batches
.push((Arc::clone(target_texture), vec![upload]));
new_batch_id
}
}
pub(in super::super) fn take(&mut self) -> Vec<(Arc<Texture>, Vec<PremultiplyUpload>)> {
core::mem::take(&mut self.batches)
}
}
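// Illustrative use of the batching API (hypothetical caller; `batched_uploads`
// is a `BatchedUploads`, `texture` the target `Arc<Texture>`, and `upload_a`/
// `upload_b` already-prepared `PremultiplyUpload`s):
//
//     let mut batch_id = UploadBatchId::default();
//     // Default/stale id: a new batch is created for `texture`.
//     batch_id = batched_uploads.submit(&texture, batch_id, upload_a);
//     // Fresh id and same texture: pushed onto the same batch, so both
//     // uploads share one render pass when `take()` drains the batches.
//     batch_id = batched_uploads.submit(&texture, batch_id, upload_b);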

View File

@@ -178,6 +178,8 @@ pub struct Renderer {
profile_times: Vec<wgpu_profiler::GpuTimerScopeResult>,
profiler_features_enabled: bool,
ui_premultiply_uploads: ui::BatchedUploads,
#[cfg(feature = "egui-ui")]
egui_renderpass: egui_wgpu_backend::RenderPass,
@@ -545,6 +547,8 @@ impl Renderer {
profile_times: Vec::new(),
profiler_features_enabled,
ui_premultiply_uploads: Default::default(),
#[cfg(feature = "egui-ui")]
egui_renderpass,
@@ -1437,18 +1441,23 @@ impl Renderer {
texture.update(&self.queue, offset, size, bytemuck::cast_slice(data))
}
pub fn prepare_premultiply_upload(
&self,
/// See docs on [`ui::BatchedUploads::submit`].
pub fn ui_premultiply_upload(
&mut self,
target_texture: &Arc<Texture>,
batch: ui::UploadBatchId,
image: &image::RgbaImage,
offset: Vec2<u16>,
) -> ui::PremultiplyUpload {
ui::PremultiplyUpload::prepare(
) -> ui::UploadBatchId {
let upload = ui::PremultiplyUpload::prepare(
&self.device,
&self.queue,
&self.layouts.premultiply_alpha,
image,
offset,
)
);
self.ui_premultiply_uploads
.submit(target_texture, batch, upload)
}
/// Queue to obtain a screenshot on the next frame render

View File

@@ -37,6 +37,14 @@ impl<'frame> Pipelines<'frame> {
}
}
fn premultiply_alpha(&self) -> Option<&ui::PremultiplyAlphaPipeline> {
match self {
Pipelines::Interface(pipelines) => Some(&pipelines.premultiply_alpha),
Pipelines::All(pipelines) => Some(&pipelines.premultiply_alpha),
Pipelines::None => None,
}
}
fn blit(&self) -> Option<&blit::BlitPipeline> {
match self {
Pipelines::Interface(pipelines) => Some(&pipelines.blit),
@@ -67,6 +75,7 @@ struct RendererBorrow<'frame> {
pipeline_modes: &'frame super::PipelineModes,
quad_index_buffer_u16: &'frame Buffer<u16>,
quad_index_buffer_u32: &'frame Buffer<u32>,
ui_premultiply_uploads: &'frame mut ui::BatchedUploads,
#[cfg(feature = "egui-ui")]
egui_render_pass: &'frame mut egui_wgpu_backend::RenderPass,
}
@@ -118,6 +127,7 @@ impl<'frame> Drawer<'frame> {
pipeline_modes: &renderer.pipeline_modes,
quad_index_buffer_u16: &renderer.quad_index_buffer_u16,
quad_index_buffer_u32: &renderer.quad_index_buffer_u32,
ui_premultiply_uploads: &mut renderer.ui_premultiply_uploads,
#[cfg(feature = "egui-ui")]
egui_render_pass: &mut renderer.egui_renderpass,
};
@@ -425,15 +435,19 @@ impl<'frame> Drawer<'frame> {
});
}
pub fn run_ui_premultiply_passes<'a>(
&mut self,
targets: impl Iterator<Item = (&'a super::super::Texture, Vec<ui::PremultiplyUpload>)>,
) {
/// Runs render passes with the alpha premultiplication pipeline to complete
/// any pending uploads.
fn run_ui_premultiply_passes<'a>(&mut self) {
prof_span!("run_ui_premultiply_passes");
let Some(premultiply_alpha) = self.borrow.pipelines.premultiply_alpha() else { return };
let encoder = self.encoder.as_mut().unwrap();
let device = self.borrow.device;
let targets = self.borrow.ui_premultiply_uploads.take();
// TODO: What is the CPU overhead of each renderpass?
for (i, (target_texture, uploads)) in targets.enumerate() {
for (i, (target_texture, uploads)) in targets.into_iter().enumerate() {
let mut area = 0.0;
prof_span!("ui premultiply pass");
tracing::info!("{} uploads", uploads.len());
let profile_name = format!("ui_premultiply_pass {}", i);
@@ -447,23 +461,31 @@ impl<'frame> Drawer<'frame> {
view: &target_texture.view,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Clear(wgpu::Color::TRANSPARENT),
load: wgpu::LoadOp::Load,
store: true,
},
}],
depth_stencil_attachment: None,
});
render_pass.set_pipeline(&premultiply_alpha.pipeline);
for upload in &uploads {
let (source_bind_group, push_constant_data) = upload.draw_data(target_texture);
area += upload.area_dbg();
let (source_bind_group, push_constant_data) = upload.draw_data(&target_texture);
let bytes = bytemuck::bytes_of(&push_constant_data);
render_pass.set_bind_group(0, source_bind_group, &[]);
render_pass.set_push_constants(wgpu::ShaderStage::VERTEX, 0, bytes);
render_pass.draw_indexed(0..6, 0, 0..1);
render_pass.draw(0..6, 0..1);
}
let avg_area = area as f32 / uploads.len() as f32;
tracing::info!("avg area sqrt {}", f32::sqrt(avg_area));
}
}
/// Note: this automatically calls the internal `run_ui_premultiply_passes`
/// to complete any pending image uploads for the UI.
pub fn third_pass(&mut self) -> ThirdPassDrawer {
self.run_ui_premultiply_passes();
let encoder = self.encoder.as_mut().unwrap();
let device = self.borrow.device;
let mut render_pass =
@@ -537,7 +559,7 @@ impl<'frame> Drawer<'frame> {
/// Does nothing if the shadow pipelines are not available or shadow map
/// rendering is disabled
pub fn draw_point_shadows<'data: 'frame>(
pub fn draw_point_shadows<'data>(
&mut self,
matrices: &[shadow::PointLightMatrix; 126],
chunks: impl Clone

View File

@@ -74,7 +74,7 @@ impl assets::Compound for Shaders {
"ui-vert",
"ui-frag",
"premultiply-alpha-vert",
"premultiply_alpha-frag",
"premultiply-alpha-frag",
"lod-terrain-vert",
"lod-terrain-frag",
"clouds-vert",

View File

@@ -1232,9 +1232,9 @@ impl Scene {
pub fn global_bind_group(&self) -> &GlobalsBindGroup { &self.globals_bind_group }
/// Render the scene using the provided `Drawer`.
pub fn render<'a>(
&'a self,
drawer: &mut Drawer<'a>,
pub fn render(
&self,
drawer: &mut Drawer<'_>,
state: &State,
viewpoint_entity: EcsEntity,
tick: u64,

View File

@@ -1896,7 +1896,7 @@ impl PlayState for SessionState {
/// Render the session to the screen.
///
/// This method should be called once per frame.
fn render<'a>(&'a self, drawer: &mut Drawer<'a>, settings: &Settings) {
fn render(&self, drawer: &mut Drawer<'_>, settings: &Settings) {
span!(_guard, "render", "<Session as PlayState>::render");
let client = self.client.borrow();

View File

@@ -7,6 +7,9 @@ use conrod_core::{text::GlyphCache, widget::Id};
use hashbrown::HashMap;
use vek::*;
// TODO: probably make cache fields that have mut getters into just public
// fields
// Multiplied by current window size
const GLYPH_CACHE_SIZE: u32 = 1;
// Glyph cache tolerances

View File

@@ -4,10 +4,11 @@ pub mod renderer;
pub use renderer::{SampleStrat, Transform};
use crate::{
render::{Renderer, Texture, UiPremultiplyUpload, UiTextureBindGroup},
render::{Renderer, Texture, UiTextureBindGroup, UiUploadBatchId},
ui::KeyedJobs,
};
use common::{figure::Segment, slowjob::SlowJobPool};
use common_base::prof_span;
use guillotiere::{size2, SimpleAtlasAllocator};
use hashbrown::{hash_map::Entry, HashMap};
use image::{DynamicImage, RgbaImage};
@@ -86,7 +87,7 @@ impl CachedDetails {
fn info(
&self,
atlases: &[(SimpleAtlasAllocator, usize)],
textures: &Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
textures: &Slab<(Arc<Texture>, UiTextureBindGroup, UiUploadBatchId)>,
) -> (usize, bool, Aabr<u16>) {
match *self {
CachedDetails::Atlas {
@@ -119,6 +120,17 @@ impl CachedDetails {
},
}
}
fn set_valid(&mut self) {
match self {
Self::Atlas { ref mut valid, .. } => {
*valid = true;
},
Self::Texture { ref mut valid, .. } => {
*valid = true;
},
}
}
}
/// Requirements that a particular graphic has with respect to the atlas
@@ -241,7 +253,7 @@ pub struct GraphicCache {
/// for this frame. The purpose of this is to collect all the operations
/// together so that a single renderpass is performed for each target
/// texture.
textures: Slab<(Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>)>,
textures: Slab<(Arc<Texture>, UiTextureBindGroup, UiUploadBatchId)>,
/// The location and details of graphics cached on the GPU.
///
/// Graphic::Voxel images include the dimensions they were rasterized at in
@@ -257,7 +269,7 @@ impl GraphicCache {
let (atlas, (tex, bind)) = create_atlas_texture(renderer);
let mut textures = Slab::new();
let tex_id = textures.insert((tex, bind, Vec::new()));
let tex_id = textures.insert((tex, bind, UiUploadBatchId::default()));
Self {
graphic_map: HashMap::default(),
@@ -336,7 +348,7 @@ impl GraphicCache {
/// Used to acquire textures for rendering
pub fn get_tex(&self, id: TexId) -> (&Texture, &UiTextureBindGroup) {
let (tex, bind, _uploads) = self.textures.get(id.0).expect("Invalid TexId used");
let (tex, bind, _upload_batch) = self.textures.get(id.0).expect("Invalid TexId used");
(tex, bind)
}
@@ -368,18 +380,13 @@ impl GraphicCache {
let (atlas, (tex, bind)) = create_atlas_texture(renderer);
let mut textures = Slab::new();
let tex_id = textures.insert((tex, bind, Vec::new()));
let tex_id = textures.insert((tex, bind, UiUploadBatchId::default()));
self.atlases = vec![(atlas, tex_id)];
self.textures = textures;
}
/// Source rectangle should be from 0 to 1, and represents a bounding box
/// for the source image of the graphic.
///
/// [`complete_premultiply_uploads`](Self::complete_premultiply_uploads)
/// needs to be called to finalize updates on the GPU that are initiated
/// here. Thus, ideally that would be called before drawing UI elements
/// using the images cached here.
pub fn cache_res(
&mut self,
renderer: &mut Renderer,
@@ -465,19 +472,18 @@ impl GraphicCache {
requirements.to_key_and_tex_parameters(graphic_id, requested_dims_upright);
let details = match cache_map.entry(key) {
Entry::Occupied(details) => {
let details = details.get();
Entry::Occupied(mut details) => {
let details = details.get_mut();
let (idx, valid, aabr) = details.info(atlases, textures);
// Check if the cached version has been invalidated by replacing the underlying
// graphic
if !valid {
// Create image
let image = prepare_graphic(
let (image, gpu_premul) = prepare_graphic(
graphic,
key,
requested_dims_upright,
false,
&mut self.keyed_jobs,
pool,
)?;
@@ -489,7 +495,9 @@ impl GraphicCache {
texture_parameters.size.map(u32::from).into_tuple()
);
// Transfer to the gpu
upload_image(renderer, aabr, &mut textures[idx].2, &image);
let (ref texture, _, ref mut upload_batch) = &mut textures[idx];
upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
details.set_valid();
}
return Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)));
@@ -498,11 +506,10 @@ impl GraphicCache {
};
// Construct image in an optional threadpool.
let image = prepare_graphic(
let (image, gpu_premul) = prepare_graphic(
graphic,
key,
requested_dims_upright,
false,
&mut self.keyed_jobs,
pool,
)?;
@@ -540,7 +547,8 @@ impl GraphicCache {
valid: true,
aabr,
});
upload_image(renderer, aabr, &mut textures[texture_idx].2, &image);
let (ref texture, _, ref mut upload_batch) = &mut textures[texture_idx];
upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
break;
}
}
@@ -555,10 +563,11 @@ impl GraphicCache {
.map(aabr_from_alloc_rect)
.unwrap();
// NOTE: All mutations happen only after the texture creation succeeds!
let tex_idx = textures.insert((tex, bind, Vec::new()));
let tex_idx = textures.insert((tex, bind, UiUploadBatchId::default()));
let atlas_idx = atlases.len();
atlases.push((atlas, tex_idx));
upload_image(renderer, aabr, &mut textures[tex_idx].2, &image);
let (ref texture, _, ref mut upload_batch) = &mut textures[tex_idx];
upload_image(renderer, texture, upload_batch, &image, aabr, gpu_premul);
CachedDetails::Atlas {
atlas_idx,
valid: true,
@@ -568,11 +577,12 @@ impl GraphicCache {
}
} else {
// Create a texture just for this
let (tex, bind, uploads) = create_image(renderer, &image, texture_parameters);
let (tex, bind, upload_batch) =
create_image(renderer, &image, texture_parameters, gpu_premul);
// NOTE: All mutations happen only after the texture creation and upload
// initiation succeeds! (completing the upload does not have any failure cases
// afaik)
let index = textures.insert((tex, bind, uploads));
// initiation succeeds! (completing the upload does not have any
// failure cases afaik)
let index = textures.insert((tex, bind, upload_batch));
CachedDetails::Texture { index, valid: true }
};
@@ -584,77 +594,76 @@ impl GraphicCache {
Some((transformed_aabr(aabr.map(|e| e as f64)), TexId(idx)))
}
/// Runs render passes with alpha premultiplication pipeline to complete any
/// pending uploads.
///
/// This should be called before starting the pass where the ui is rendered.
pub fn complete_premultiply_uploads(&mut self, drawer: &mut crate::render::Drawer<'_>) {
drawer.run_ui_premultiply_passes(
self.textures
.iter_mut()
.map(|(_tex_id, (texture, _, uploads))| (&*texture, core::mem::take(uploads))),
);
}
}
/// Prepare the graphic into the form that will be uploaded to the GPU.
///
/// For voxel graphics, draws the graphic at the specified dimensions.
///
/// Also can pre-multiplies alpha in images so they can be linearly filtered on
/// the GPU (this is optional since we also have a path to do this
/// premultiplication on the GPU).
/// Alpha premultiplication is necessary so that images can be linearly
/// filtered on the GPU. Premultiplication can occur either here or on the GPU,
/// depending on the size of the image and other factors. If premultiplication
/// on the GPU is needed, the returned bool will be `true`.
fn prepare_graphic<'graphic>(
graphic: &'graphic Graphic,
cache_key: CacheKey,
dims: Vec2<u16>,
premultiply_on_cpu: bool, // TODO: currently unused
keyed_jobs: &mut KeyedJobs<CacheKey, RgbaImage>,
pool: Option<&SlowJobPool>,
) -> Option<Cow<'graphic, RgbaImage>> {
) -> Option<(Cow<'graphic, RgbaImage>, bool)> {
prof_span!("prepare_graphic");
match graphic {
// Short-circuit spawning a job on the threadpool for blank graphics
Graphic::Blank => None,
Graphic::Image(image, _border_color) => {
if premultiply_on_cpu {
keyed_jobs
.spawn(pool, cache_key, || {
let image = Arc::clone(image);
move |_| {
// Image will be rescaled when sampling from it on the GPU so we don't
// need to resize it here.
let mut image = image.to_rgba8();
// TODO: could potentially do this when loading the image and for voxel
// images maybe at some point in the `draw_vox` processing. Or we could
// push it in the other direction and do conversion on the GPU.
premultiply_alpha(&mut image);
image
}
})
.map(|(_, v)| Cow::Owned(v))
} else if let Some(rgba) = image.as_rgba8() {
Some(Cow::Borrowed(rgba))
} else {
// TODO: we should require rgba8 format
warn!("Non-rgba8 image in UI used this may be deprecated.");
Some(Cow::Owned(image.to_rgba8()))
}
// Image will be rescaled when sampling from it on the GPU so we don't
// need to resize it here.
//
// TODO: We could potentially push premultiplication even earlier (e.g. to the
// time of loading images or packaging veloren for distribution).
let mut rgba_cow = image.as_rgba8().map_or_else(
|| {
// TODO: we may want to require loading in as the rgba8 format so we don't have
// to perform conversion here. On the other hand, we can take advantage of
// certain formats to know that alpha premultiplication doesn't need to be
// performed (but we would probably just want to store that with the loaded
// rgba8 format).
Cow::Owned(image.to_rgba8())
},
Cow::Borrowed,
);
// NOTE: We do premultiplication on the main thread since, if it were
// expensive enough to be worth doing in the background, we would just do
// it on the GPU instead. We could still use `rayon` to parallelize this
// work, if needed.
let premultiply_strategy = PremultiplyStrategy::determine(&*rgba_cow);
let needs_gpu_premultiply = match premultiply_strategy {
PremultiplyStrategy::UseGpu => true,
PremultiplyStrategy::NotNeeded => false,
PremultiplyStrategy::UseCpu => {
// NOTE: to_mut will clone the image if it was Cow::Borrowed
premultiply_alpha(rgba_cow.to_mut());
false
},
};
Some((rgba_cow, needs_gpu_premultiply))
},
Graphic::Voxel(segment, trans, sample_strat) => keyed_jobs
.spawn(pool, cache_key, || {
let segment = Arc::clone(segment);
let (trans, sample_strat) = (*trans, *sample_strat);
move |_| {
// TODO: for now we always use CPU premultiplication for these; we may want
// to re-evaluate this after the zoomy worldgen branch is merged (when it is
// clearer whether these jobs go to the background thread pool or not).
// Render voxel model at requested resolution
let mut image = renderer::draw_vox(&segment, dims, trans, sample_strat);
if premultiply_on_cpu {
premultiply_alpha(&mut image);
}
premultiply_alpha(&mut image);
image
}
})
.map(|(_, v)| Cow::Owned(v)),
.map(|(_, v)| (Cow::Owned(v), false)),
}
}
@@ -672,7 +681,11 @@ fn create_image_texture(
renderer: &mut Renderer,
size: Vec2<u32>,
address_mode: Option<wgpu::AddressMode>,
) -> (Texture, UiTextureBindGroup) {
) -> (Arc<Texture>, UiTextureBindGroup) {
// TODO: Right now we have to manually clear images to work around an AMD DX
// bug; for this we use Queue::write_texture, which needs this usage. I think
// this may be fixed in newer wgpu versions that auto-clear the texture.
let workaround_usage = wgpu::TextureUsage::COPY_DST;
let tex_info = wgpu::TextureDescriptor {
label: None,
size: wgpu::Extent3d {
@@ -684,7 +697,10 @@ fn create_image_texture(
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8UnormSrgb,
usage: wgpu::TextureUsage::RENDER_ATTACHMENT | wgpu::TextureUsage::SAMPLED,
usage: wgpu::TextureUsage::RENDER_ATTACHMENT // GPU premultiply
| wgpu::TextureUsage::COPY_DST // CPU premultiply
| wgpu::TextureUsage::SAMPLED // using image in ui rendering
| workaround_usage,
};
let view_info = wgpu::TextureViewDescriptor {
format: Some(tex_info.format),
@@ -701,12 +717,12 @@ fn create_image_texture(
};
let tex = renderer.create_texture_raw(&tex_info, &view_info, &sampler_info);
let bind = renderer.ui_bind_texture(&tex);
(tex, bind)
(Arc::new(tex), bind)
}
fn create_atlas_texture(
renderer: &mut Renderer,
) -> (SimpleAtlasAllocator, (Texture, UiTextureBindGroup)) {
) -> (SimpleAtlasAllocator, (Arc<Texture>, UiTextureBindGroup)) {
let size = atlas_size(renderer);
// Note: here we assume the max texture size is under i32::MAX.
let atlas = SimpleAtlasAllocator::new(size2(size.x as i32, size.y as i32));
@@ -726,23 +742,34 @@ fn aabr_from_alloc_rect(rect: guillotiere::Rectangle) -> Aabr<u16> {
fn upload_image(
renderer: &mut Renderer,
aabr: Aabr<u16>,
target_texture_uploads: &mut Vec<UiPremultiplyUpload>,
target_texture: &Arc<Texture>,
upload_batch: &mut UiUploadBatchId,
image: &RgbaImage,
aabr: Aabr<u16>,
premultiply_on_gpu: bool,
) {
let aabr = aabr.map(u32::from);
// Check that this image and the target aabr are the same size (otherwise there
// is a bug in this module).
debug_assert_eq!(aabr.size().into_tuple(), image.dimensions());
let offset = aabr.min.into_array();
// TODO: can we transparently have cpu based version behind this (actually this
// would introduce more complexity to be able to do it in the background,
// but we could to it not in the background here especially for smaller
// things this would work well)
let upload = UiPremultiplyUpload::prepare(renderer, image, offset);
target_texture_uploads.push(upload);
//todo!()
debug_assert_eq!(aabr.map(u32::from).size().into_tuple(), image.dimensions());
if premultiply_on_gpu {
*upload_batch =
renderer.ui_premultiply_upload(target_texture, *upload_batch, image, aabr.min);
} else {
let aabr = aabr.map(u32::from);
let offset = aabr.min.into_array();
let size = aabr.size().into_array();
// upload directly
renderer.update_texture(
&*target_texture,
offset,
size,
// NOTE: Rgba texture, so each pixel is 4 bytes, ergo this cannot fail.
// We make the cast parameters explicit for clarity.
bytemuck::cast_slice::<u8, [u8; 4]>(
&(&**image)[..size[0] as usize * size[1] as usize * 4],
),
)
}
}
// This is used for border_color.is_some() images (ie the map image).
@@ -750,7 +777,8 @@ fn create_image(
renderer: &mut Renderer,
image: &RgbaImage,
texture_parameters: TextureParameters,
) -> (Texture, UiTextureBindGroup, Vec<UiPremultiplyUpload>) {
premultiply_on_gpu: bool,
) -> (Arc<Texture>, UiTextureBindGroup, UiUploadBatchId) {
let (tex, bind) = create_image_texture(
renderer,
texture_parameters.size.map(u32::from),
@@ -760,17 +788,82 @@ fn create_image(
//.map(|c| c.into_array().into()),
.map(|_| wgpu::AddressMode::ClampToBorder),
);
let mut uploads = Vec::new();
let mut upload_batch = UiUploadBatchId::default();
let aabr = Aabr {
min: Vec2::zero(),
max: texture_parameters.size,
};
upload_image(renderer, aabr, &mut uploads, image);
(tex, bind, uploads)
upload_image(
renderer,
&tex,
&mut upload_batch,
image,
aabr,
premultiply_on_gpu,
);
(tex, bind, upload_batch)
}
// CPU-side alpha premultiplication implementation.
pub struct PremultiplyLookupTable {
alpha: [u16; 256],
// This is for both colors that are always below the linear transform threshold (of the
// transform between linear/non-linear srgb) and colors that start above the threshold when
// transforming into linear srgb and then fall below it after being multiplied by alpha (before
// being transformed out of linear srgb).
color: [u16; 256],
}
impl Default for PremultiplyLookupTable {
fn default() -> Self {
#[rustfmt::skip]
fn accurate_to_linear(c: u8) -> f32 {
let c = c as f32 / 255.0;
// https://en.wikipedia.org/wiki/SRGB#Transformation
if c <= 0.04045 {
c / 12.92
} else {
// 0.055 * 255 ~= 14 (the `14` used in `premultiply_alpha` below)
((c + 0.055) / 1.055).powf(2.4)
}
}
use core::array;
let alpha = array::from_fn(|alpha| {
// NOTE: u16::MAX + 1 here relies on the max alpha being short-circuited (and
// not using this table). We multiply by this factor since it is a
// power of 2, which means later demultiplying it will optimize to a
// bitshift.
(((alpha as f32 / 255.0).powf(1.0 / 2.4) * (u16::MAX as f32 + 1.0)) + 0.5) as u16
});
let color = array::from_fn(|color| {
(if color <= 10 {
// <= 10 means the transform is linear!
color as f32 / 255.0
} else {
// Here the transform into linear srgb isn't linear but the transform out of it is.
//
// This is transform into and out of linear srgb with the theoretical alpha
// multiplication factored out.
accurate_to_linear(color as u8) * 12.92
}
// take advantage of the precision offered by u16
* (1 << 13) as f32
// round to the nearest integer when the cast truncates
+ 0.5) as u16
});
Self { alpha, color }
}
}
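// Worked example (illustrative, not from the commit): color = 200, alpha =
// 128. The alpha table yields ((128.0 / 255.0).powf(1.0 / 2.4) * 65536.0 +
// 0.5) as u16 == 49177. Below, `predicted` is then
// (49177 * (200 + 14) + 32433) / 65536 == 161, which is >= 9 + 14, so the
// fast path returns 161 - 14 == 147 -- matching the exact
// srgb -> linear -> *alpha -> srgb result of 147.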
fn premultiply_alpha(image: &mut RgbaImage) {
use fast_srgb8::{f32x4_to_srgb8, srgb8_to_f32};
prof_span!("premultiply alpha");
lazy_static::lazy_static! {
static ref LOOKUP: PremultiplyLookupTable = Default::default();
}
let lookup = &*LOOKUP;
// TODO: Apparently it is possible for ImageBuffer raw vec to have more pixels
// than the dimensions of the actual image (I don't think we actually have
// this occurring but we should probably fix other spots that use the raw
@@ -779,52 +872,200 @@ fn premultiply_alpha(image: &mut RgbaImage) {
let dims = image.dimensions();
let image_buffer_len = dims.0 as usize * dims.1 as usize * 4;
let (arrays, end) = (&mut **image)[..image_buffer_len].as_chunks_mut::<{ 4 * 4 }>();
// Rgba8 has 4 bytes per pixel they should be no remainder when dividing by 4.
// Rgba8 has 4 bytes per pixel there should be no remainder when dividing by 4.
let (end, _) = end.as_chunks_mut::<4>();
end.iter_mut().for_each(|pixel| {
let alpha = pixel[3];
if alpha == 0 {
*pixel = [0; 4];
} else if alpha != 255 {
let linear_alpha = alpha as f32 / 255.0;
let [r, g, b] = core::array::from_fn(|i| srgb8_to_f32(pixel[i]) * linear_alpha);
let srgb8 = f32x4_to_srgb8([r, g, b, 0.0]);
(pixel[0], pixel[1], pixel[3]) = (srgb8[0], srgb8[1], srgb8[3]);
return;
} else if alpha == 255 {
return;
};
for color in &mut pixel[..3] {
let predicted = ((lookup.alpha[alpha as usize] as u32) * (*color as u32 + 14) + 32433)
/ (u16::MAX as u32 + 1);
let multiplied_color = (if predicted < 9 + 14 {
(lookup.color[*color as usize] as u32 * alpha as u32 + 4096) >> 13
} else {
predicted - 14
}) as u8;
*color = multiplied_color;
}
});
arrays.iter_mut().for_each(|pixelx4| {
use core::simd::{f32x4, u8x4, Simd};
let alpha = Simd::from_array([pixelx4[3], pixelx4[7], pixelx4[11], pixelx4[15]]);
if alpha == Simd::splat(0) {
*pixelx4 = [0; 16];
} else if alpha != Simd::splat(255) {
let linear_simd = |array: [u8; 4]| Simd::from_array(array.map(srgb8_to_f32));
// Pack rgb components from the 4th pixel into the last position for each of
// the other 3 pixels.
let a = linear_simd([pixelx4[0], pixelx4[1], pixelx4[2], pixelx4[12]]);
let b = linear_simd([pixelx4[4], pixelx4[5], pixelx4[6], pixelx4[13]]);
let c = linear_simd([pixelx4[8], pixelx4[9], pixelx4[10], pixelx4[14]]);
let linear_alpha = alpha.cast::<f32>() * Simd::splat(1.0 / 255.0);
// Multiply by alpha and then convert back into srgb8.
let premultiply = |x: f32x4, i| {
let mut a = f32x4::splat(linear_alpha[i]);
a[3] = linear_alpha[3];
u8x4::from_array(f32x4_to_srgb8((x * a).to_array()))
};
let pa = premultiply(a, 0);
let pb = premultiply(b, 1);
let pc = premultiply(c, 2);
(pixelx4[0], pixelx4[1], pixelx4[2]) = (pa[0], pa[1], pa[2]);
(pixelx4[4], pixelx4[5], pixelx4[6]) = (pb[0], pb[1], pb[2]);
(pixelx4[8], pixelx4[9], pixelx4[10]) = (pc[0], pc[1], pc[2]);
(pixelx4[12], pixelx4[13], pixelx4[14]) = (pa[3], pb[3], pc[3]);
// Short-circuit for alpha == 0 or 255
// This adds ~7 us (worst case) for a 256x256 image.
// Best case is decreased to 20 us total time.
if pixelx4[3] == pixelx4[7] && pixelx4[3] == pixelx4[11] && pixelx4[3] == pixelx4[15] {
if pixelx4[3] == 0 {
*pixelx4 = [0; 16];
return;
} else if pixelx4[3] == u8::MAX {
return;
}
}
})
// Lookup transformed alpha values for each pixel first.
// Putting this here seems to make things slightly faster.
let factors = [
lookup.alpha[pixelx4[3] as usize],
lookup.alpha[pixelx4[7] as usize],
lookup.alpha[pixelx4[11] as usize],
lookup.alpha[pixelx4[15] as usize],
];
for pixel_index in 0..4 {
let alpha_factor = factors[pixel_index];
let alpha = pixelx4[pixel_index * 4 + 3];
// Putting this code outside the loop makes things take ~25% less time.
let color_factors = [
lookup.color[pixelx4[pixel_index * 4 + 0] as usize] as u32 * alpha as u32 + 4096,
lookup.color[pixelx4[pixel_index * 4 + 1] as usize] as u32 * alpha as u32 + 4096,
lookup.color[pixelx4[pixel_index * 4 + 2] as usize] as u32 * alpha as u32 + 4096,
];
for i in 0..3 {
let color = &mut pixelx4[pixel_index * 4 + i];
// Loosely based on transform to linear and back (above threshold) (this is
// where use of 14 comes from).
// `32433` selected via trial and error to reduce the number of mismatches.
// `/ (u16::MAX as u32 + 1)` transforms back to `u8` precision (we add 1 so it
// will be a division by a power of 2 which optimizes well).
let predicted =
((alpha_factor as u32) * (*color as u32 + 14) + 32433) / (u16::MAX as u32 + 1);
let multiplied_color = (if predicted < 9 + 14 {
// Here we handle two cases:
// 1. When the transform starts and ends as linear.
// 2. When the color is over the linear threshold for the transform into linear
// space but below this threshold when transforming back out (due to being
// multiplied with a small alpha).
// (in both cases the result is linearly related to alpha and we can encode how
// it is related to the color in a lookup table)
// NOTE: 212 is the largest color value used here (when alpha isn't 0)
color_factors[i] >> 13
} else {
predicted - 14
}) as u8;
*color = multiplied_color;
}
}
});
}
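// Illustrative sanity check (not part of this commit): fully transparent
// pixels are zeroed and fully opaque pixels pass through untouched via the
// short-circuits above.
//
//     let mut img = image::RgbaImage::from_pixel(2, 2, image::Rgba([200, 100, 50, 0]));
//     premultiply_alpha(&mut img);
//     assert!(img.pixels().all(|p| p.0 == [0; 4]));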
// Next step: Handling invalidation / removal of old textures when
// replace_graphic is used under new resizing scheme.
//
// TODO: does screenshot texture have COPY_DST? I don't think it needs this.
/// Strategy for how alpha premultiplication will be applied to an image.
enum PremultiplyStrategy {
UseCpu,
UseGpu,
// Image is fully opaque.
NotNeeded,
}
impl PremultiplyStrategy {
#[rustfmt::skip] // please don't format comment with 'ns/pixel' to a separate line from the value
fn determine(image: &RgbaImage) -> Self {
// TODO: Would be useful to re-time this after a wgpu update.
//
// Thresholds below are based on timing measurements of the CPU-based premultiplication
// vs the overhead of interacting with the GPU API to perform premultiplication on the GPU.
// These timings are quite circumstantial and could vary between machines, wgpu updates,
// and changes to the structure of the GPU based path.
//
// GPU path costs (For calculations I used `57.6 us` as a roughly reasonable estimate of
// total time here but that can vary lower and higher. Everything is a bit imprecise here
// so I won't list individual timings. The key takeaway is that this can be made more
// efficient by avoiding the create/drop of a texture, texture view, and bind group for
// each image. Also, if we didn't need a separate render pass for each target image that
// would be helpful as well. Using compute passes and passing data in as a raw buffer may
// help with both of these but initial attempts with that ran into issues (e.g. when we get
// the ability to have non-srgb views of srgb textures that will be useful)):
// * create/drop texture
// * create/drop texture view
// * create/drop bind group
// * run render pass (NOTE: if many images are processed at once with the same target
// texture this portion of the cost can be split between them)
//
// CPU path costs:
// * clone image (0.17 ns/pixel (benchmark) - 0.73 ns/pixel (in voxygen))
// * run premultiplication (0.305 ns/pixel (when shortcircuits are always hit) -
// 3.81 ns/pixel (with random alpha))
//
// Shared costs include:
// * write_texture
// * (optional) check for fraction of shortcircuit blocks in image (0.223 ns/pixel)
//
// `ALWAYS_CPU_THRESHOLD` is roughly:
// ("cost of GPU path" + "shortcircuit count cost") / "worst case cost of CPU path per pixel"
//
// `ALWAYS_GPU_THRESHOLD` is NOT: "cost of GPU path" / "best case cost of CPU path per pixel"
// since the cost of checking for whether the CPU path is better at this quantity of pixels
// becomes more than the amount of overhead we are willing to add to the worst case
// scenario where we run the short-circuit count check and end up using the GPU path. The
// currently selected value of 200x200 adds at most about ~20% of the cost of the GPU path.
// (TODO: maybe we could have the check bail out early if the results aren't looking
// favorable for the CPU path and/or sample a random subset of the pixels).
//
// `CHECKED_THRESHOLD` is roughly: "cost of GPU path" / "best case cost of CPU path per pixel"
const ALWAYS_CPU_THRESHOLD: usize = 120 * 120;
const ALWAYS_GPU_THRESHOLD: usize = 200 * 200;
const CHECKED_THRESHOLD: usize = 240 * 240;
let dims = image.dimensions();
let pixel_count = dims.0 as usize * dims.1 as usize;
if pixel_count <= ALWAYS_CPU_THRESHOLD {
Self::UseCpu
} else if pixel_count > ALWAYS_GPU_THRESHOLD {
Self::UseGpu
} else if let Some(fraction) = fraction_shortcircuit_blocks(image) {
// This seems correct...?
// TODO: I think we technically can exit the fraction checking early if we know the
// total fraction value will be over: (threshold - ALWAYS_CPU_THRESHOLD) /
// (CHECKED_THRESHOLD - ALWAYS_CPU_THRESHOLD).
let threshold = fraction * CHECKED_THRESHOLD as f32
+ (1.0 - fraction) * ALWAYS_CPU_THRESHOLD as f32;
if pixel_count as f32 <= threshold {
Self::UseCpu
} else {
Self::UseGpu
}
} else {
Self::NotNeeded
}
}
}
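// Worked example of the checked middle band (illustrative): if
// `fraction_shortcircuit_blocks` returns 0.5, the cutoff interpolates to
// 0.5 * (240 * 240) + 0.5 * (120 * 120) = 36_000 pixels, so e.g. a 180x180
// image (32_400 pixels, within the checked band) still takes the CPU path.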
/// Useful to estimate the cost of premultiplying alpha in the provided image via
/// the CPU method.
///
/// Computes the fraction of 4-pixel chunks that are fully translucent or
/// opaque. Returns `None` if no premultiplication is needed (i.e. all alpha
/// values are 255).
fn fraction_shortcircuit_blocks(image: &RgbaImage) -> Option<f32> {
let dims = image.dimensions();
let pixel_count = dims.0 as usize * dims.1 as usize;
let (arrays, end) = (&**image)[..pixel_count * 4].as_chunks::<{ 4 * 4 }>();
// Rgba8 has 4 bytes per pixel there should be no remainder when dividing by 4.
let (end, _) = end.as_chunks::<4>();
let end_is_opaque = end.iter().all(|pixel| pixel[3] == 255);
// 14.6 us for 256x256 image
let num_chunks = arrays.len();
let mut num_translucent = 0;
let mut num_opaque = 0;
arrays.iter().for_each(|pixelx4| {
let v = u128::from_ne_bytes(*pixelx4);
// NOTE: with `from_ne_bytes` on a little-endian target, the alpha byte
// (index 3 of each pixel) lands in the high byte of each 32-bit lane.
let alpha_mask = 0xFF000000_FF000000_FF000000_FF000000;
let masked = v & alpha_mask;
if masked == 0 {
num_translucent += 1;
} else if masked == alpha_mask {
num_opaque += 1;
}
});
if num_chunks == num_opaque && num_translucent == 0 && end_is_opaque {
None
} else {
Some((num_translucent as f32 + num_opaque as f32) / num_chunks as f32)
}
}

View File

@@ -8,6 +8,9 @@ use glyph_brush::GlyphBrushBuilder;
use std::cell::{RefCell, RefMut};
use vek::*;
// TODO: probably make cache fields that have mut getters into just public
// fields
// Multiplied by current window size
const GLYPH_CACHE_SIZE: u32 = 1;
// Glyph cache tolerances