veloren/assets/voxygen/shaders/light-shadows-geom.glsl
2020-05-15 14:22:17 +02:00

199 lines
10 KiB
GLSL

// Adapted from https://learnopengl.com/Advanced-Lighting/Shadows/Point-Shadows
// NOTE: We only technically need this for cube map arrays and geometry shader
// instancing.
#version 400 core
// Currently, we only need globals for the max light count (light_shadow_count.x).
#include <globals.glsl>
// Since our output primitive is a triangle strip, we have to render three vertices
// each.
#define VERTICES_PER_FACE 3
// Since we render our depth texture to a cube map, we need to render each face
// six times. If we used other shadow mapping methods with fewer outputs, this would
// shrink considerably.
#define FACES_PER_POINT_LIGHT 6
// If MAX_VERTEX_UNIFORM_COMPONENTS_ARB = 512 on many platforms, and we want a mat4
// for each of 6 directions for each light, 20 is close to the maximum allowable
// size. We could add a final matrix for the directional light of the sun or moon
// to bring us to 126 matrices, which is just 2 off.
//
// To improve this limit, we could do many things, such as:
// - choose an implementation that isn't cube maps (e.g. tetrahedrons or curves;
// if there were an easy way to sample from tetrahedrons, we'd be at 32 * 4 = 128
// exactly, leaving no room for a solar body, though).
// - Do more work in the geometry shader (e.g. just have a single projection
// matrix per light, and derive the different-facing components; or there may be
// other ways of greatly simplifying this). The tradeoff would be losing performance
// here.
// - Use ARB_instanced_arrays and switch lights with indexing, instead of a uniform
// buffer. This would probably work fine (and ARB_instanced_arrays is supported on
// pretty much every platform), but AFAIK it's possible that instanced arrays are
// slower than uniform arraay access on many platforms.
// - Don't try to do everything in one call (break this out into multiple passes).
//
// Actually, according to what I'm reading, MAX_GEOM_UNIFORM_COMPONENTS = 1024, and
// gl_MaxGeometryUniformComponents = 1024.
//
// Also, this only applies to uniforms defined *outside* of uniform blocks, of which
// there can be up to 12 (14 in OpenGL 4.3, which we definitely can't support).
// GL_MAX_UNIFORM_BLOCK_SIZE has a minimum of 16384, which *easily* exceeds our usage
// constraints. So this part might not matter.
//
// Other restrictions are easy to satisfy:
//
// gl_MaxGeometryVaryingComponents has a minimum of 64 and is the maximum number of
// varying components; I think this is the number of out components per vertex, which
// is technically 0, but would be 4 if we wrote FragPos. But it might also
// be the *total* number of varying components, in which case if we wrote FragPos
// it would be 4 * 20 * 6 * 3 = 1440, which would blow it out of the water. However,
// I kind of doubt this interpretation because writing FragPos for each of 18 vertices,
// as the original shader did, already yields 4 * 18 = 72, and it seems unlikely that
// the original example exceeds OpenGL limits.
//
// gl_MaxGeometryOutputComponents has a minimum of 128 and is the maximum number of
// components allowed in out variables; we easily fall under this since we actually
// have 0 of these. However, if we were to write FragPos for each vertex, it *might*
// cause us to exceed this limit, depending on whether it refers to the total output
// component count *including* varying components, or not. See the previous
// discussion; since 72 < 128 it's more plausible that this interpretation might be
// correct, but hopefully it's not.
//
// gl_MaxGeometryInputComponents has a minimum of 64 and we easily fall under that
// limit (I'm actually not sure we even have any user-defined input components?).
//
// gl_MaxGeometryTextureImageUnits = 16 and we have no texture image units (or maybe
// 1, the one we bound?). This might come into play if we were to have attached
// cubemaps instead of a single cubemap array, in which case it would limit us to
// 16 lights *regardless* of any of the fixes mentioned above (i.e., we'd just have
// to split up draw calls, I think).
//
// ---
//
// However, there is another limit to consider: GL_MAX_GEOMETRY_OUTPUT_VERTICES. Its
// minimum is 256, and 20 * 6 * 3 = 360, which exceeds that. This introduces a new
// limit of at most 14 point lights.
//
// Another, related limit is GL_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS. This counts
// every component output ("component" is usually a 4-byte field of a vector, but maybe
// this would improve with something like half-floats?), and has a minimum (as of
// OpenGL 3.3) of 1024. Since even builtin outputs gl_Layer count against this total,
// this means we issue 5 components per vertex, and 14 * 6 * 3 * 5 = 1260 > 1024.
//
// Ultimately, we find our maximum output limit of 11, ≤ 1024/5/3/6.
//
// If we choose to reserve a slot for a non-point light (and/or other uniforms), it
// is just 10, or half what we got from VERTICES_PER_FACE (we could also round down to
// 8 as a power of 2, if we had to).
//
// Unlike the input limits, whwich we can get around with "clever" solutions, it seems
// likely that the only real way to defeat the vertex limits is to use instancing of
// some sort (be it geometry shader or otherwise). This would restrict us to OpenGL
// 4.0 or above.
//
// A further consideration (were we to switch to OpenGL 4.1-supported features, but
// actually it is often supported on 3.3 hardware with ARB_viewport_array--whereas
// geometry shader instancing is *not* supported on any 3.3 hardware, so would actually
// require us to upgrade) would be setting gl_ViewportIndex. The main reason to consider
// this is that it allows specifying a separate scissor rectangle per viewport. This
// introduces two new constraints. Firstly, it adds an extra component to each vertex
// (lowering our maximum to 9 faces per light ≤ 1024/6/3/6, or 8 if we want to support a
// directional light).
//
// Secondly, a new constant (MAX_VIEWPORTS) is introduced, which would restrict the
// total number of active viewports; the minimum value for this is 16. While this may
// not seem all that relevant since our current hard limit is 11, the difference is that
// this limit would apply *across* instanced calls (since it may be a "global"
// restriction, tied to the OpenGL context; this means it couldn't even be a multiple
// frame buffer thing, as there is usually one per window). This would also tie in
// with gl_MaxGeometryTextureImageUnits, I guess.
//
// --
//
// I just realized tht using cube map arrays at all bumps our required OpenGL
// version to 4.0, so let's just do instancing...
//
// The instancing limit on MAX_GEOMETRY_SHADER_INVOCATIONS has a minimum of 32, which
// would be sufficient to run through all 32 lights with a different cube map and
// completely removes any imits on ight count.
//
// This should instantly bring us below all relevant limits in all cases considered
// except for the two that would require 16. Unfortunately, 32 is also the *maximum*
// number of point lights, which is much higher than the usual value, and the instance
// count has to be a constant. If we were to instead geometry-shader-instance each
// *face*, we'd get a maximum light count of 56 ≤ 1024/6/3, which is not as elegant
// but is easily higher than 32. So, let's try using that instead.
//
// It is *possible* that using instancing on the *vertex* shader with the (dynamically
// uniform) total number of instances set to the actual number of point lights, would
// improve performance, since it would give us a 1:1 vertex input:output ratio, which
// might be optimized in hardware.
//
// It also seems plausible that constructing a separate geometry shader with values
// from 1 to 32 would be worthwhile, but that seems a little extreme.
//
// ---
//
// Since wgpu doesn't support geometry shaders anyway, it seems likely that we'll have
// to do the multiple draw calls, anyway... I don't think gl_Layer can be set from
// outside a geometry shader. But in wgpu, such a thing is much cheaper, anyway.
#define MAX_POINT_LIGHTS 32
// We use geometry shader instancing to construct each face separately.
#define MAX_LAYER_VERTICES_PER_FACE (MAX_POINT_LIGHTS * VERTICES_PER_FACE)
#define MAX_LAYER_FACES (MAX_POINT_LIGHTS * FACES_PER_POINT_LIGHT)
layout (triangles, invocations = 6) in;
layout (triangle_strip, max_vertices = /*MAX_LAYER_VERTICES_PER_FACE*/96) out;
struct ShadowLocals {
mat4 shadowMatrices;
};
layout (std140)
uniform u_light_shadows {
ShadowLocals shadowMats[/*MAX_LAYER_FACES*/192];
};
// NOTE: We choose not to output FragPos currently to save on space limitations
// (see extensive documentation above). However, as these limitations have been
// relaxed (unless the total of all our varying output components can't exceed
// 128, which would mean FragPos would sum to 4 * 3 * 32 = 384; this could be
// remedied only by setting MAX_POINT_LIGHTS to ), we might enable it again soon.
//
out vec4 FragPos; // FragPos from GS (output per emitvertex)
flat out int FragLayer; // Current layer
void main() {
// NOTE: Assuming that light_shadow_count.x < MAX_POINT_LIGHTS. We could min
// it, but that might make this less optimized, and I'd like to keep this loop as
// optimized as is reasonably possible.
int face = gl_InvocationID;
for (int layer = 0; layer < light_shadow_count.x; ++layer)
{
// We use instancing here in order to increase the number of emitted vertices.
// int face = gl_InvocationID;
// for(int face = 0; face < FACES_PER_POINT_LIGHT; ++face)
// {
int layer_face = layer * FACES_PER_POINT_LIGHT + face;
for(int i = 0; i < VERTICES_PER_FACE; ++i) // for each triangle vertex
{
// NOTE: See above, we don't make FragPos a uniform.
FragPos = gl_in[i].gl_Position;
FragLayer = layer;
// vec4 FragPos = gl_in[i].gl_Position;
gl_Layer = layer_face; // built-in variable that specifies to which face we render.
gl_Position = shadowMats[layer_face].shadowMatrices * FragPos;
EmitVertex();
}
EndPrimitive();
// }
}
}