diff options
Diffstat (limited to 'src/video_core')
-rw-r--r-- | src/video_core/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/video_core/engines/const_buffer_info.h | 17 | ||||
-rw-r--r-- | src/video_core/engines/kepler_compute.h | 7 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_3d.cpp | 4 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_3d.h | 8 | ||||
-rw-r--r-- | src/video_core/memory_manager.h | 4 | ||||
-rw-r--r-- | src/video_core/rasterizer_cache.h | 3 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache.cpp | 2 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_device.cpp | 3 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_global_cache.cpp | 1 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 80 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 15 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 31 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/renderer_opengl.cpp | 1 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 49 | ||||
-rw-r--r-- | src/video_core/shader/decode/other.cpp | 18 | ||||
-rw-r--r-- | src/video_core/shader/node.h | 7 |
17 files changed, 160 insertions, 91 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 2d4caa08d..f8b67cbe1 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -3,6 +3,7 @@ add_library(video_core STATIC dma_pusher.h debug_utils/debug_utils.cpp debug_utils/debug_utils.h + engines/const_buffer_info.h engines/engine_upload.cpp engines/engine_upload.h engines/fermi_2d.cpp diff --git a/src/video_core/engines/const_buffer_info.h b/src/video_core/engines/const_buffer_info.h new file mode 100644 index 000000000..d8f672462 --- /dev/null +++ b/src/video_core/engines/const_buffer_info.h @@ -0,0 +1,17 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" + +namespace Tegra::Engines { + +struct ConstBufferInfo { + GPUVAddr address; + u32 size; + bool enabled; +}; + +} // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 5250b8d9b..6a3309a2c 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -140,7 +140,7 @@ public: BitField<0, 16, u32> shared_alloc; - BitField<0, 31, u32> block_dim_x; + BitField<16, 16, u32> block_dim_x; union { BitField<0, 16, u32> block_dim_y; BitField<16, 16, u32> block_dim_z; @@ -153,7 +153,7 @@ public: INSERT_PADDING_WORDS(0x8); - struct { + struct ConstBufferConfig { u32 address_low; union { BitField<0, 8, u32> address_high; @@ -163,7 +163,8 @@ public: return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) | address_low); } - } const_buffer_config[8]; + }; + std::array<ConstBufferConfig, NumConstBuffers> const_buffer_config; union { BitField<0, 20, u32> local_pos_alloc; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 39968d403..08d553696 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -396,12 +396,10 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) { auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; auto& bind_data = regs.cb_bind[static_cast<std::size_t>(stage)]; - auto& buffer = shader.const_buffers[bind_data.index]; - ASSERT(bind_data.index < Regs::MaxConstBuffers); + auto& buffer = shader.const_buffers[bind_data.index]; buffer.enabled = bind_data.valid.Value() != 0; - buffer.index = bind_data.index; buffer.address = regs.const_buffer.BufferAddress(); buffer.size = regs.const_buffer.cb_size; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index f342c78e6..13e314944 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -15,6 +15,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/math_util.h" +#include "video_core/engines/const_buffer_info.h" #include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" #include "video_core/macro_interpreter.h" @@ -1112,13 +1113,6 @@ public: static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable"); struct State { - struct ConstBufferInfo { - GPUVAddr address; - u32 index; - u32 size; - bool enabled; - }; - struct ShaderStageInfo { std::array<ConstBufferInfo, Regs::MaxConstBuffers> const_buffers; }; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 113f9d8f3..43a84bd52 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -163,8 +163,8 @@ private: static constexpr u64 page_size{1 << page_bits}; static constexpr u64 page_mask{page_size - 1}; - /// Address space in bits, this is fairly arbitrary but sufficiently large. - static constexpr u32 address_space_width{39}; + /// Address space in bits, according to Tegra X1 TRM + static constexpr u32 address_space_width{40}; /// Start address for mapping, this is fairly arbitrary but must be non-zero. static constexpr GPUVAddr address_space_base{0x100000}; /// End of address space, based on address space in bits. diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h index 0c4ea1494..6de1597a2 100644 --- a/src/video_core/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache.h @@ -169,6 +169,8 @@ protected: object->MarkAsModified(false, *this); } + std::recursive_mutex mutex; + private: /// Returns a list of cached objects from the specified memory region, ordered by access time std::vector<T> GetSortedObjectsFromRegion(CacheAddr addr, u64 size) { @@ -208,5 +210,4 @@ private: IntervalCache interval_cache; ///< Cache of objects u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing VideoCore::RasterizerInterface& rasterizer; - std::recursive_mutex mutex; }; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 48b86f3bd..2b9bd142e 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -23,6 +23,7 @@ OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size) GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment, bool cache) { + std::lock_guard lock{mutex}; auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); // Cache management is a big overhead, so only cache entries with a given size. @@ -62,6 +63,7 @@ GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std:: GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment) { + std::lock_guard lock{mutex}; AlignBuffer(alignment); std::memcpy(buffer_ptr, raw_pointer, size); const GLintptr uploaded_offset = buffer_offset; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 65a88b06c..a48e14d2e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -43,8 +43,9 @@ bool Device::TestVariableAoffi() { // This is a unit test, please ignore me on apitrace bug reports. uniform sampler2D tex; uniform ivec2 variable_offset; +out vec4 output_attribute; void main() { - gl_Position = textureOffset(tex, vec2(0), variable_offset); + output_attribute = textureOffset(tex, vec2(0), variable_offset); } )"; const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)}; diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp index ea4a593af..d5e385151 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ b/src/video_core/renderer_opengl/gl_global_cache.cpp @@ -76,6 +76,7 @@ GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( const GLShader::GlobalMemoryEntry& global_region, Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) { + std::lock_guard lock{mutex}; auto& gpu{Core::System::GetInstance().GPU()}; auto& memory_manager{gpu.MemoryManager()}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index ca410287a..d77426067 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -322,9 +322,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage); - SetupConstBuffers(stage_enum, shader, program_handle, base_bindings); - SetupGlobalRegions(stage_enum, shader, program_handle, base_bindings); - SetupTextures(stage_enum, shader, program_handle, base_bindings); + SetupDrawConstBuffers(stage_enum, shader); + SetupGlobalRegions(stage_enum, shader); + SetupTextures(stage_enum, shader, base_bindings); // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen @@ -776,57 +776,55 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader, GLuint program_handle, - BaseBindings base_bindings) { +void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& gpu = system.GPU(); - const auto& maxwell3d = gpu.Maxwell3D(); - const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)]; + const auto stage_index = static_cast<std::size_t>(stage); + const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index]; const auto& entries = shader->GetShaderEntries().const_buffers; // Upload only the enabled buffers from the 16 constbuffers of each shader stage for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { - const auto& used_buffer = entries[bindpoint]; - const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()]; - - if (!buffer.enabled) { - // Set values to zero to unbind buffers - bind_ubo_pushbuffer.Push(0, 0, 0); - continue; - } + const auto& entry = entries[bindpoint]; + SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry); + } +} - std::size_t size = 0; +void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer, + const GLShader::ConstBufferEntry& entry) { + if (!buffer.enabled) { + // Set values to zero to unbind buffers + bind_ubo_pushbuffer.Push(0, 0, 0); + return; + } - if (used_buffer.IsIndirect()) { - // Buffer is accessed indirectly, so upload the entire thing - size = buffer.size; + std::size_t size; + if (entry.IsIndirect()) { + // Buffer is accessed indirectly, so upload the entire thing + size = buffer.size; - if (size > MaxConstbufferSize) { - LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size, - MaxConstbufferSize); - size = MaxConstbufferSize; - } - } else { - // Buffer is accessed directly, upload just what we use - size = used_buffer.GetSize(); + if (size > MaxConstbufferSize) { + LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size, + MaxConstbufferSize); + size = MaxConstbufferSize; } + } else { + // Buffer is accessed directly, upload just what we use + size = entry.GetSize(); + } - // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 - // UBO alignment requirements. - size = Common::AlignUp(size, sizeof(GLvec4)); - ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big"); - - const GLintptr const_buffer_offset = - buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); + // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 + // UBO alignment requirements. + size = Common::AlignUp(size, sizeof(GLvec4)); + ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big"); - bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size); - } + const std::size_t alignment = device.GetUniformBufferAlignment(); + const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment); + bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size); } void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader, GLenum primitive_mode, - BaseBindings base_bindings) { + const Shader& shader) { const auto& entries = shader->GetShaderEntries().global_memory_entries; for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { const auto& entry{entries[bindpoint]}; @@ -840,7 +838,7 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade } void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader, - GLuint program_handle, BaseBindings base_bindings) { + BaseBindings base_bindings) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& gpu = system.GPU(); const auto& maxwell3d = gpu.Maxwell3D(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 2817f65c9..f7671ff5d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -17,6 +17,7 @@ #include <glad/glad.h> #include "common/common_types.h" +#include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" @@ -27,6 +28,7 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" +#include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/utils.h" @@ -105,17 +107,20 @@ private: bool preserve_contents = true, std::optional<std::size_t> single_color_target = {}); /// Configures the current constbuffers to use for the draw command. - void SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader, - GLuint program_handle, BaseBindings base_bindings); + void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader); + + /// Configures a constant buffer. + void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer, + const GLShader::ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader, GLenum primitive_mode, - BaseBindings base_bindings); + const Shader& shader); /// Configures the current textures to use for the draw command. void SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader, - GLuint program_handle, BaseBindings base_bindings); + BaseBindings base_bindings); /// Syncs the viewport and depth range to match the guest state void SyncViewport(OpenGLState& current_state); diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 739477cc9..7dc2e0560 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -143,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) { return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0); } +constexpr const char* GetFlowStackPrefix(MetaStackClass stack) { + switch (stack) { + case MetaStackClass::Ssy: + return "ssy"; + case MetaStackClass::Pbk: + return "pbk"; + } + return {}; +} + +std::string FlowStackName(MetaStackClass stack) { + return fmt::format("{}_flow_stack", GetFlowStackPrefix(stack)); +} + +std::string FlowStackTopName(MetaStackClass stack) { + return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); +} + class GLSLDecompiler final { public: explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, @@ -173,8 +191,10 @@ public: // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems // unlikely that shaders will use 20 nested SSYs and PBKs. constexpr u32 FLOW_STACK_SIZE = 20; - code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE); - code.AddLine("uint flow_stack_top = 0u;"); + for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) { + code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE); + code.AddLine("uint {} = 0u;", FlowStackTopName(stack)); + } code.AddLine("while (true) {{"); ++code.scope; @@ -1438,15 +1458,18 @@ private: } std::string PushFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); const auto target = std::get_if<ImmediateNode>(&*operation[0]); UNIMPLEMENTED_IF(!target); - code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue()); + code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack), + target->GetValue()); return {}; } std::string PopFlowStack(Operation operation) { - code.AddLine("jmp_to = flow_stack[--flow_stack_top];"); + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack)); code.AddLine("break;"); return {}; } diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 3451d321d..aafd6f31b 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -18,7 +18,6 @@ #include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" -#include "core/tracer/recorder.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/renderer_opengl.h" diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 547883425..33ad9764a 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -132,20 +132,16 @@ public: branch_labels.push_back(label); } - // TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely - // that shaders will use 20 nested SSYs and PBKs. - constexpr u32 FLOW_STACK_SIZE = 20; - const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE)); jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint), spv::StorageClass::Function, Constant(t_uint, first_address))); - flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type), - spv::StorageClass::Function, ConstantNull(flow_stack_type))); - flow_stack_top = - Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0))); + std::tie(ssy_flow_stack, ssy_flow_stack_top) = CreateFlowStack(); + std::tie(pbk_flow_stack, pbk_flow_stack_top) = CreateFlowStack(); Name(jmp_to, "jmp_to"); - Name(flow_stack, "flow_stack"); - Name(flow_stack_top, "flow_stack_top"); + Name(ssy_flow_stack, "ssy_flow_stack"); + Name(ssy_flow_stack_top, "ssy_flow_stack_top"); + Name(pbk_flow_stack, "pbk_flow_stack"); + Name(pbk_flow_stack_top, "pbk_flow_stack_top"); Emit(OpBranch(loop_label)); Emit(loop_label); @@ -952,6 +948,7 @@ private: const auto target = std::get_if<ImmediateNode>(&*operation[0]); ASSERT(target); + const auto [flow_stack, flow_stack_top] = GetFlowStack(operation); const Id current = Emit(OpLoad(t_uint, flow_stack_top)); const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1))); const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current)); @@ -962,6 +959,7 @@ private: } Id PopFlowStack(Operation operation) { + const auto [flow_stack, flow_stack_top] = GetFlowStack(operation); const Id current = Emit(OpLoad(t_uint, flow_stack_top)); const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1))); const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous)); @@ -1172,6 +1170,31 @@ private: Emit(skip_label); } + std::tuple<Id, Id> CreateFlowStack() { + // TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely + // that shaders will use 20 nested SSYs and PBKs. + constexpr u32 FLOW_STACK_SIZE = 20; + constexpr auto storage_class = spv::StorageClass::Function; + + const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE)); + const Id stack = Emit(OpVariable(TypePointer(storage_class, flow_stack_type), storage_class, + ConstantNull(flow_stack_type))); + const Id top = Emit(OpVariable(t_func_uint, storage_class, Constant(t_uint, 0))); + return std::tie(stack, top); + } + + std::pair<Id, Id> GetFlowStack(Operation operation) { + const auto stack_class = std::get<MetaStackClass>(operation.GetMeta()); + switch (stack_class) { + case MetaStackClass::Ssy: + return {ssy_flow_stack, ssy_flow_stack_top}; + case MetaStackClass::Pbk: + return {pbk_flow_stack, pbk_flow_stack_top}; + } + UNREACHABLE(); + return {}; + } + static constexpr OperationDecompilersArray operation_decompilers = { &SPIRVDecompiler::Assign, @@ -1414,8 +1437,10 @@ private: Id execute_function{}; Id jmp_to{}; - Id flow_stack_top{}; - Id flow_stack{}; + Id ssy_flow_stack_top{}; + Id pbk_flow_stack_top{}; + Id ssy_flow_stack{}; + Id pbk_flow_stack{}; Id continue_label{}; std::map<u32, Id> labels; }; diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 6fc07f213..d46a8ab82 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -109,22 +109,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, "Constant buffer flow is not supported"); - // The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the - // target of the jump that the SYNC instruction will make. The SSY opcode has a similar - // structure to the BRA opcode. + // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC. const u32 target = pc + instr.bra.GetBranchTarget(); - bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target))); + bb.push_back( + Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target))); break; } case OpCode::Id::PBK: { UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, "Constant buffer PBK is not supported"); - // PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but - // using SYNC on a PBK address will kill the shader execution. We don't emulate this because - // it's very unlikely a driver will emit such invalid shader. + // PBK pushes to a stack the address where BRK will jump to. const u32 target = pc + instr.bra.GetBranchTarget(); - bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target))); + bb.push_back( + Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target))); break; } case OpCode::Id::SYNC: { @@ -133,7 +131,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { static_cast<u32>(cc)); // The SYNC opcode jumps to the address previously set by the SSY opcode - bb.push_back(Operation(OperationCode::PopFlowStack)); + bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy)); break; } case OpCode::Id::BRK: { @@ -142,7 +140,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { static_cast<u32>(cc)); // The BRK opcode jumps to the address previously set by the PBK opcode - bb.push_back(Operation(OperationCode::PopFlowStack)); + bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk)); break; } case OpCode::Id::IPA: { diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index c002f90f9..3cfb911bb 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -174,6 +174,11 @@ enum class InternalFlag { Amount = 4, }; +enum class MetaStackClass { + Ssy, + Pbk, +}; + class OperationNode; class ConditionalNode; class GprNode; @@ -285,7 +290,7 @@ struct MetaTexture { }; /// Parameters that modify an operation but are not part of any particular operand -using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>; +using Meta = std::variant<MetaArithmetic, MetaTexture, MetaStackClass, Tegra::Shader::HalfType>; /// Holds any kind of operation that can be done in the IR class OperationNode final { |