17 files changed, 160 insertions, 91 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 2d4caa08d..f8b67cbe1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,7 @@ add_library(video_core STATIC
     dma_pusher.h
     debug_utils/debug_utils.cpp
     debug_utils/debug_utils.h
+    engines/const_buffer_info.h
     engines/engine_upload.cpp
     engines/engine_upload.h
     engines/fermi_2d.cpp
diff --git a/src/video_core/engines/const_buffer_info.h b/src/video_core/engines/const_buffer_info.h
new file mode 100644
index 000000000..d8f672462
--- /dev/null
+++ b/src/video_core/engines/const_buffer_info.h
@@ -0,0 +1,17 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+struct ConstBufferInfo {
+    GPUVAddr address; ///< GPU virtual address the constant buffer is bound to
+    u32 size;         ///< Size of the bound buffer (presumably in bytes, confirm against cb_size)
+    bool enabled;     ///< True when the binding was flagged as valid, false when it is unbound
+};
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 5250b8d9b..6a3309a2c 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -140,7 +140,7 @@ public:
 
         BitField<0, 16, u32> shared_alloc;
 
-        BitField<0, 31, u32> block_dim_x;
+        BitField<16, 16, u32> block_dim_x;
         union {
             BitField<0, 16, u32> block_dim_y;
             BitField<16, 16, u32> block_dim_z;
@@ -153,7 +153,7 @@ public:
 
         INSERT_PADDING_WORDS(0x8);
 
-        struct {
+        struct ConstBufferConfig {
             u32 address_low;
             union {
                 BitField<0, 8, u32> address_high;
@@ -163,7 +163,8 @@ public:
                 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
                                              address_low);
             }
-        } const_buffer_config[8];
+        };
+        std::array<ConstBufferConfig, NumConstBuffers> const_buffer_config;
 
         union {
             BitField<0, 20, u32> local_pos_alloc;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 39968d403..08d553696 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -396,12 +396,10 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
     auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
     auto& bind_data = regs.cb_bind[static_cast<std::size_t>(stage)];
 
-    auto& buffer = shader.const_buffers[bind_data.index];
-
     ASSERT(bind_data.index < Regs::MaxConstBuffers);
+    auto& buffer = shader.const_buffers[bind_data.index];
 
     buffer.enabled = bind_data.valid.Value() != 0;
-    buffer.index = bind_data.index;
     buffer.address = regs.const_buffer.BufferAddress();
     buffer.size = regs.const_buffer.cb_size;
 }
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index f342c78e6..13e314944 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -15,6 +15,7 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/math_util.h"
+#include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 #include "video_core/macro_interpreter.h"
@@ -1112,13 +1113,6 @@ public:
     static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");
 
     struct State {
-        struct ConstBufferInfo {
-            GPUVAddr address;
-            u32 index;
-            u32 size;
-            bool enabled;
-        };
-
         struct ShaderStageInfo {
             std::array<ConstBufferInfo, Regs::MaxConstBuffers> const_buffers;
         };
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 113f9d8f3..43a84bd52 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -163,8 +163,8 @@ private:
     static constexpr u64 page_size{1 << page_bits};
     static constexpr u64 page_mask{page_size - 1};
 
-    /// Address space in bits, this is fairly arbitrary but sufficiently large.
-    static constexpr u32 address_space_width{39};
+    /// Address space in bits, according to Tegra X1 TRM
+    static constexpr u32 address_space_width{40};
     /// Start address for mapping, this is fairly arbitrary but must be non-zero.
     static constexpr GPUVAddr address_space_base{0x100000};
     /// End of address space, based on address space in bits.
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
index 0c4ea1494..6de1597a2 100644
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -169,6 +169,8 @@ protected:
         object->MarkAsModified(false, *this);
     }
 
+    std::recursive_mutex mutex;
+
 private:
     /// Returns a list of cached objects from the specified memory region, ordered by access time
     std::vector<T> GetSortedObjectsFromRegion(CacheAddr addr, u64 size) {
@@ -208,5 +210,4 @@ private:
     IntervalCache interval_cache; ///< Cache of objects
     u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing
     VideoCore::RasterizerInterface& rasterizer;
-    std::recursive_mutex mutex;
 };
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 48b86f3bd..2b9bd142e 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -23,6 +23,7 @@ OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)
 
 GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment,
                                       bool cache) {
+    std::lock_guard lock{mutex};
     auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
 
     // Cache management is a big overhead, so only cache entries with a given size.
@@ -62,6 +63,7 @@ GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::
 
 GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size,
                                           std::size_t alignment) {
+    std::lock_guard lock{mutex};
     AlignBuffer(alignment);
     std::memcpy(buffer_ptr, raw_pointer, size);
     const GLintptr uploaded_offset = buffer_offset;
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 65a88b06c..a48e14d2e 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -43,8 +43,9 @@ bool Device::TestVariableAoffi() {
 // This is a unit test, please ignore me on apitrace bug reports.
 uniform sampler2D tex;
 uniform ivec2 variable_offset;
+out vec4 output_attribute;
 void main() {
-    gl_Position = textureOffset(tex, vec2(0), variable_offset);
+    output_attribute = textureOffset(tex, vec2(0), variable_offset);
 }
 )";
     const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)};
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp
index ea4a593af..d5e385151 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_global_cache.cpp
@@ -76,6 +76,7 @@ GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
 GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
     const GLShader::GlobalMemoryEntry& global_region,
     Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) {
+    std::lock_guard lock{mutex};
 
     auto& gpu{Core::System::GetInstance().GPU()};
     auto& memory_manager{gpu.MemoryManager()};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index ca410287a..d77426067 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -322,9 +322,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         }
 
         const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
-        SetupConstBuffers(stage_enum, shader, program_handle, base_bindings);
-        SetupGlobalRegions(stage_enum, shader, program_handle, base_bindings);
-        SetupTextures(stage_enum, shader, program_handle, base_bindings);
+        SetupDrawConstBuffers(stage_enum, shader);
+        SetupGlobalRegions(stage_enum, shader);
+        SetupTextures(stage_enum, shader, base_bindings);
 
         // Workaround for Intel drivers.
         // When a clip distance is enabled but not set in the shader it crops parts of the screen
@@ -776,57 +776,55 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     return true;
 }
 
-void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                         const Shader& shader, GLuint program_handle,
-                                         BaseBindings base_bindings) {
+void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                             const Shader& shader) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto& gpu = system.GPU();
-    const auto& maxwell3d = gpu.Maxwell3D();
-    const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)];
+    const auto stage_index = static_cast<std::size_t>(stage);
+    const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
     const auto& entries = shader->GetShaderEntries().const_buffers;
 
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
-        const auto& used_buffer = entries[bindpoint];
-        const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
-
-        if (!buffer.enabled) {
-            // Set values to zero to unbind buffers
-            bind_ubo_pushbuffer.Push(0, 0, 0);
-            continue;
-        }
+        const auto& entry = entries[bindpoint];
+        SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
+    }
+}
 
-        std::size_t size = 0;
+void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
+                                        const GLShader::ConstBufferEntry& entry) {
+    if (!buffer.enabled) {
+        // Set values to zero to unbind buffers
+        bind_ubo_pushbuffer.Push(0, 0, 0);
+        return;
+    }
 
-        if (used_buffer.IsIndirect()) {
-            // Buffer is accessed indirectly, so upload the entire thing
-            size = buffer.size;
+    std::size_t size;
+    if (entry.IsIndirect()) {
+        // Buffer is accessed indirectly, so upload the entire thing
+        size = buffer.size;
 
-            if (size > MaxConstbufferSize) {
-                LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
-                            MaxConstbufferSize);
-                size = MaxConstbufferSize;
-            }
-        } else {
-            // Buffer is accessed directly, upload just what we use
-            size = used_buffer.GetSize();
+        if (size > MaxConstbufferSize) {
+            LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
+                        MaxConstbufferSize);
+            size = MaxConstbufferSize;
         }
+    } else {
+        // Buffer is accessed directly, upload just what we use
+        size = entry.GetSize();
+    }
 
-        // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
-        // UBO alignment requirements.
-        size = Common::AlignUp(size, sizeof(GLvec4));
-        ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
-
-        const GLintptr const_buffer_offset =
-            buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
+    // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
+    // UBO alignment requirements.
+    size = Common::AlignUp(size, sizeof(GLvec4));
+    ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big");
 
-        bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size);
-    }
+    const std::size_t alignment = device.GetUniformBufferAlignment();
+    const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment);
+    bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size);
 }
 
 void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                          const Shader& shader, GLenum primitive_mode,
-                                          BaseBindings base_bindings) {
+                                          const Shader& shader) {
     const auto& entries = shader->GetShaderEntries().global_memory_entries;
     for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& entry{entries[bindpoint]};
@@ -840,7 +838,7 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade
 }
 
 void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
-                                     GLuint program_handle, BaseBindings base_bindings) {
+                                     BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& gpu = system.GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 2817f65c9..f7671ff5d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -17,6 +17,7 @@
 #include <glad/glad.h>
 
 #include "common/common_types.h"
+#include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
@@ -27,6 +28,7 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
+#include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/utils.h"
@@ -105,17 +107,20 @@ private:
         bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});
 
     /// Configures the current constbuffers to use for the draw command.
-    void SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                           GLuint program_handle, BaseBindings base_bindings);
+    void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                               const Shader& shader);
+
+    /// Queues the binding of a single constant buffer, or a null binding when it is disabled.
+    void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
+                          const GLShader::ConstBufferEntry& entry);
 
     /// Configures the current global memory entries to use for the draw command.
     void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                            const Shader& shader, GLenum primitive_mode,
-                            BaseBindings base_bindings);
+                            const Shader& shader);
 
     /// Configures the current textures to use for the draw command.
     void SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                       GLuint program_handle, BaseBindings base_bindings);
+                       BaseBindings base_bindings);
 
     /// Syncs the viewport and depth range to match the guest state
     void SyncViewport(OpenGLState& current_state);
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 739477cc9..7dc2e0560 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -143,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) {
     return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
 }
 
+/// Maps a flow stack class to the prefix of its GLSL variable names; nullptr when unmatched.
+constexpr const char* GetFlowStackPrefix(MetaStackClass stack) {
+    const bool is_ssy = stack == MetaStackClass::Ssy;
+    return is_ssy ? "ssy" : stack == MetaStackClass::Pbk ? "pbk" : nullptr;
+}
+
+/// Builds the name of the GLSL array that backs the given flow stack.
+std::string FlowStackName(MetaStackClass stack) {
+    const char* const prefix = GetFlowStackPrefix(stack);
+    return fmt::format("{}_flow_stack", prefix);
+}
+
+/// Builds the name of the GLSL variable holding the top index of the given flow stack.
+std::string FlowStackTopName(MetaStackClass stack) {
+    const char* const prefix = GetFlowStackPrefix(stack);
+    return fmt::format("{}_flow_stack_top", prefix);
+}
+
 class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
@@ -173,8 +191,10 @@ public:
         // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
         // unlikely that shaders will use 20 nested SSYs and PBKs.
         constexpr u32 FLOW_STACK_SIZE = 20;
-        code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
-        code.AddLine("uint flow_stack_top = 0u;");
+        for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
+            code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
+            code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+        }
 
         code.AddLine("while (true) {{");
         ++code.scope;
@@ -1438,15 +1458,18 @@ private:
     }
 
     std::string PushFlowStack(Operation operation) {
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
+        code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
+                     target->GetValue());
         return {};
     }
 
     std::string PopFlowStack(Operation operation) {
-        code.AddLine("jmp_to = flow_stack[--flow_stack_top];");
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+        code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
         code.AddLine("break;");
         return {};
     }
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 3451d321d..aafd6f31b 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -18,7 +18,6 @@
 #include "core/perf_stats.h"
 #include "core/settings.h"
 #include "core/telemetry_session.h"
-#include "core/tracer/recorder.h"
 #include "video_core/morton.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 547883425..33ad9764a 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -132,20 +132,16 @@ public:
             branch_labels.push_back(label);
         }
 
-        // TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
-        // that shaders will use 20 nested SSYs and PBKs.
-        constexpr u32 FLOW_STACK_SIZE = 20;
-        const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
         jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
                                  spv::StorageClass::Function, Constant(t_uint, first_address)));
-        flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type),
-                                     spv::StorageClass::Function, ConstantNull(flow_stack_type)));
-        flow_stack_top =
-            Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0)));
+        std::tie(ssy_flow_stack, ssy_flow_stack_top) = CreateFlowStack();
+        std::tie(pbk_flow_stack, pbk_flow_stack_top) = CreateFlowStack();
 
         Name(jmp_to, "jmp_to");
-        Name(flow_stack, "flow_stack");
-        Name(flow_stack_top, "flow_stack_top");
+        Name(ssy_flow_stack, "ssy_flow_stack");
+        Name(ssy_flow_stack_top, "ssy_flow_stack_top");
+        Name(pbk_flow_stack, "pbk_flow_stack");
+        Name(pbk_flow_stack_top, "pbk_flow_stack_top");
 
         Emit(OpBranch(loop_label));
         Emit(loop_label);
@@ -952,6 +948,7 @@ private:
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         ASSERT(target);
 
+        const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
         const Id current = Emit(OpLoad(t_uint, flow_stack_top));
         const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
         const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
@@ -962,6 +959,7 @@ private:
     }
 
     Id PopFlowStack(Operation operation) {
+        const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
         const Id current = Emit(OpLoad(t_uint, flow_stack_top));
         const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
         const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
@@ -1172,6 +1170,31 @@ private:
         Emit(skip_label);
     }
 
+    /// Emits a flow stack array and its top index with Function storage; returns {stack, top}.
+    std::tuple<Id, Id> CreateFlowStack() {
+        // TODO(Rodrigo): The real flow stack depth is unknown; 20 nested SSYs/PBKs seem unlikely.
+        constexpr u32 FLOW_STACK_SIZE = 20;
+        constexpr auto function_class = spv::StorageClass::Function;
+        const Id array_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
+        const Id pointer_type = TypePointer(function_class, array_type);
+        const Id null_array = ConstantNull(array_type);
+        const Id stack_id = Emit(OpVariable(pointer_type, function_class, null_array));
+        const Id top_id = Emit(OpVariable(t_func_uint, function_class, Constant(t_uint, 0)));
+        return {stack_id, top_id};
+    }
+
+    /// Selects the {stack, top} id pair matching the MetaStackClass stored in the operation.
+    std::pair<Id, Id> GetFlowStack(Operation operation) {
+        const auto meta_class = std::get<MetaStackClass>(operation.GetMeta());
+        if (meta_class == MetaStackClass::Ssy) {
+            return {ssy_flow_stack, ssy_flow_stack_top};
+        } else if (meta_class == MetaStackClass::Pbk) {
+            return {pbk_flow_stack, pbk_flow_stack_top};
+        }
+        UNREACHABLE();
+        return {};
+    }
+
     static constexpr OperationDecompilersArray operation_decompilers = {
         &SPIRVDecompiler::Assign,
 
@@ -1414,8 +1437,10 @@ private:
 
     Id execute_function{};
     Id jmp_to{};
-    Id flow_stack_top{};
-    Id flow_stack{};
+    Id ssy_flow_stack_top{};
+    Id pbk_flow_stack_top{};
+    Id ssy_flow_stack{};
+    Id pbk_flow_stack{};
     Id continue_label{};
     std::map<u32, Id> labels;
 };
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index 6fc07f213..d46a8ab82 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -109,22 +109,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer flow is not supported");
 
-        // The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the
-        // target of the jump that the SYNC instruction will make. The SSY opcode has a similar
-        // structure to the BRA opcode.
+        // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target)));
         break;
     }
     case OpCode::Id::PBK: {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer PBK is not supported");
 
-        // PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but
-        // using SYNC on a PBK address will kill the shader execution. We don't emulate this because
-        // it's very unlikely a driver will emit such invalid shader.
+        // PBK pushes to a stack the address where BRK will jump to.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target)));
         break;
     }
     case OpCode::Id::SYNC: {
@@ -133,7 +131,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                              static_cast<u32>(cc));
 
         // The SYNC opcode jumps to the address previously set by the SSY opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
         break;
     }
     case OpCode::Id::BRK: {
@@ -142,7 +140,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                              static_cast<u32>(cc));
 
         // The BRK opcode jumps to the address previously set by the PBK opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
         break;
     }
     case OpCode::Id::IPA: {
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c002f90f9..3cfb911bb 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -174,6 +174,11 @@ enum class InternalFlag {
     Amount = 4,
 };
 
+enum class MetaStackClass {
+    Ssy, ///< Flow stack pushed by the SSY opcode and popped by SYNC
+    Pbk, ///< Flow stack pushed by the PBK opcode and popped by BRK
+};
+
 class OperationNode;
 class ConditionalNode;
 class GprNode;
@@ -285,7 +290,7 @@ struct MetaTexture {
 };
 
 /// Parameters that modify an operation but are not part of any particular operand
-using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
+using Meta = std::variant<MetaArithmetic, MetaTexture, MetaStackClass, Tegra::Shader::HalfType>;
 
 /// Holds any kind of operation that can be done in the IR
 class OperationNode final {