23 files changed, 583 insertions, 186 deletions
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 2cdf1aa7f..b1088af3d 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -5,7 +5,7 @@
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
-#include "core/frontend/scope_acquire_window_context.h"
+#include "core/frontend/scope_acquire_context.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
 #include "video_core/gpu_thread.h"
@@ -27,7 +27,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
         return;
     }
 
-    Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
+    Core::Frontend::ScopeAcquireContext acquire_context{renderer.GetRenderWindow()};
 
     CommandDataContainer next;
     while (state.is_running) {
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index af1bebc4f..5ec99a126 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -35,15 +35,19 @@ public:
     explicit RendererBase(Core::Frontend::EmuWindow& window);
     virtual ~RendererBase();
 
-    /// Swap buffers (render frame)
-    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
-
     /// Initialize the renderer
     virtual bool Init() = 0;
 
     /// Shutdown the renderer
     virtual void ShutDown() = 0;
 
+    /// Finalize rendering the guest frame and draw into the presentation texture
+    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
+
+    /// Draws the latest frame to the window, waiting up to timeout_ms for a frame to arrive
+    /// (renderer-specific implementation)
+    virtual void TryPresent(int timeout_ms) = 0;
+
     // Getter/setter functions:
     // ------------------------
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index e1965fb21..3fcd319fd 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -36,6 +36,7 @@ namespace OpenGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
+using Tegra::Engines::ShaderType;
 using VideoCore::Surface::PixelFormat;
 using VideoCore::Surface::SurfaceTarget;
 using VideoCore::Surface::SurfaceType;
@@ -56,8 +57,7 @@ namespace {
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
-                                               Tegra::Engines::ShaderType shader_type,
-                                               std::size_t index = 0) {
+                                               ShaderType shader_type, std::size_t index = 0) {
     if (entry.IsBindless()) {
         const Tegra::Texture::TextureHandle tex_handle =
             engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset());
@@ -910,15 +910,10 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).sampler;
     for (const auto& entry : shader->GetShaderEntries().samplers) {
-        const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index);
-        if (!entry.IsIndexed()) {
-            const auto texture = GetTextureInfo(maxwell3d, entry, shader_type);
+        const auto shader_type = static_cast<ShaderType>(stage_index);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
             SetupTexture(binding++, texture, entry);
-        } else {
-            for (std::size_t i = 0; i < entry.Size(); ++i) {
-                const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
-                SetupTexture(binding++, texture, entry);
-            }
         }
     }
 }
@@ -928,16 +923,9 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
     for (const auto& entry : kernel->GetShaderEntries().samplers) {
-        if (!entry.IsIndexed()) {
-            const auto texture =
-                GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i);
             SetupTexture(binding++, texture, entry);
-        } else {
-            for (std::size_t i = 0; i < entry.Size(); ++i) {
-                const auto texture =
-                    GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute, i);
-                SetupTexture(binding++, texture, entry);
-            }
         }
     }
 }
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index f0ddfb276..c0aee770f 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -15,6 +15,24 @@ MICROPROFILE_DEFINE(OpenGL_ResourceDeletion, "OpenGL", "Resource Deletion", MP_R
 
 namespace OpenGL {
 
+void OGLRenderbuffer::Create() {
+    if (handle != 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
+    glGenRenderbuffers(1, &handle);
+}
+
+void OGLRenderbuffer::Release() {
+    if (handle == 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteRenderbuffers(1, &handle);
+    OpenGLState::GetCurState().ResetRenderbuffer(handle).Apply();
+    handle = 0;
+}
+
 void OGLTexture::Create(GLenum target) {
     if (handle != 0)
         return;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index 514d1d165..995a4e45e 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -11,6 +11,31 @@
 
 namespace OpenGL {
 
+class OGLRenderbuffer : private NonCopyable {
+public:
+    OGLRenderbuffer() = default;
+
+    OGLRenderbuffer(OGLRenderbuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLRenderbuffer() {
+        Release();
+    }
+
+    OGLRenderbuffer& operator=(OGLRenderbuffer&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
+        return *this;
+    }
+
+    /// Creates a new internal OpenGL resource and stores the handle
+    void Create();
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 class OGLTexture : private NonCopyable {
 public:
     OGLTexture() = default;
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index ab1f7983c..7d3bc1a1f 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -423,6 +423,13 @@ void OpenGLState::ApplyClipControl() {
     }
 }
 
+void OpenGLState::ApplyRenderBuffer() {
+    if (cur_state.renderbuffer != renderbuffer) {
+        cur_state.renderbuffer = renderbuffer;
+        glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer);
+    }
+}
+
 void OpenGLState::ApplyTextures() {
     const std::size_t size = std::size(textures);
     for (std::size_t i = 0; i < size; ++i) {
@@ -478,6 +485,7 @@ void OpenGLState::Apply() {
     ApplyPolygonOffset();
     ApplyAlphaTest();
     ApplyClipControl();
+    ApplyRenderBuffer();
 }
 
 void OpenGLState::EmulateViewportWithScissor() {
@@ -551,4 +559,11 @@ OpenGLState& OpenGLState::ResetFramebuffer(GLuint handle) {
     return *this;
 }
 
+OpenGLState& OpenGLState::ResetRenderbuffer(GLuint handle) {
+    if (renderbuffer == handle) {
+        renderbuffer = 0;
+    }
+    return *this;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 4953eeda2..bce662f2c 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -158,6 +158,8 @@ public:
         GLenum depth_mode = GL_NEGATIVE_ONE_TO_ONE;
     } clip_control;
 
+    GLuint renderbuffer{}; // GL_RENDERBUFFER_BINDING
+
     OpenGLState();
 
     /// Get the currently active OpenGL state
@@ -196,6 +198,7 @@ public:
     void ApplyPolygonOffset();
     void ApplyAlphaTest();
     void ApplyClipControl();
+    void ApplyRenderBuffer();
 
     /// Resets any references to the given resource
     OpenGLState& UnbindTexture(GLuint handle);
@@ -204,6 +207,7 @@ public:
     OpenGLState& ResetPipeline(GLuint handle);
     OpenGLState& ResetVertexArray(GLuint handle);
     OpenGLState& ResetFramebuffer(GLuint handle);
+    OpenGLState& ResetRenderbuffer(GLuint handle);
 
     /// Viewport does not affects glClearBuffer so emulate viewport using scissor test
     void EmulateViewportWithScissor();
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 7ed505628..d3dea3659 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -92,8 +92,32 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         }
     case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
             return GL_UNSIGNED_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return GL_UNSIGNED_SHORT;
+        default:
+            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+            return {};
+        }
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+        switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return GL_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return GL_SHORT;
         default:
             LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
             return {};
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index bba16afaf..a4340b502 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -9,11 +9,11 @@
 #include <glad/glad.h>
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "common/telemetry.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/frontend/emu_window.h"
-#include "core/frontend/scope_acquire_window_context.h"
 #include "core/memory.h"
 #include "core/perf_stats.h"
 #include "core/settings.h"
@@ -24,6 +24,144 @@
 
 namespace OpenGL {
 
+// If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have
+// to wait on available presentation frames.
+constexpr std::size_t SWAP_CHAIN_SIZE = 3;
+
+struct Frame {
+    u32 width{};                      ///< Width of the frame (to detect resize)
+    u32 height{};                     ///< Height of the frame
+    bool color_reloaded{};            ///< Color attachment was recreated (i.e. resized)
+    OpenGL::OGLRenderbuffer color{};  ///< Buffer shared between the render/present FBO
+    OpenGL::OGLFramebuffer render{};  ///< FBO created on the render thread
+    OpenGL::OGLFramebuffer present{}; ///< FBO created on the present thread
+    GLsync render_fence{};            ///< Fence created on the render thread
+    GLsync present_fence{};           ///< Fence created on the presentation thread
+    bool is_srgb{};                   ///< Framebuffer is sRGB or RGB
+};
+
+/**
+ * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
+ * but also make sure that rendering happens at the pace that the frontend dictates. This is a
+ * helper class that the renderer uses to sync frames between the render thread and the presentation
+ * thread
+ */
+class FrameMailbox {
+public:
+    std::mutex swap_chain_lock;
+    std::condition_variable present_cv;
+    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
+    std::queue<Frame*> free_queue;
+    std::deque<Frame*> present_queue;
+    Frame* previous_frame{};
+
+    FrameMailbox() {
+        for (auto& frame : swap_chain) {
+            free_queue.push(&frame);
+        }
+    }
+
+    ~FrameMailbox() {
+        // Lock the mutex, clear out the present and free queues, and notify any waiters that are
+        // blocked, to prevent a deadlock on shutdown
+        std::scoped_lock lock{swap_chain_lock};
+        std::queue<Frame*>().swap(free_queue);
+        present_queue.clear();
+        present_cv.notify_all();
+    }
+
+    void ReloadPresentFrame(Frame* frame, u32 width, u32 height) {
+        frame->present.Release();
+        frame->present.Create();
+        GLint previous_draw_fbo{};
+        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
+        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
+        }
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); // restore caller's draw FBO
+        frame->color_reloaded = false; // present FBO now targets the current color buffer
+    }
+
+    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
+        OpenGLState prev_state = OpenGLState::GetCurState();
+        OpenGLState state = OpenGLState::GetCurState();
+
+        // Recreate the color texture attachment
+        frame->color.Release();
+        frame->color.Create();
+        state.renderbuffer = frame->color.handle;
+        state.Apply();
+        glRenderbufferStorage(GL_RENDERBUFFER, frame->is_srgb ? GL_SRGB8 : GL_RGB8, width, height);
+
+        // Recreate the FBO for the render target
+        frame->render.Release();
+        frame->render.Create();
+        state.draw.read_framebuffer = frame->render.handle;
+        state.draw.draw_framebuffer = frame->render.handle;
+        state.Apply();
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
+        }
+        prev_state.Apply();
+        frame->width = width;
+        frame->height = height;
+        frame->color_reloaded = true;
+    }
+
+    Frame* GetRenderFrame() {
+        std::unique_lock lock{swap_chain_lock};
+
+        // If there are no free frames, we will reuse the oldest render frame
+        if (free_queue.empty()) {
+            auto frame = present_queue.back();
+            present_queue.pop_back();
+            return frame;
+        }
+
+        Frame* frame = free_queue.front();
+        free_queue.pop();
+        return frame;
+    }
+
+    void ReleaseRenderFrame(Frame* frame) {
+        std::unique_lock lock{swap_chain_lock};
+        present_queue.push_front(frame);
+        present_cv.notify_one();
+    }
+
+    Frame* TryGetPresentFrame(int timeout_ms) {
+        std::unique_lock lock{swap_chain_lock};
+        // wait for new entries in the present_queue
+        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
+                            [&] { return !present_queue.empty(); });
+        if (present_queue.empty()) {
+            // timed out waiting for a frame to draw so return the previous frame
+            return previous_frame;
+        }
+
+        // free the previous frame and add it back to the free queue
+        if (previous_frame) {
+            free_queue.push(previous_frame);
+        }
+
+        // the newest entries are pushed to the front of the queue
+        Frame* frame = present_queue.front();
+        present_queue.pop_front();
+        // remove all old entries from the present queue and move them back to the free_queue
+        for (auto f : present_queue) {
+            free_queue.push(f);
+        }
+        present_queue.clear();
+        previous_frame = frame;
+        return frame;
+    }
+};
+
 namespace {
 
 constexpr char vertex_shader[] = R"(
@@ -158,21 +296,91 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
 } // Anonymous namespace
 
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)
-    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system} {}
+    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system},
+      frame_mailbox{std::make_unique<FrameMailbox>()} {}
 
 RendererOpenGL::~RendererOpenGL() = default;
 
+MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64));
+MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128));
+
 void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    render_window.PollEvents();
+
+    if (!framebuffer) {
+        return;
+    }
+
     // Maintain the rasterizer's state as a priority
     OpenGLState prev_state = OpenGLState::GetCurState();
     state.AllDirty();
     state.Apply();
 
+    PrepareRendertarget(framebuffer);
+    RenderScreenshot();
+
+    Frame* frame;
+    {
+        MICROPROFILE_SCOPE(OpenGL_WaitPresent);
+
+        frame = frame_mailbox->GetRenderFrame();
+
+        // Clean up sync objects before drawing
+
+        // INTEL driver workaround. We can't delete the previous render sync object until we are
+        // sure that the presentation is done
+        if (frame->present_fence) {
+            glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED);
+        }
+
+        // delete the draw fence if the frame wasn't presented
+        if (frame->render_fence) {
+            glDeleteSync(frame->render_fence);
+            frame->render_fence = 0;
+        }
+
+        // wait for the presentation to be done
+        if (frame->present_fence) {
+            glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED);
+            glDeleteSync(frame->present_fence);
+            frame->present_fence = 0;
+        }
+    }
+
+    {
+        MICROPROFILE_SCOPE(OpenGL_RenderFrame);
+        const auto& layout = render_window.GetFramebufferLayout();
+
+        // Recreate the frame if the size of the window has changed
+        if (layout.width != frame->width || layout.height != frame->height ||
+            screen_info.display_srgb != frame->is_srgb) {
+            LOG_DEBUG(Render_OpenGL, "Reloading render frame");
+            frame->is_srgb = screen_info.display_srgb;
+            frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height);
+        }
+        state.draw.draw_framebuffer = frame->render.handle;
+        state.Apply();
+        DrawScreen(layout);
+        // Create a fence for the frontend to wait on and swap this frame to OffTex
+        frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+        glFlush();
+        frame_mailbox->ReleaseRenderFrame(frame);
+        m_current_frame++;
+        rasterizer->TickFrame();
+    }
+
+    // Restore the rasterizer state
+    prev_state.AllDirty();
+    prev_state.Apply();
+}
+
+void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) {
     if (framebuffer) {
         // If framebuffer is provided, reload it from memory to a texture
         if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) ||
             screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) ||
-            screen_info.texture.pixel_format != framebuffer->pixel_format) {
+            screen_info.texture.pixel_format != framebuffer->pixel_format ||
+            gl_framebuffer_data.empty()) {
             // Reallocate texture if the framebuffer size has changed.
             // This is expected to not happen very often and hence should not be a
             // performance problem.
@@ -181,22 +389,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
 
         // Load the framebuffer from memory, draw it to the screen, and swap buffers
         LoadFBToScreenInfo(*framebuffer);
-
-        if (renderer_settings.screenshot_requested)
-            CaptureScreenshot();
-
-        DrawScreen(render_window.GetFramebufferLayout());
-
-        rasterizer->TickFrame();
-
-        render_window.SwapBuffers();
     }
-
-    render_window.PollEvents();
-
-    // Restore the rasterizer state
-    prev_state.AllDirty();
-    prev_state.Apply();
 }
 
 void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) {
@@ -418,13 +611,48 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
     DrawScreenTriangles(screen_info, static_cast<float>(screen.left),
                         static_cast<float>(screen.top), static_cast<float>(screen.GetWidth()),
                         static_cast<float>(screen.GetHeight()));
+}
 
-    m_current_frame++;
+void RendererOpenGL::TryPresent(int timeout_ms) {
+    const auto& layout = render_window.GetFramebufferLayout();
+    auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms);
+    if (!frame) {
+        LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present");
+        return;
+    }
+
+    // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a
+    // readback since we won't be doing any blending
+    glClear(GL_COLOR_BUFFER_BIT);
+
+    // Recreate the presentation FBO if the color attachment was changed
+    if (frame->color_reloaded) {
+        LOG_DEBUG(Render_OpenGL, "Reloading present frame");
+        frame_mailbox->ReloadPresentFrame(frame, layout.width, layout.height);
+    }
+    glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED);
+    // INTEL workaround: the render fence is not deleted here; due to driver bugs it is deleted
+    // on the emulation thread instead (see SwapBuffers), which comes without too much penalty
+
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle);
+    glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height,
+                      GL_COLOR_BUFFER_BIT, GL_LINEAR);
+
+    // A timed-out wait re-presents the previous frame, which still owns the fence from its last
+    // present: delete it before overwriting the handle so the sync object is not leaked.
+    // glDeleteSync silently ignores a zero handle.
+    glDeleteSync(frame->present_fence);
+    // Insert fence for the main thread to block on
+    frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+    glFlush();
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+}
 
-void RendererOpenGL::UpdateFramerate() {}
+void RendererOpenGL::RenderScreenshot() {
+    if (!renderer_settings.screenshot_requested) {
+        return;
+    }
 
-void RendererOpenGL::CaptureScreenshot() {
     // Draw the current frame to the screenshot framebuffer
     screenshot_framebuffer.Create();
     GLuint old_read_fb = state.draw.read_framebuffer;
@@ -459,8 +687,6 @@ void RendererOpenGL::CaptureScreenshot() {
 }
 
 bool RendererOpenGL::Init() {
-    Core::Frontend::ScopeAcquireWindowContext acquire_context{render_window};
-
     if (GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
         glDebugMessageCallback(DebugHandler, nullptr);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index b56328a7f..d45e69cbc 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -44,19 +44,23 @@ struct ScreenInfo {
     TextureInfo texture;
 };
 
+struct PresentationTexture {
+    u32 width = 0;
+    u32 height = 0;
+    OGLTexture texture;
+};
+
+class FrameMailbox;
+
 class RendererOpenGL final : public VideoCore::RendererBase {
 public:
     explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system);
     ~RendererOpenGL() override;
 
-    /// Swap buffers (render frame)
-    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
-
-    /// Initialize the renderer
     bool Init() override;
-
-    /// Shutdown the renderer
     void ShutDown() override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
+    void TryPresent(int timeout_ms) override;
 
 private:
     /// Initializes the OpenGL state and creates persistent objects.
@@ -74,10 +78,7 @@ private:
 
     void DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, float h);
 
-    /// Updates the framerate.
-    void UpdateFramerate();
-
-    void CaptureScreenshot();
+    void RenderScreenshot();
 
     /// Loads framebuffer from emulated memory into the active OpenGL texture.
     void LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer);
@@ -87,6 +88,8 @@ private:
     void LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a,
                                     const TextureInfo& texture);
 
+    void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer);
+
     Core::Frontend::EmuWindow& emu_window;
     Core::System& system;
 
@@ -107,6 +110,9 @@ private:
     /// Used for transforming the framebuffer orientation
     Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags;
     Common::Rectangle<int> framebuffer_crop_rect;
+
+    /// Frame presentation mailbox
+    std::unique_ptr<FrameMailbox> frame_mailbox;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index ef66dd141..aad0c895b 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -371,8 +371,22 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
         }
     case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+            return vk::Format::eR8Uscaled;
         case Maxwell::VertexAttribute::Size::Size_8_8:
             return vk::Format::eR8G8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+            return vk::Format::eR8G8B8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return vk::Format::eR8G8B8A8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return vk::Format::eR16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return vk::Format::eR16G16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return vk::Format::eR16G16B16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return vk::Format::eR16G16B16A16Uscaled;
         default:
             break;
         }
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index d5032b432..ddc62bc97 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -106,8 +106,14 @@ RendererVulkan::~RendererVulkan() {
 }
 
 void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    render_window.PollEvents();
+
+    if (!framebuffer) {
+        return;
+    }
+
     const auto& layout = render_window.GetFramebufferLayout();
-    if (framebuffer && layout.width > 0 && layout.height > 0 && render_window.IsShown()) {
+    if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) {
         const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset;
         const bool use_accelerated =
             rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride);
@@ -128,13 +134,16 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
             blit_screen->Recreate();
         }
 
-        render_window.SwapBuffers();
         rasterizer->TickFrame();
     }
 
     render_window.PollEvents();
 }
 
+void RendererVulkan::TryPresent(int /*timeout_ms*/) {
+    // TODO (bunnei): ImplementMe
+}
+
 bool RendererVulkan::Init() {
     PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr{};
     render_window.RetrieveVulkanHandlers(&vkGetInstanceProcAddr, &instance, &surface);
@@ -262,4 +271,4 @@ void RendererVulkan::Report() const {
     telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions);
 }
 
-} // namespace Vulkan
-\ No newline at end of file
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h
index a472c5dc9..f513397f0 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -36,14 +36,10 @@ public:
     explicit RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system);
     ~RendererVulkan() override;
 
-    /// Swap buffers (render frame)
-    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
-
-    /// Initialize the renderer
     bool Init() override;
-
-    /// Shutdown the renderer
     void ShutDown() override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
+    void TryPresent(int timeout_ms) override;
 
 private:
     std::optional<vk::DebugUtilsMessengerEXT> CreateDebugCallback(
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 9d5b8de7a..60f57d83e 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -73,7 +73,7 @@ UniqueDescriptorUpdateTemplate VKComputePipeline::CreateDescriptorUpdateTemplate
     std::vector<vk::DescriptorUpdateTemplateEntry> template_entries;
     u32 binding = 0;
     u32 offset = 0;
-    FillDescriptorUpdateTemplateEntries(device, entries, binding, offset, template_entries);
+    FillDescriptorUpdateTemplateEntries(entries, binding, offset, template_entries);
     if (template_entries.empty()) {
         // If the shader doesn't use descriptor sets, skip template creation.
         return UniqueDescriptorUpdateTemplate{};
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index b155dfb49..6a02403c1 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -97,8 +97,7 @@ UniqueDescriptorUpdateTemplate VKGraphicsPipeline::CreateDescriptorUpdateTemplat
     u32 offset = 0;
     for (const auto& stage : program) {
         if (stage) {
-            FillDescriptorUpdateTemplateEntries(device, stage->entries, binding, offset,
-                                                template_entries);
+            FillDescriptorUpdateTemplateEntries(stage->entries, binding, offset, template_entries);
         }
     }
     if (template_entries.empty()) {
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 7ddf7d3ee..696e4b291 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -36,6 +36,13 @@ using Tegra::Engines::ShaderType;
 
 namespace {
 
+// Short aliases for vk::DescriptorType enumerators; C++20's "using enum" would replace them.
+constexpr auto eUniformBuffer = vk::DescriptorType::eUniformBuffer;
+constexpr auto eStorageBuffer = vk::DescriptorType::eStorageBuffer;
+constexpr auto eUniformTexelBuffer = vk::DescriptorType::eUniformTexelBuffer;
+constexpr auto eCombinedImageSampler = vk::DescriptorType::eCombinedImageSampler;
+constexpr auto eStorageImage = vk::DescriptorType::eStorageImage;
+
 constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
     VideoCommon::Shader::CompileDepth::FullDecompile};
 
@@ -119,23 +126,32 @@ ShaderType GetShaderType(Maxwell::ShaderProgram program) {
     }
 }
 
+/// Appends one layout binding per element of `container`, advancing `binding` for each one.
+template <vk::DescriptorType descriptor_type, class Container>
+void AddBindings(std::vector<vk::DescriptorSetLayoutBinding>& bindings, u32& binding,
+                 vk::ShaderStageFlags stage_flags, const Container& container) {
+    for (u32 index = 0, total = static_cast<u32>(std::size(container)); index < total; ++index) {
+        u32 descriptor_count = 1;
+        if constexpr (descriptor_type == eCombinedImageSampler) {
+            // A combined image sampler may be an array; its entry reports the element count.
+            descriptor_count = container[index].Size();
+        }
+        bindings.emplace_back(binding++, descriptor_type, descriptor_count, stage_flags, nullptr);
+    }
+}
+
 u32 FillDescriptorLayout(const ShaderEntries& entries,
                          std::vector<vk::DescriptorSetLayoutBinding>& bindings,
                          Maxwell::ShaderProgram program_type, u32 base_binding) {
     const ShaderType stage = GetStageFromProgram(program_type);
-    const vk::ShaderStageFlags stage_flags = MaxwellToVK::ShaderStage(stage);
+    const vk::ShaderStageFlags flags = MaxwellToVK::ShaderStage(stage); // Used by every binding.
 
     u32 binding = base_binding;
-    const auto AddBindings = [&](vk::DescriptorType descriptor_type, std::size_t num_entries) {
-        for (std::size_t i = 0; i < num_entries; ++i) {
-            bindings.emplace_back(binding++, descriptor_type, 1, stage_flags, nullptr);
-        }
-    };
-    AddBindings(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size());
-    AddBindings(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size());
-    AddBindings(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size());
-    AddBindings(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size());
-    AddBindings(vk::DescriptorType::eStorageImage, entries.images.size());
+    AddBindings<eUniformBuffer>(bindings, binding, flags, entries.const_buffers);
+    AddBindings<eStorageBuffer>(bindings, binding, flags, entries.global_buffers);
+    AddBindings<eUniformTexelBuffer>(bindings, binding, flags, entries.texel_buffers);
+    AddBindings<eCombinedImageSampler>(bindings, binding, flags, entries.samplers);
+    AddBindings<eStorageImage>(bindings, binding, flags, entries.images);
     return binding;
 }
 
@@ -361,32 +377,53 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
     return {std::move(program), std::move(bindings)};
 }
 
-void FillDescriptorUpdateTemplateEntries(
-    const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset,
-    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) {
-    static constexpr auto entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry));
-    const auto AddEntry = [&](vk::DescriptorType descriptor_type, std::size_t count_) {
-        const u32 count = static_cast<u32>(count_);
-        if (descriptor_type == vk::DescriptorType::eUniformTexelBuffer &&
-            device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) {
-            // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
-            // crash.
-            for (u32 i = 0; i < count; ++i) {
-                template_entries.emplace_back(binding + i, 0, 1, descriptor_type,
-                                              offset + i * entry_size, entry_size);
-            }
-        } else if (count != 0) {
-            template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size);
+/// Appends the update template entries for every element of `container`.
+/// `binding` and `offset` are in/out cursors that end up past the entries added by this call.
+template <vk::DescriptorType descriptor_type, class Container>
+void AddEntry(std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries, u32& binding,
+              u32& offset, const Container& container) {
+    static constexpr u32 entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry));
+    const u32 count = static_cast<u32>(std::size(container));
+
+    if constexpr (descriptor_type == eCombinedImageSampler) {
+        for (u32 i = 0; i < count; ++i) {
+            // One binding per sampler entry, covering `num_samplers` consecutive descriptors.
+            const u32 num_samplers = container[i].Size();
+            template_entries.emplace_back(binding, 0, num_samplers, descriptor_type, offset,
+                                          entry_size);
+            ++binding;
+            offset += num_samplers * entry_size;
         }
-        offset += count * entry_size;
-        binding += count;
-    };
+        return;
+    }
 
-    AddEntry(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size());
-    AddEntry(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size());
-    AddEntry(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size());
-    AddEntry(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size());
-    AddEntry(vk::DescriptorType::eStorageImage, entries.images.size());
+    if constexpr (descriptor_type == eUniformTexelBuffer) {
+        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
+        // crash, so emit one entry per texel buffer. This is done regardless of the driver.
+        for (u32 i = 0; i < count; ++i) {
+            template_entries.emplace_back(binding + i, 0, 1, descriptor_type,
+                                          offset + i * entry_size, entry_size);
+        }
+    } else if (count > 0) {
+        template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size);
+    }
+    offset += count * entry_size;
+    binding += count;
+}
+
+/// Appends the descriptor update template entries of one set of shader entries.
+/// `binding` and `offset` are advanced past the entries that were added, so the function can be
+/// called repeatedly with the same pair of cursors.
+void FillDescriptorUpdateTemplateEntries(
+    const ShaderEntries& entries, u32& binding, u32& offset,
+    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) {
+    // AddEntry takes `binding` before `offset`; passing them in that order keeps each cursor
+    // advancing by its own stride.
+    AddEntry<eUniformBuffer>(template_entries, binding, offset, entries.const_buffers);
+    AddEntry<eStorageBuffer>(template_entries, binding, offset, entries.global_buffers);
+    AddEntry<eUniformTexelBuffer>(template_entries, binding, offset, entries.texel_buffers);
+    AddEntry<eCombinedImageSampler>(template_entries, binding, offset, entries.samplers);
+    AddEntry<eStorageImage>(template_entries, binding, offset, entries.images);
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index 8678fc9c3..92a670cc7 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -194,7 +194,7 @@ private:
 };
 
 void FillDescriptorUpdateTemplateEntries(
-    const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset,
+    const ShaderEntries& entries, u32& binding, u32& offset, // binding/offset: in/out cursors
     std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries);
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 3bf86da87..3fe28c204 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -105,17 +105,20 @@ void TransitionImages(const std::vector<ImageView>& views, vk::PipelineStageFlag
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
-                                               std::size_t stage) {
+                                               std::size_t stage, std::size_t index = 0) {
     const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
     if (entry.IsBindless()) {
         const Tegra::Texture::TextureHandle tex_handle =
             engine.AccessConstBuffer32(stage_type, entry.GetBuffer(), entry.GetOffset());
         return engine.GetTextureInfo(tex_handle);
     }
+    const auto& gpu_profile = engine.AccessGuestDriverProfile(); // Gives the handler size.
+    const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
+    const u32 offset = entry.GetOffset() + entry_offset; // Offset of array element `index`.
     if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
-        return engine.GetStageTexture(stage_type, entry.GetOffset());
+        return engine.GetStageTexture(stage_type, offset);
     } else {
-        return engine.GetTexture(entry.GetOffset());
+        return engine.GetTexture(offset);
     }
 }
 
@@ -836,8 +839,10 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().Maxwell3D();
     for (const auto& entry : entries.samplers) {
-        const auto texture = GetTextureInfo(gpu, entry, stage);
-        SetupTexture(texture, entry);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(gpu, entry, stage, i);
+            SetupTexture(texture, entry);
+        }
     }
 }
 
@@ -886,8 +891,10 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().KeplerCompute();
     for (const auto& entry : entries.samplers) {
-        const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex);
-        SetupTexture(texture, entry);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex, i);
+            SetupTexture(texture, entry);
+        }
     }
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 2da622d15..cfcca5af0 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -69,8 +69,9 @@ struct TexelBuffer {
 
 struct SampledImage {
     Id image_type{};
-    Id sampled_image_type{};
-    Id sampler{};
+    Id sampler_type{};         ///< Sampled image type built from image_type.
+    Id sampler_pointer_type{}; ///< UniformConstant pointer to sampler_type.
+    Id variable{};             ///< Variable holding the sampler, or the array of samplers.
 };
 
 struct StorageImage {
@@ -833,16 +834,20 @@ private:
             constexpr int sampled = 1;
             constexpr auto format = spv::ImageFormat::Unknown;
             const Id image_type = TypeImage(t_float, dim, depth, arrayed, ms, sampled, format);
-            const Id sampled_image_type = TypeSampledImage(image_type);
-            const Id pointer_type =
-                TypePointer(spv::StorageClass::UniformConstant, sampled_image_type);
+            const Id sampler_type = TypeSampledImage(image_type);
+            const Id sampler_pointer_type =
+                TypePointer(spv::StorageClass::UniformConstant, sampler_type);
+            const Id type = sampler.IsIndexed()
+                                ? TypeArray(sampler_type, Constant(t_uint, sampler.Size()))
+                                : sampler_type;
+            const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, type);
             const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
             AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex())));
             Decorate(id, spv::Decoration::Binding, binding++);
             Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
 
-            sampled_images.emplace(sampler.GetIndex(),
-                                   SampledImage{image_type, sampled_image_type, id});
+            sampled_images.emplace(sampler.GetIndex(), SampledImage{image_type, sampler_type,
+                                                                    sampler_pointer_type, id});
         }
         return binding;
     }
@@ -1525,7 +1530,12 @@ private:
         ASSERT(!meta.sampler.IsBuffer());
 
         const auto& entry = sampled_images.at(meta.sampler.GetIndex());
-        return OpLoad(entry.sampled_image_type, entry.sampler);
+        Id sampler = entry.variable;
+        if (meta.sampler.IsIndexed()) {
+            const Id index = AsInt(Visit(meta.index));
+            sampler = OpAccessChain(entry.sampler_pointer_type, sampler, index);
+        }
+        return OpLoad(entry.sampler_type, sampler);
     }
 
     Id GetTextureImage(Operation operation) {
@@ -2211,16 +2221,14 @@ private:
         switch (specialization.attribute_types.at(location)) {
         case Maxwell::VertexAttribute::Type::SignedNorm:
         case Maxwell::VertexAttribute::Type::UnsignedNorm:
+        case Maxwell::VertexAttribute::Type::UnsignedScaled:
+        case Maxwell::VertexAttribute::Type::SignedScaled:
         case Maxwell::VertexAttribute::Type::Float:
             return {Type::Float, t_in_float, t_in_float4};
         case Maxwell::VertexAttribute::Type::SignedInt:
             return {Type::Int, t_in_int, t_in_int4};
         case Maxwell::VertexAttribute::Type::UnsignedInt:
             return {Type::Uint, t_in_uint, t_in_uint4};
-        case Maxwell::VertexAttribute::Type::UnsignedScaled:
-        case Maxwell::VertexAttribute::Type::SignedScaled:
-            UNIMPLEMENTED();
-            return {Type::Float, t_in_float, t_in_float4};
         default:
             UNREACHABLE();
             return {Type::Float, t_in_float, t_in_float4};
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index 90240c765..478394682 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -53,29 +53,27 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
 
         op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);
 
-        // TODO(Rodrigo): Should precise be used when there's a postfactor?
-        Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b);
+        // Scale factors indexed by the raw 3-bit postfactor field (one's complement exponent).
+        // All eight encodings need an entry: 0b111 is negative zero and applies no scaling.
+        static constexpr std::array FmulPostFactor = {
+            1.000f, // None
+            0.500f, // Divide 2
+            0.250f, // Divide 4
+            0.125f, // Divide 8
+            8.000f, // Mul 8
+            4.000f, // Mul 4
+            2.000f, // Mul 2
+            1.000f, // None (negative zero)
+        };
 
         if (instr.fmul.postfactor != 0) {
-            auto postfactor = static_cast<s32>(instr.fmul.postfactor);
-
-            // Postfactor encoded as 3-bit 1's complement in instruction, interpreted with below
-            // logic.
-            if (postfactor >= 4) {
-                postfactor = 7 - postfactor;
-            } else {
-                postfactor = 0 - postfactor;
-            }
-
-            if (postfactor > 0) {
-                value = Operation(OperationCode::FMul, NO_PRECISE, value,
-                                  Immediate(static_cast<f32>(1 << postfactor)));
-            } else {
-                value = Operation(OperationCode::FDiv, NO_PRECISE, value,
-                                  Immediate(static_cast<f32>(1 << -postfactor)));
-            }
+            op_a = Operation(OperationCode::FMul, NO_PRECISE, op_a,
+                             Immediate(FmulPostFactor[instr.fmul.postfactor]));
         }
 
+        // TODO(Rodrigo): Should precise be used when there's a postfactor?
+        Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b);
+
         value = GetSaturatedFloat(value, instr.alu.saturate_d);
 
         SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp
index 21366869d..2fe787d6f 100644
--- a/src/video_core/shader/decode/arithmetic_integer.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer.cpp
@@ -293,44 +293,66 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) {
 
 void ShaderIR::WriteLop3Instruction(NodeBlock& bb, Register dest, Node op_a, Node op_b, Node op_c,
                                     Node imm_lut, bool sets_cc) {
-    constexpr u32 lop_iterations = 32;
-    const Node one = Immediate(1);
-    const Node two = Immediate(2);
-
-    Node value;
-    for (u32 i = 0; i < lop_iterations; ++i) {
-        const Node shift_amount = Immediate(i);
-
-        const Node a = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_c, shift_amount);
-        const Node pack_0 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, one);
-
-        const Node b = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_b, shift_amount);
-        const Node c = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, b, one);
-        const Node pack_1 = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, c, one);
-
-        const Node d = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_a, shift_amount);
-        const Node e = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, d, one);
-        const Node pack_2 = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, e, two);
-
-        const Node pack_01 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, pack_0, pack_1);
-        const Node pack_012 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, pack_01, pack_2);
-
-        const Node shifted_bit =
-            Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, imm_lut, pack_012);
-        const Node bit = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, shifted_bit, one);
-
-        const Node right =
-            Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, bit, shift_amount);
-
-        if (i > 0) {
-            value = Operation(OperationCode::IBitwiseOr, NO_PRECISE, value, right);
-        } else {
-            value = right;
+    const Node lop3_fast = [&](const Node na, const Node nb, const Node nc, const Node ttbl) {
+        Node value = Immediate(0); // Accumulates the OR of every selected term.
+        const ImmediateNode imm = std::get<ImmediateNode>(*ttbl); // Assumes an immediate LUT.
+        if (imm.GetValue() & 0x01) { // LUT bit 0: ~a & ~b & ~c
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
         }
-    }
+        if (imm.GetValue() & 0x02) { // LUT bit 1: ~a & ~b & c
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x04) { // LUT bit 2: ~a & b & ~c
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x08) { // LUT bit 3: ~a & b & c
+            const Node a = Operation(OperationCode::IBitwiseNot, na);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x10) { // LUT bit 4: a & ~b & ~c
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x20) { // LUT bit 5: a & ~b & c
+            const Node b = Operation(OperationCode::IBitwiseNot, nb);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, b);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x40) { // LUT bit 6: a & b & ~c
+            const Node c = Operation(OperationCode::IBitwiseNot, nc);
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        if (imm.GetValue() & 0x80) { // LUT bit 7: a & b & c
+            Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, nb);
+            r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc);
+            value = Operation(OperationCode::IBitwiseOr, value, r);
+        }
+        return value;
+    }(op_a, op_b, op_c, imm_lut); // Invoked right away to produce the result node.
 
-    SetInternalFlagsFromInteger(bb, value, sets_cc);
-    SetRegister(bb, dest, value);
+    SetInternalFlagsFromInteger(bb, lop3_fast, sets_cc);
+    SetRegister(bb, dest, lop3_fast);
 }
 
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index a0a7b9111..a1828546e 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -299,7 +299,7 @@ private:
     u32 index{};  ///< Emulated index given for the this sampler.
     u32 offset{}; ///< Offset in the const buffer from where the sampler is being read.
     u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers).
-    u32 size{};   ///< Size of the sampler if indexed.
+    u32 size{1};  ///< Size of the sampler, 1 by default (presumably the array length if indexed).
 
     Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
     bool is_array{};    ///< Whether the texture is being sampled as an array texture or not.
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index face8c943..15e22b9fa 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -157,13 +157,21 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co
         if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
             return {};
         }
-        // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
-        // register that it uses as operand
-        const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
-        if (!source) {
-            return {};
+        s64 current_cursor = cursor;
+        while (current_cursor > 0) {
+            // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
+            // register that it uses as operand
+            const auto [source, new_cursor] = TrackRegister(gpr, code, current_cursor - 1);
+            current_cursor = new_cursor;
+            if (!source) {
+                continue;
+            }
+            const auto [base_address, index, offset] = TrackCbuf(source, code, current_cursor);
+            if (base_address != nullptr) {
+                return {base_address, index, offset};
+            }
         }
-        return TrackCbuf(source, code, new_cursor);
+        return {};
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
         for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {