49 files changed, 1343 insertions, 591 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 1e31a2900..1e010e4da 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(video_core STATIC
     dma_pusher.h
     debug_utils/debug_utils.cpp
     debug_utils/debug_utils.h
+    engines/engine_upload.cpp
+    engines/engine_upload.h
     engines/fermi_2d.cpp
     engines/fermi_2d.h
     engines/kepler_compute.cpp
@@ -36,6 +38,8 @@ add_library(video_core STATIC
     renderer_base.h
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
+    renderer_opengl/gl_device.cpp
+    renderer_opengl/gl_device.h
     renderer_opengl/gl_global_cache.cpp
     renderer_opengl/gl_global_cache.h
     renderer_opengl/gl_primitive_assembler.cpp
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 046d047cb..036e66f05 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -57,8 +57,8 @@ bool DmaPusher::Step() {
 
     // Push buffer non-empty, read a word
     command_headers.resize(command_list_header.size);
-    gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(),
-                                  command_list_header.size * sizeof(u32));
+    gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
+                                        command_list_header.size * sizeof(u32));
 
     for (const CommandHeader& command_header : command_headers) {
 
@@ -105,6 +105,8 @@ bool DmaPusher::Step() {
                 dma_state.non_incrementing = false;
                 dma_increment_once = true;
                 break;
+            default:
+                break;
             }
         }
     }
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
new file mode 100644
index 000000000..f8aa4ff55
--- /dev/null
+++ b/src/video_core/engines/engine_upload.cpp
@@ -0,0 +1,48 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/memory_manager.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines::Upload {
+
+State::State(MemoryManager& memory_manager, Registers& regs)
+    : memory_manager(memory_manager), regs(regs) {}
+
+void State::ProcessExec(const bool is_linear) {
+    write_offset = 0;
+    copy_size = regs.line_length_in * regs.line_count;
+    inner_buffer.resize(copy_size);
+    this->is_linear = is_linear;
+}
+
+void State::ProcessData(const u32 data, const bool is_last_call) {
+    const u32 sub_copy_size = std::min(4U, copy_size - write_offset);
+    std::memcpy(&inner_buffer[write_offset], &data, sub_copy_size);
+    write_offset += sub_copy_size;
+    if (!is_last_call) {
+        return;
+    }
+    const GPUVAddr address{regs.dest.Address()};
+    if (is_linear) {
+        memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
+    } else {
+        UNIMPLEMENTED_IF(regs.dest.z != 0);
+        UNIMPLEMENTED_IF(regs.dest.depth != 1);
+        UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
+        UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
+        const std::size_t dst_size = Tegra::Texture::CalculateSize(
+            true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
+        tmp_buffer.resize(dst_size);
+        memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
+        Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
+                                      regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
+                                      tmp_buffer.data());
+        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+    }
+}
+
+} // namespace Tegra::Engines::Upload
diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h
new file mode 100644
index 000000000..9c6e0d21c
--- /dev/null
+++ b/src/video_core/engines/engine_upload.h
@@ -0,0 +1,75 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines::Upload {
+
+struct Registers {
+    u32 line_length_in;
+    u32 line_count;
+
+    struct {
+        u32 address_high;
+        u32 address_low;
+        u32 pitch;
+        union {
+            BitField<0, 4, u32> block_width;
+            BitField<4, 4, u32> block_height;
+            BitField<8, 4, u32> block_depth;
+        };
+        u32 width;
+        u32 height;
+        u32 depth;
+        u32 z;
+        u32 x;
+        u32 y;
+
+        GPUVAddr Address() const {
+            return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
+        }
+
+        u32 BlockWidth() const {
+            return 1U << block_width.Value();
+        }
+
+        u32 BlockHeight() const {
+            return 1U << block_height.Value();
+        }
+
+        u32 BlockDepth() const {
+            return 1U << block_depth.Value();
+        }
+    } dest;
+};
+
+class State {
+public:
+    State(MemoryManager& memory_manager, Registers& regs);
+    ~State() = default;
+
+    void ProcessExec(const bool is_linear);
+    void ProcessData(const u32 data, const bool is_last_call);
+
+private:
+    u32 write_offset = 0;
+    u32 copy_size = 0;
+    std::vector<u8> inner_buffer;
+    std::vector<u8> tmp_buffer;
+    bool is_linear = false;
+    Registers& regs;
+    MemoryManager& memory_manager;
+};
+
+} // namespace Tegra::Engines::Upload
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 2e51b7f13..45f59a4d9 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -21,6 +21,12 @@ class RasterizerInterface;
 
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as G80_2D. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
+ */
+
 #define FERMI2D_REG_INDEX(field_name)                                                              \
     (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
 
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index b1d950460..7404a8163 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -4,12 +4,21 @@
 
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
-KeplerCompute::KeplerCompute(MemoryManager& memory_manager) : memory_manager{memory_manager} {}
+KeplerCompute::KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                             MemoryManager& memory_manager)
+    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, upload_state{
+                                                                                  memory_manager,
+                                                                                  regs.upload} {}
 
 KeplerCompute::~KeplerCompute() = default;
 
@@ -20,14 +29,34 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
     regs.reg_array[method_call.method] = method_call.argument;
 
     switch (method_call.method) {
+    case KEPLER_COMPUTE_REG_INDEX(exec_upload): {
+        upload_state.ProcessExec(regs.exec_upload.linear != 0);
+        break;
+    }
+    case KEPLER_COMPUTE_REG_INDEX(data_upload): {
+        const bool is_last_call = method_call.IsLastCall();
+        upload_state.ProcessData(method_call.argument, is_last_call);
+        if (is_last_call) {
+            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+        }
+        break;
+    }
     case KEPLER_COMPUTE_REG_INDEX(launch):
-        // Abort execution since compute shaders can be used to alter game memory (e.g. CUDA
-        // kernels)
-        UNREACHABLE_MSG("Compute shaders are not implemented");
+        ProcessLaunch();
         break;
     default:
         break;
     }
 }
 
+void KeplerCompute::ProcessLaunch() {
+
+    const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
+    memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
+                                   LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));
+
+    const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
+    LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
+}
+
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index fb6cdf432..5250b8d9b 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -6,22 +6,40 @@
 
 #include <array>
 #include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 
+namespace Core {
+class System;
+}
+
 namespace Tegra {
 class MemoryManager;
 }
 
+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as GK104_Compute. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
+ */
+
 #define KEPLER_COMPUTE_REG_INDEX(field_name)                                                       \
     (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
 
 class KeplerCompute final {
 public:
-    explicit KeplerCompute(MemoryManager& memory_manager);
+    explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                           MemoryManager& memory_manager);
     ~KeplerCompute();
 
     static constexpr std::size_t NumConstBuffers = 8;
@@ -31,30 +49,181 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0xAF);
+                INSERT_PADDING_WORDS(0x60);
+
+                Upload::Registers upload;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec_upload;
+
+                u32 data_upload;
+
+                INSERT_PADDING_WORDS(0x3F);
+
+                struct {
+                    u32 address;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8));
+                    }
+                } launch_desc_loc;
+
+                INSERT_PADDING_WORDS(0x1);
 
                 u32 launch;
 
-                INSERT_PADDING_WORDS(0xC48);
+                INSERT_PADDING_WORDS(0x4A7);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tsc;
+
+                INSERT_PADDING_WORDS(0x3);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tic;
+
+                INSERT_PADDING_WORDS(0x22);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } code_loc;
+
+                INSERT_PADDING_WORDS(0x3FE);
+
+                u32 texture_const_buffer_index;
+
+                INSERT_PADDING_WORDS(0x374);
             };
             std::array<u32, NUM_REGS> reg_array;
         };
     } regs{};
+
+    struct LaunchParams {
+        static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40;
+
+        INSERT_PADDING_WORDS(0x8);
+
+        u32 program_start;
+
+        INSERT_PADDING_WORDS(0x2);
+
+        BitField<30, 1, u32> linked_tsc;
+
+        BitField<0, 31, u32> grid_dim_x;
+        union {
+            BitField<0, 16, u32> grid_dim_y;
+            BitField<16, 16, u32> grid_dim_z;
+        };
+
+        INSERT_PADDING_WORDS(0x3);
+
+        BitField<0, 16, u32> shared_alloc;
+
+        BitField<0, 31, u32> block_dim_x;
+        union {
+            BitField<0, 16, u32> block_dim_y;
+            BitField<16, 16, u32> block_dim_z;
+        };
+
+        union {
+            BitField<0, 8, u32> const_buffer_enable_mask;
+            BitField<29, 2, u32> cache_layout;
+        } memory_config;
+
+        INSERT_PADDING_WORDS(0x8);
+
+        struct {
+            u32 address_low;
+            union {
+                BitField<0, 8, u32> address_high;
+                BitField<15, 17, u32> size;
+            };
+            GPUVAddr Address() const {
+                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
+                                             address_low);
+            }
+        } const_buffer_config[8];
+
+        union {
+            BitField<0, 20, u32> local_pos_alloc;
+            BitField<27, 5, u32> barrier_alloc;
+        };
+
+        union {
+            BitField<0, 20, u32> local_neg_alloc;
+            BitField<24, 5, u32> gpr_alloc;
+        };
+
+        INSERT_PADDING_WORDS(0x11);
+    } launch_description;
+
+    struct {
+        u32 write_offset = 0;
+        u32 copy_size = 0;
+        std::vector<u8> inner_buffer;
+    } state{};
+
     static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
                   "KeplerCompute Regs has wrong size");
 
+    static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32),
+                  "KeplerCompute LaunchParams has wrong size");
+
     /// Write the value to the register identified by method.
     void CallMethod(const GPU::MethodCall& method_call);
 
 private:
+    Core::System& system;
+    VideoCore::RasterizerInterface& rasterizer;
     MemoryManager& memory_manager;
+    Upload::State upload_state;
+
+    void ProcessLaunch();
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
     static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4,                       \
                   "Field " #field_name " has invalid position")
 
+#define ASSERT_LAUNCH_PARAM_POSITION(field_name, position)                                         \
+    static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4,               \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec_upload, 0x6C);
+ASSERT_REG_POSITION(data_upload, 0x6D);
 ASSERT_REG_POSITION(launch, 0xAF);
+ASSERT_REG_POSITION(tsc, 0x557);
+ASSERT_REG_POSITION(tic, 0x55D);
+ASSERT_REG_POSITION(code_loc, 0x582);
+ASSERT_REG_POSITION(texture_const_buffer_index, 0x982);
+ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8);
+ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC);
+ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11);
+ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12);
+ASSERT_LAUNCH_PARAM_POSITION(memory_config, 0x14);
+ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D);
 
 #undef ASSERT_REG_POSITION
 
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index cd51a31d7..0561f676c 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -10,12 +10,12 @@
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
-KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                           MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
+KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
+    : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {}
 
 KeplerMemory::~KeplerMemory() = default;
 
@@ -27,30 +27,18 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
 
     switch (method_call.method) {
     case KEPLERMEMORY_REG_INDEX(exec): {
-        state.write_offset = 0;
+        upload_state.ProcessExec(regs.exec.linear != 0);
         break;
     }
     case KEPLERMEMORY_REG_INDEX(data): {
-        ProcessData(method_call.argument);
+        const bool is_last_call = method_call.IsLastCall();
+        upload_state.ProcessData(method_call.argument, is_last_call);
+        if (is_last_call) {
+            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+        }
         break;
     }
     }
 }
 
-void KeplerMemory::ProcessData(u32 data) {
-    ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported");
-    ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0);
-
-    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
-    // We do this before actually writing the new data because the destination address might
-    // contain a dirty surface that will have to be written back to memory.
-    const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)};
-    rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32));
-    memory_manager.Write<u32>(address, data);
-
-    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
-
-    state.write_offset++;
-}
-
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 78b6c3e45..f3bc675a9 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -6,9 +6,11 @@
 
 #include <array>
 #include <cstddef>
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 
 namespace Core {
@@ -19,19 +21,20 @@ namespace Tegra {
 class MemoryManager;
 }
 
-namespace VideoCore {
-class RasterizerInterface;
-}
-
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as P2MF. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
+ */
+
 #define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
     (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
 
 class KeplerMemory final {
 public:
-    KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                 MemoryManager& memory_manager);
+    KeplerMemory(Core::System& system, MemoryManager& memory_manager);
     ~KeplerMemory();
 
     /// Write the value to the register identified by method.
@@ -44,26 +47,7 @@ public:
             struct {
                 INSERT_PADDING_WORDS(0x60);
 
-                u32 line_length_in;
-                u32 line_count;
-
-                struct {
-                    u32 address_high;
-                    u32 address_low;
-                    u32 pitch;
-                    u32 block_dimensions;
-                    u32 width;
-                    u32 height;
-                    u32 depth;
-                    u32 z;
-                    u32 x;
-                    u32 y;
-
-                    GPUVAddr Address() const {
-                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                                     address_low);
-                    }
-                } dest;
+                Upload::Registers upload;
 
                 struct {
                     union {
@@ -79,25 +63,17 @@ public:
         };
     } regs{};
 
-    struct {
-        u32 write_offset = 0;
-    } state{};
-
 private:
     Core::System& system;
-    VideoCore::RasterizerInterface& rasterizer;
     MemoryManager& memory_manager;
-
-    void ProcessData(u32 data);
+    Upload::State upload_state;
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
     static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
                   "Field " #field_name " has invalid position")
 
-ASSERT_REG_POSITION(line_length_in, 0x60);
-ASSERT_REG_POSITION(line_count, 0x61);
-ASSERT_REG_POSITION(dest, 0x62);
+ASSERT_REG_POSITION(upload, 0x60);
 ASSERT_REG_POSITION(exec, 0x6C);
 ASSERT_REG_POSITION(data, 0x6D);
 #undef ASSERT_REG_POSITION
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index b198793bc..d7b586db9 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -20,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                      MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{
-                                                                                  *this} {
+    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
+      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
     InitializeRegisterDefaults();
 }
 
@@ -253,6 +253,18 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         ProcessSyncPoint();
         break;
     }
+    case MAXWELL3D_REG_INDEX(exec_upload): {
+        upload_state.ProcessExec(regs.exec_upload.linear != 0);
+        break;
+    }
+    case MAXWELL3D_REG_INDEX(data_upload): {
+        const bool is_last_call = method_call.IsLastCall();
+        upload_state.ProcessData(method_call.argument, is_last_call);
+        if (is_last_call) {
+            dirty_flags.OnMemoryWrite();
+        }
+        break;
+    }
     default:
         break;
     }
@@ -418,7 +430,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
     const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)};
 
     Texture::TICEntry tic_entry;
-    memory_manager.ReadBlock(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
+    memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
 
     ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear ||
                    tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
@@ -439,7 +451,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
     const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)};
 
     Texture::TSCEntry tsc_entry;
-    memory_manager.ReadBlock(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
+    memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
     return tsc_entry;
 }
 
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index cc2424d38..4883b582a 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -14,6 +14,7 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/math_util.h"
+#include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 #include "video_core/macro_interpreter.h"
 #include "video_core/textures/texture.h"
@@ -32,6 +33,12 @@ class RasterizerInterface;
 
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as GF100_3D. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gf100_3d.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
+ */
+
 #define MAXWELL3D_REG_INDEX(field_name)                                                            \
     (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32))
 
@@ -243,9 +250,10 @@ public:
                     return "10_10_10_2";
                 case Size::Size_11_11_10:
                     return "11_11_10";
+                default:
+                    UNREACHABLE();
+                    return {};
                 }
-                UNREACHABLE();
-                return {};
             }
 
             std::string TypeString() const {
@@ -579,7 +587,18 @@ public:
                     u32 bind;
                 } macros;
 
-                INSERT_PADDING_WORDS(0x69);
+                INSERT_PADDING_WORDS(0x17);
+
+                Upload::Registers upload;
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec_upload;
+
+                u32 data_upload;
+
+                INSERT_PADDING_WORDS(0x44);
 
                 struct {
                     union {
@@ -1175,6 +1194,8 @@ private:
     /// Interpreter for the macro codes uploaded to the GPU.
     MacroInterpreter macro_interpreter;
 
+    Upload::State upload_state;
+
     /// Retrieves information about a specific TIC entry from the TIC buffer.
     Texture::TICEntry GetTICEntry(u32 tic_index) const;
 
@@ -1218,6 +1239,9 @@ private:
                   "Field " #field_name " has invalid position")
 
 ASSERT_REG_POSITION(macros, 0x45);
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec_upload, 0x6C);
+ASSERT_REG_POSITION(data_upload, 0x6D);
 ASSERT_REG_POSITION(sync_info, 0xB2);
 ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
 ASSERT_REG_POSITION(rt, 0x200);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 2426d0067..3a5dfef0c 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -83,57 +83,66 @@ void MaxwellDMA::HandleCopy() {
 
     ASSERT(regs.exec.enable_2d == 1);
 
-    const std::size_t copy_size = regs.x_count * regs.y_count;
+    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        ASSERT(regs.src_params.size_z == 1);
+        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+        const std::size_t src_size = Texture::CalculateSize(
+            true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
+            regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
 
-    auto source_ptr{memory_manager.GetPointer(source)};
-    auto dst_ptr{memory_manager.GetPointer(dest)};
+        const std::size_t dst_size = regs.dst_pitch * regs.y_count;
 
-    if (!source_ptr) {
-        LOG_ERROR(HW_GPU, "source_ptr is invalid");
-        return;
-    }
+        if (read_buffer.size() < src_size) {
+            read_buffer.resize(src_size);
+        }
 
-    if (!dst_ptr) {
-        LOG_ERROR(HW_GPU, "dst_ptr is invalid");
-        return;
-    }
+        if (write_buffer.size() < dst_size) {
+            write_buffer.resize(dst_size);
+        }
 
-    const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
-        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
-        // copying.
-        rasterizer.FlushRegion(ToCacheAddr(source_ptr), src_size);
+        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
 
-        // We have to invalidate the destination region to evict any outdated surfaces from the
-        // cache. We do this before actually writing the new data because the destination address
-        // might contain a dirty surface that will have to be written back to memory.
-        rasterizer.InvalidateRegion(ToCacheAddr(dst_ptr), dst_size);
-    };
+        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
+                                  regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(),
+                                  write_buffer.data(), regs.src_params.BlockHeight(),
+                                  regs.src_params.pos_x, regs.src_params.pos_y);
 
-    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
-        ASSERT(regs.src_params.size_z == 1);
-        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
+    } else {
+        ASSERT(regs.dst_params.BlockDepth() == 1);
 
-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+        const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count;
 
-        FlushAndInvalidate(regs.src_pitch * regs.src_params.size_y,
-                           copy_size * src_bytes_per_pixel);
+        const std::size_t dst_size = Texture::CalculateSize(
+            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
+            regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
-        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
-                                  regs.src_params.size_x, src_bytes_per_pixel, source_ptr, dst_ptr,
-                                  regs.src_params.BlockHeight(), regs.src_params.pos_x,
-                                  regs.src_params.pos_y);
-    } else {
-        ASSERT(regs.dst_params.size_z == 1);
-        ASSERT(regs.src_pitch == regs.x_count);
+        const std::size_t dst_layer_size = Texture::CalculateSize(
+            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
+            regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
-        const u32 src_bpp = regs.src_pitch / regs.x_count;
+        const std::size_t src_size = regs.src_pitch * regs.y_count;
 
-        FlushAndInvalidate(regs.src_pitch * regs.y_count,
-                           regs.dst_params.size_x * regs.dst_params.size_y * src_bpp);
+        if (read_buffer.size() < src_size) {
+            read_buffer.resize(src_size);
+        }
+
+        if (write_buffer.size() < dst_size) {
+            write_buffer.resize(dst_size);
+        }
+
+        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
 
         // If the input is linear and the output is tiled, swizzle the input and copy it over.
         Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
-                                src_bpp, dst_ptr, source_ptr, regs.dst_params.BlockHeight());
+                                src_bytes_per_pixel,
+                                write_buffer.data() + dst_layer_size * regs.dst_params.pos_z,
+                                read_buffer.data(), regs.dst_params.BlockHeight());
+
+        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
     }
 }
 
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index c6b649842..e5942f671 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -25,6 +26,11 @@ class RasterizerInterface;
 
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as GK104_Copy. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml
+ */
+
 class MaxwellDMA final {
 public:
     explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
@@ -63,6 +69,16 @@ public:
 
         static_assert(sizeof(Parameters) == 24, "Parameters has wrong size");
 
+        enum class ComponentMode : u32 {
+            Src0 = 0,
+            Src1 = 1,
+            Src2 = 2,
+            Src3 = 3,
+            Const0 = 4,
+            Const1 = 5,
+            Zero = 6,
+        };
+
         enum class CopyMode : u32 {
             None = 0,
             Unk1 = 1,
@@ -128,7 +144,26 @@ public:
                 u32 x_count;
                 u32 y_count;
 
-                INSERT_PADDING_WORDS(0xBB);
+                INSERT_PADDING_WORDS(0xB8);
+
+                u32 const0;
+                u32 const1;
+                union {
+                    BitField<0, 4, ComponentMode> component0;
+                    BitField<4, 4, ComponentMode> component1;
+                    BitField<8, 4, ComponentMode> component2;
+                    BitField<12, 4, ComponentMode> component3;
+                    BitField<16, 2, u32> component_size;
+                    BitField<20, 3, u32> src_num_components;
+                    BitField<24, 3, u32> dst_num_components;
+
+                    u32 SrcBytePerPixel() const {
+                        return src_num_components.Value() * component_size.Value();
+                    }
+                    u32 DstBytePerPixel() const {
+                        return dst_num_components.Value() * component_size.Value();
+                    }
+                } swizzle_config;
 
                 Parameters dst_params;
 
@@ -149,6 +184,9 @@ private:
 
     MemoryManager& memory_manager;
 
+    std::vector<u8> read_buffer;
+    std::vector<u8> write_buffer;
+
     /// Performs the copy from the source buffer to the destination buffer as configured in the
     /// registers.
     void HandleCopy();
@@ -165,6 +203,9 @@ ASSERT_REG_POSITION(src_pitch, 0x104);
 ASSERT_REG_POSITION(dst_pitch, 0x105);
 ASSERT_REG_POSITION(x_count, 0x106);
 ASSERT_REG_POSITION(y_count, 0x107);
+ASSERT_REG_POSITION(const0, 0x1C0);
+ASSERT_REG_POSITION(const1, 0x1C1);
+ASSERT_REG_POSITION(swizzle_config, 0x1C2);
 ASSERT_REG_POSITION(dst_params, 0x1C3);
 ASSERT_REG_POSITION(src_params, 0x1CA);
 
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index fce9733b9..e5b4eadea 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -937,21 +937,34 @@ union Instruction {
     } iset;
 
     union {
-        BitField<8, 2, Register::Size> dest_size;
-        BitField<10, 2, Register::Size> src_size;
-        BitField<12, 1, u64> is_output_signed;
-        BitField<13, 1, u64> is_input_signed;
-        BitField<41, 2, u64> selector;
+        BitField<41, 2, u64> selector; // i2i and i2f only
         BitField<45, 1, u64> negate_a;
         BitField<49, 1, u64> abs_a;
+        BitField<10, 2, Register::Size> src_size;
+        BitField<13, 1, u64> is_input_signed;
+        BitField<8, 2, Register::Size> dst_size;
+        BitField<12, 1, u64> is_output_signed;
+
+        union {
+            BitField<39, 2, u64> tab5cb8_2;
+        } i2f;
 
         union {
             BitField<39, 2, F2iRoundingOp> rounding;
         } f2i;
 
         union {
-            BitField<39, 4, F2fRoundingOp> rounding;
+            BitField<8, 2, Register::Size> src_size;
+            BitField<10, 2, Register::Size> dst_size;
+            BitField<39, 4, u64> rounding;
+            // H0, H1 extract for F16 missing
+            BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
+            F2fRoundingOp GetRoundingMode() const {
+                constexpr u64 rounding_mask = 0x0B;
+                return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask);
+            }
         } f2f;
+
     } conversion;
 
     union {
@@ -1734,7 +1747,7 @@ private:
             INST("0011100-00101---", Id::SHR_IMM, Type::Shift, "SHR_IMM"),
             INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"),
             INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"),
-            INST("01110001-1000---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
+            INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
             INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"),
             INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"),
             INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 4461083ff..52706505b 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{ren
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
     fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
-    kepler_compute = std::make_unique<Engines::KeplerCompute>(*memory_manager);
+    kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
     maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager);
-    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, rasterizer, *memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
 }
 
 GPU::~GPU() = default;
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 0f4e820aa..6c98c6701 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -199,7 +199,15 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const {
     return {};
 }
 
-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const {
+bool MemoryManager::IsBlockContinous(const GPUVAddr start, const std::size_t size) {
+    const GPUVAddr end = start + size;
+    const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start));
+    const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end));
+    const std::size_t range = static_cast<std::size_t>(host_ptr_end - host_ptr_start);
+    return range == size;
+}
+
+void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
     std::size_t remaining_size{size};
     std::size_t page_index{src_addr >> page_bits};
     std::size_t page_offset{src_addr & page_mask};
@@ -226,7 +234,30 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t
     }
 }
 
-void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) {
+void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
+                                    const std::size_t size) const {
+    std::size_t remaining_size{size};
+    std::size_t page_index{src_addr >> page_bits};
+    std::size_t page_offset{src_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+        const u8* page_pointer = page_table.pointers[page_index];
+        if (page_pointer) {
+            const u8* src_ptr{page_pointer + page_offset};
+            std::memcpy(dest_buffer, src_ptr, copy_amount);
+        } else {
+            std::memset(dest_buffer, 0, copy_amount);
+        }
+        page_index++;
+        page_offset = 0;
+        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
+}
+
+void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
     std::size_t remaining_size{size};
     std::size_t page_index{dest_addr >> page_bits};
     std::size_t page_offset{dest_addr & page_mask};
@@ -253,7 +284,28 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::
     }
 }
 
-void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) {
+void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
+                                     const std::size_t size) {
+    std::size_t remaining_size{size};
+    std::size_t page_index{dest_addr >> page_bits};
+    std::size_t page_offset{dest_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+        u8* page_pointer = page_table.pointers[page_index];
+        if (page_pointer) {
+            u8* dest_ptr{page_pointer + page_offset};
+            std::memcpy(dest_ptr, src_buffer, copy_amount);
+        }
+        page_index++;
+        page_offset = 0;
+        src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
+}
+
+void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
     std::size_t remaining_size{size};
     std::size_t page_index{src_addr >> page_bits};
     std::size_t page_offset{src_addr & page_mask};
@@ -281,6 +333,12 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t
     }
 }
 
+void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+    std::vector<u8> tmp_buffer(size);
+    ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
+    WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
+}
+
 void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type,
                              VAddr backing_addr) {
     LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size,
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 647cbf93a..e4f0c4bd6 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -65,9 +65,32 @@ public:
     u8* GetPointer(GPUVAddr addr);
     const u8* GetPointer(GPUVAddr addr) const;
 
-    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    // Returns true if the block is continous in host memory, false otherwise
+    bool IsBlockContinous(const GPUVAddr start, const std::size_t size);
+
+    /**
+     * ReadBlock and WriteBlock are full read and write operations over virtual
+     * GPU Memory. It's important to use these when GPU memory may not be continous
+     * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
+     * Flushes and Invalidations, respectively to each operation.
+     */
+    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
+    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
+    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
+
+    /**
+     * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
+     * WriteBlock respectively. In this versions, no flushing or invalidation is actually
+     * done and their performance is similar to a memcpy. This functions can be used
+     * on either of this 2 scenarios instead of their safe counterpart:
+     * - Memory which is sure to never be represented in the Host GPU.
+     * - Memory Managed by a Cache Manager. Example: Texture Flushing should use
+     * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
+     * being flushed.
+     */
+    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
 
 private:
     using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>;
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
new file mode 100644
index 000000000..b6d9e0ddb
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -0,0 +1,45 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstddef>
+#include <glad/glad.h>
+
+#include "common/logging/log.h"
+#include "video_core/renderer_opengl/gl_device.h"
+
+namespace OpenGL {
+
+namespace {
+template <typename T>
+T GetInteger(GLenum pname) {
+    GLint temporary;
+    glGetIntegerv(pname, &temporary);
+    return static_cast<T>(temporary);
+}
+} // Anonymous namespace
+
+Device::Device() {
+    uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
+    has_variable_aoffi = TestVariableAoffi();
+}
+
+bool Device::TestVariableAoffi() {
+    const GLchar* AOFFI_TEST = R"(#version 430 core
+uniform sampler2D tex;
+uniform ivec2 variable_offset;
+void main() {
+    gl_Position = textureOffset(tex, vec2(0), variable_offset);
+}
+)";
+    const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)};
+    GLint link_status{};
+    glGetProgramiv(shader, GL_LINK_STATUS, &link_status);
+    glDeleteProgram(shader);
+
+    const bool supported{link_status == GL_TRUE};
+    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported);
+    return supported;
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
new file mode 100644
index 000000000..78ff5ee58
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -0,0 +1,30 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+
+namespace OpenGL {
+
+class Device {
+public:
+    Device();
+
+    std::size_t GetUniformBufferAlignment() const {
+        return uniform_buffer_alignment;
+    }
+
+    bool HasVariableAoffi() const {
+        return has_variable_aoffi;
+    }
+
+private:
+    static bool TestVariableAoffi();
+
+    std::size_t uniform_buffer_alignment{};
+    bool has_variable_aoffi{};
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 6034dc489..3cc945235 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -99,7 +99,7 @@ struct FramebufferCacheKey {
 };
 
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
-    : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system},
+    : res_cache{*this}, shader_cache{*this, system, device}, global_cache{*this}, system{system},
       screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
     OpenGLState::ApplyDefaultState();
 
@@ -107,8 +107,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
     state.draw.shader_program = 0;
     state.Apply();
 
-    glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
-
     LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");
     CheckExtensions();
 }
@@ -307,6 +305,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
             case Maxwell::ShaderProgram::Geometry:
                 shader_program_manager->UseTrivialGeometryShader();
                 break;
+            default:
+                break;
             }
             continue;
         }
@@ -315,8 +315,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu, stage);
-        const GLintptr offset = buffer_cache.UploadHostMemory(
-            &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment));
+        const GLintptr offset =
+            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
 
         // Bind the emulation info buffer
         bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset,
@@ -700,23 +700,24 @@ void RasterizerOpenGL::DrawArrays() {
     // Add space for index buffer (keeping in mind non-core primitives)
     switch (regs.draw.topology) {
     case Maxwell::PrimitiveTopology::Quads:
-        buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
+        buffer_size = Common::AlignUp(buffer_size, 4) +
                       primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count);
         break;
     default:
         if (is_indexed) {
-            buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + CalculateIndexBufferSize();
+            buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
         }
         break;
     }
 
     // Uniform space for the 5 shader stages
-    buffer_size =
-        Common::AlignUp<std::size_t>(buffer_size, 4) +
-        (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage;
+    buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
+                  (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
+                      Maxwell::MaxShaderStage;
 
     // Add space for at least 18 constant buffers
-    buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);
+    buffer_size +=
+        Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
 
     const bool invalidate = buffer_cache.Map(buffer_size);
     if (invalidate) {
@@ -848,8 +849,8 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader
         size = Common::AlignUp(size, sizeof(GLvec4));
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
-        const GLintptr const_buffer_offset = buffer_cache.UploadMemory(
-            buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment));
+        const GLintptr const_buffer_offset =
+            buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
 
         bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size);
     }
@@ -921,8 +922,8 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
         viewport.y = viewport_rect.bottom;
         viewport.width = viewport_rect.GetWidth();
         viewport.height = viewport_rect.GetHeight();
-        viewport.depth_range_far = regs.viewports[i].depth_range_far;
-        viewport.depth_range_near = regs.viewports[i].depth_range_near;
+        viewport.depth_range_far = src.depth_range_far;
+        viewport.depth_range_near = src.depth_range_near;
     }
     state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0;
     state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index a0e056142..71b9c5ead 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -21,6 +21,7 @@
 #include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_global_cache.h"
 #include "video_core/renderer_opengl/gl_primitive_assembler.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
@@ -172,6 +173,7 @@ private:
     /// but are needed for correct emulation
     void CheckExtensions();
 
+    const Device device;
     OpenGLState state;
 
     RasterizerCacheOpenGL res_cache;
@@ -180,7 +182,6 @@ private:
     SamplerCacheOpenGL sampler_cache;
 
     Core::System& system;
-
     ScreenInfo& screen_info;
 
     std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
@@ -196,7 +197,6 @@ private:
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
     PrimitiveAssembler primitive_assembler{buffer_cache};
-    GLint uniform_buffer_alignment;
 
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 67f99a568..7d8fb670e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -642,13 +642,16 @@ void CachedSurface::LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) {
             SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i);
     } else {
         const u32 bpp = params.GetFormatBpp() / 8;
-        const u32 copy_size = params.width * bpp;
+        const u32 copy_size = (params.width * bpp + GetDefaultBlockWidth(params.pixel_format) - 1) /
+                              GetDefaultBlockWidth(params.pixel_format);
         if (params.pitch == copy_size) {
             std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl);
         } else {
+            const u32 height = (params.height + GetDefaultBlockHeight(params.pixel_format) - 1) /
+                               GetDefaultBlockHeight(params.pixel_format);
             const u8* start{params.host_ptr};
             u8* write_to = gl_buffer[0].data();
-            for (u32 h = params.height; h > 0; h--) {
+            for (u32 h = height; h > 0; h--) {
                 std::memcpy(write_to, start, copy_size);
                 start += params.pitch;
                 write_to += copy_size;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 99f67494c..b1c8f7c35 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -38,13 +38,15 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
 }
 
 /// Gets the shader program code from memory for the specified address
-ProgramCode GetShaderCode(const u8* host_ptr) {
+ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr,
+                          const u8* host_ptr) {
     ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
     ASSERT_OR_EXECUTE(host_ptr != nullptr, {
         std::fill(program_code.begin(), program_code.end(), 0);
         return program_code;
     });
-    std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));
+    memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(),
+                                   program_code.size() * sizeof(u64));
     return program_code;
 }
 
@@ -134,8 +136,8 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
 }
 
 /// Creates an unspecialized program from code streams
-GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, ProgramCode program_code,
-                                      ProgramCode program_code_b) {
+GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
+                                      ProgramCode program_code, ProgramCode program_code_b) {
     GLShader::ShaderSetup setup(program_code);
     if (program_type == Maxwell::ShaderProgram::VertexA) {
         // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
@@ -149,11 +151,11 @@ GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, Progr
     switch (program_type) {
     case Maxwell::ShaderProgram::VertexA:
     case Maxwell::ShaderProgram::VertexB:
-        return GLShader::GenerateVertexShader(setup);
+        return GLShader::GenerateVertexShader(device, setup);
     case Maxwell::ShaderProgram::Geometry:
-        return GLShader::GenerateGeometryShader(setup);
+        return GLShader::GenerateGeometryShader(device, setup);
     case Maxwell::ShaderProgram::Fragment:
-        return GLShader::GenerateFragmentShader(setup);
+        return GLShader::GenerateFragmentShader(device, setup);
     default:
         LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
         UNREACHABLE();
@@ -212,22 +214,20 @@ std::set<GLenum> GetSupportedFormats() {
     return supported_formats;
 }
 
-} // namespace
+} // Anonymous namespace
 
-CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
+CachedShader::CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier,
                            Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                            const PrecompiledPrograms& precompiled_programs,
                            ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr)
     : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr},
       unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache},
       precompiled_programs{precompiled_programs} {
-
-    const std::size_t code_size = CalculateProgramSize(program_code);
-    const std::size_t code_size_b =
-        program_code_b.empty() ? 0 : CalculateProgramSize(program_code_b);
-
-    GLShader::ProgramResult program_result =
-        CreateProgram(program_type, program_code, program_code_b);
+    const std::size_t code_size{CalculateProgramSize(program_code)};
+    const std::size_t code_size_b{program_code_b.empty() ? 0
+                                                         : CalculateProgramSize(program_code_b)};
+    GLShader::ProgramResult program_result{
+        CreateProgram(device, program_type, program_code, program_code_b)};
     if (program_result.first.empty()) {
         // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
         return;
@@ -251,7 +251,6 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
     : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier},
       program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{
                                                               precompiled_programs} {
-
     code = std::move(result.first);
     entries = result.second;
     shader_length = entries.shader_length;
@@ -344,8 +343,9 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode,
     return {unique_identifier, base_bindings, primitive_mode};
 }
 
-ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system)
-    : RasterizerCache{rasterizer}, disk_cache{system} {}
+ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
+                                     const Device& device)
+    : RasterizerCache{rasterizer}, disk_cache{system}, device{device} {}
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -363,6 +363,10 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
     if (stop_loading)
         return;
 
+    // Track if precompiled cache was altered during loading to know if we have to serialize the
+    // virtual precompiled cache file back to the hard drive
+    bool precompiled_cache_altered = false;
+
     // Build shaders
     if (callback)
         callback(VideoCore::LoadCallbackStage::Build, 0, usages.size());
@@ -384,6 +388,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
             if (!shader) {
                 // Invalidate the precompiled cache if a shader dumped shader was rejected
                 disk_cache.InvalidatePrecompiled();
+                precompiled_cache_altered = true;
                 dumps.clear();
             }
         }
@@ -405,8 +410,13 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         if (dumps.find(usage) == dumps.end()) {
             const auto& program = precompiled_programs.at(usage);
             disk_cache.SaveDump(usage, program->handle);
+            precompiled_cache_altered = true;
         }
     }
+
+    if (precompiled_cache_altered) {
+        disk_cache.SaveVirtualPrecompiledFile();
+    }
 }
 
 CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram(
@@ -439,17 +449,18 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
     const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) {
     std::unordered_map<u64, UnspecializedShader> unspecialized;
 
-    if (callback)
+    if (callback) {
         callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size());
+    }
 
     for (std::size_t i = 0; i < raws.size(); ++i) {
-        if (stop_loading)
+        if (stop_loading) {
             return {};
-
+        }
         const auto& raw{raws[i]};
-        const u64 unique_identifier = raw.GetUniqueIdentifier();
-        const u64 calculated_hash =
-            GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB());
+        const u64 unique_identifier{raw.GetUniqueIdentifier()};
+        const u64 calculated_hash{
+            GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB())};
         if (unique_identifier != calculated_hash) {
             LOG_ERROR(
                 Render_OpenGL,
@@ -466,8 +477,8 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
             result = {stored_decompiled.code, stored_decompiled.entries};
         } else {
             // Otherwise decompile the shader at boot and save the result to the decompiled file
-            result =
-                CreateProgram(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB());
+            result = CreateProgram(device, raw.GetProgramType(), raw.GetProgramCode(),
+                                   raw.GetProgramCodeB());
             disk_cache.SaveDecompiled(unique_identifier, result.first, result.second);
         }
 
@@ -477,8 +488,9 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
             {raw.GetUniqueIdentifier(),
              {std::move(result.first), std::move(result.second), raw.GetProgramType()}});
 
-        if (callback)
+        if (callback) {
             callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size());
+        }
     }
     return unspecialized;
 }
@@ -497,11 +509,12 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     if (!shader) {
         // No shader found - create a new one
-        ProgramCode program_code{GetShaderCode(host_ptr)};
+        ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
         ProgramCode program_code_b;
         if (program == Maxwell::ShaderProgram::VertexA) {
-            program_code_b = GetShaderCode(
-                memory_manager.GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB)));
+            const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)};
+            program_code_b = GetShaderCode(memory_manager, program_addr_b,
+                                           memory_manager.GetPointer(program_addr_b));
         }
         const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
         const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
@@ -512,7 +525,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
                                                precompiled_programs, found->second, host_ptr);
         } else {
             shader = std::make_shared<CachedShader>(
-                cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
+                device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
                 std::move(program_code), std::move(program_code_b), host_ptr);
         }
         Register(shader);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index eae771c08..31b979987 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -27,6 +27,7 @@ class System;
 namespace OpenGL {
 
 class CachedShader;
+class Device;
 class RasterizerOpenGL;
 struct UnspecializedShader;
 
@@ -38,7 +39,7 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;
 
 class CachedShader final : public RasterizerCacheObject {
 public:
-    explicit CachedShader(VAddr cpu_addr, u64 unique_identifier,
+    explicit CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier,
                           Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                           const PrecompiledPrograms& precompiled_programs,
                           ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr);
@@ -109,7 +110,8 @@ private:
 
 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
 public:
-    explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system);
+    explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
+                               const Device& device);
 
     /// Loads disk cache for the current game
     void LoadDiskCache(const std::atomic_bool& stop_loading,
@@ -131,6 +133,8 @@ private:
     CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
                                              const std::set<GLenum>& supported_formats);
 
+    const Device& device;
+
     std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
 
     ShaderDiskCacheOpenGL disk_cache;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 445048daf..ef1a1995f 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -15,6 +15,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/shader/shader_ir.h"
@@ -119,14 +120,10 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
 
 /// Returns true if an object has to be treated as precise
 bool IsPrecise(Operation operand) {
-    const auto& meta = operand.GetMeta();
-
+    const auto& meta{operand.GetMeta()};
     if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) {
         return arithmetic->precise;
     }
-    if (const auto half_arithmetic = std::get_if<MetaHalfArithmetic>(&meta)) {
-        return half_arithmetic->precise;
-    }
     return false;
 }
 
@@ -139,8 +136,9 @@ bool IsPrecise(Node node) {
 
 class GLSLDecompiler final {
 public:
-    explicit GLSLDecompiler(const ShaderIR& ir, ShaderStage stage, std::string suffix)
-        : ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
+    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
+                            std::string suffix)
+        : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
 
     void Decompile() {
         DeclareVertex();
@@ -627,28 +625,7 @@ private:
     }
 
     std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) {
-        std::string value = VisitOperand(operation, operand_index);
-        switch (type) {
-        case Type::HalfFloat: {
-            const auto half_meta = std::get_if<MetaHalfArithmetic>(&operation.GetMeta());
-            if (!half_meta) {
-                value = "toHalf2(" + value + ')';
-            }
-
-            switch (half_meta->types.at(operand_index)) {
-            case Tegra::Shader::HalfType::H0_H1:
-                return "toHalf2(" + value + ')';
-            case Tegra::Shader::HalfType::F32:
-                return "vec2(" + value + ')';
-            case Tegra::Shader::HalfType::H0_H0:
-                return "vec2(toHalf2(" + value + ")[0])";
-            case Tegra::Shader::HalfType::H1_H1:
-                return "vec2(toHalf2(" + value + ")[1])";
-            }
-        }
-        default:
-            return CastOperand(value, type);
-        }
+        return CastOperand(VisitOperand(operation, operand_index), type);
     }
 
     std::string CastOperand(const std::string& value, Type type) const {
@@ -662,9 +639,7 @@ private:
         case Type::Uint:
             return "ftou(" + value + ')';
         case Type::HalfFloat:
-            // Can't be handled as a stand-alone value
-            UNREACHABLE();
-            return value;
+            return "toHalf2(" + value + ')';
         }
         UNREACHABLE();
         return value;
@@ -829,8 +804,12 @@ private:
                 // Inline the string as an immediate integer in GLSL (AOFFI arguments are required
                 // to be constant by the standard).
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
-            } else {
+            } else if (device.HasVariableAoffi()) {
+                // Avoid using variable AOFFI on unsupported devices.
                 expr += "ftoi(" + Visit(operand) + ')';
+            } else {
+                // Insert 0 on devices not supporting variable AOFFI.
+                expr += '0';
             }
             if (index + 1 < aoffi.size()) {
                 expr += ", ";
@@ -1083,13 +1062,40 @@ private:
         return BitwiseCastResult(value, Type::HalfFloat);
     }
 
+    std::string HClamp(Operation operation) {
+        const std::string value = VisitOperand(operation, 0, Type::HalfFloat);
+        const std::string min = VisitOperand(operation, 1, Type::Float);
+        const std::string max = VisitOperand(operation, 2, Type::Float);
+        const std::string clamped = "clamp(" + value + ", vec2(" + min + "), vec2(" + max + "))";
+        return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
+    }
+
+    std::string HUnpack(Operation operation) {
+        const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)};
+        const auto value = [&]() -> std::string {
+            switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
+            case Tegra::Shader::HalfType::H0_H1:
+                return operand;
+            case Tegra::Shader::HalfType::F32:
+                return "vec2(fromHalf2(" + operand + "))";
+            case Tegra::Shader::HalfType::H0_H0:
+                return "vec2(" + operand + "[0])";
+            case Tegra::Shader::HalfType::H1_H1:
+                return "vec2(" + operand + "[1])";
+            }
+            UNREACHABLE();
+            return "0";
+        }();
+        return "fromHalf2(" + value + ')';
+    }
+
     std::string HMergeF32(Operation operation) {
         return "float(toHalf2(" + Visit(operation[0]) + ")[0])";
     }
 
     std::string HMergeH0(Operation operation) {
-        return "fromHalf2(vec2(toHalf2(" + Visit(operation[0]) + ")[1], toHalf2(" +
-               Visit(operation[1]) + ")[0]))";
+        return "fromHalf2(vec2(toHalf2(" + Visit(operation[1]) + ")[0], toHalf2(" +
+               Visit(operation[0]) + ")[1]))";
     }
 
     std::string HMergeH1(Operation operation) {
@@ -1189,34 +1195,46 @@ private:
         return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
     }
 
+    template <bool with_nan>
+    std::string GenerateHalfComparison(Operation operation, std::string compare_op) {
+        std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
+                                                  Type::HalfFloat, Type::HalfFloat)};
+        if constexpr (!with_nan) {
+            return comparison;
+        }
+        return "halfFloatNanComparison(" + comparison + ", " +
+               VisitOperand(operation, 0, Type::HalfFloat) + ", " +
+               VisitOperand(operation, 1, Type::HalfFloat) + ')';
+    }
+
+    template <bool with_nan>
     std::string Logical2HLessThan(Operation operation) {
-        return GenerateBinaryCall(operation, "lessThan", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "lessThan");
     }
 
+    template <bool with_nan>
     std::string Logical2HEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "equal", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "equal");
     }
 
+    template <bool with_nan>
     std::string Logical2HLessEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "lessThanEqual", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "lessThanEqual");
     }
 
+    template <bool with_nan>
     std::string Logical2HGreaterThan(Operation operation) {
-        return GenerateBinaryCall(operation, "greaterThan", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "greaterThan");
     }
 
+    template <bool with_nan>
     std::string Logical2HNotEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "notEqual", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "notEqual");
     }
 
+    template <bool with_nan>
     std::string Logical2HGreaterEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "greaterThanEqual", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual");
     }
 
     std::string Texture(Operation operation) {
@@ -1505,6 +1523,8 @@ private:
         &GLSLDecompiler::Fma<Type::HalfFloat>,
         &GLSLDecompiler::Absolute<Type::HalfFloat>,
         &GLSLDecompiler::HNegate,
+        &GLSLDecompiler::HClamp,
+        &GLSLDecompiler::HUnpack,
         &GLSLDecompiler::HMergeF32,
         &GLSLDecompiler::HMergeH0,
         &GLSLDecompiler::HMergeH1,
@@ -1541,12 +1561,18 @@ private:
         &GLSLDecompiler::LogicalNotEqual<Type::Uint>,
         &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>,
 
-        &GLSLDecompiler::Logical2HLessThan,
-        &GLSLDecompiler::Logical2HEqual,
-        &GLSLDecompiler::Logical2HLessEqual,
-        &GLSLDecompiler::Logical2HGreaterThan,
-        &GLSLDecompiler::Logical2HNotEqual,
-        &GLSLDecompiler::Logical2HGreaterEqual,
+        &GLSLDecompiler::Logical2HLessThan<false>,
+        &GLSLDecompiler::Logical2HEqual<false>,
+        &GLSLDecompiler::Logical2HLessEqual<false>,
+        &GLSLDecompiler::Logical2HGreaterThan<false>,
+        &GLSLDecompiler::Logical2HNotEqual<false>,
+        &GLSLDecompiler::Logical2HGreaterEqual<false>,
+        &GLSLDecompiler::Logical2HLessThan<true>,
+        &GLSLDecompiler::Logical2HEqual<true>,
+        &GLSLDecompiler::Logical2HLessEqual<true>,
+        &GLSLDecompiler::Logical2HGreaterThan<true>,
+        &GLSLDecompiler::Logical2HNotEqual<true>,
+        &GLSLDecompiler::Logical2HGreaterEqual<true>,
 
         &GLSLDecompiler::Texture,
         &GLSLDecompiler::TextureLod,
@@ -1625,6 +1651,7 @@ private:
         return name + '_' + std::to_string(index) + '_' + suffix;
     }
 
+    const Device& device;
     const ShaderIR& ir;
     const ShaderStage stage;
     const std::string suffix;
@@ -1647,11 +1674,18 @@ std::string GetCommonDeclarations() {
            "}\n\n"
            "vec2 toHalf2(float value) {\n"
            "    return unpackHalf2x16(ftou(value));\n"
+           "}\n\n"
+           "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {\n"
+           "    bvec2 is_nan1 = isnan(pair1);\n"
+           "    bvec2 is_nan2 = isnan(pair2);\n"
+           "    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || "
+           "is_nan2.y);\n"
            "}\n";
 }
 
-ProgramResult Decompile(const ShaderIR& ir, Maxwell::ShaderStage stage, const std::string& suffix) {
-    GLSLDecompiler decompiler(ir, stage, suffix);
+ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
+                        const std::string& suffix) {
+    GLSLDecompiler decompiler(device, ir, stage, suffix);
     decompiler.Decompile();
     return {decompiler.GetResult(), decompiler.GetShaderEntries()};
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 74032d237..c1569e737 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -12,6 +12,10 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/shader/shader_ir.h"
 
+namespace OpenGL {
+class Device;
+}
+
 namespace VideoCommon::Shader {
 class ShaderIR;
 }
@@ -77,7 +81,7 @@ struct ShaderEntries {
 
 std::string GetCommonDeclarations();
 
-ProgramResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage,
-                        const std::string& suffix);
+ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                        Maxwell::ShaderStage stage, const std::string& suffix);
 
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 53752b38d..254c0d499 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -104,7 +104,8 @@ bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const {
     return true;
 }
 
-ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {}
+ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system)
+    : system{system}, precompiled_cache_virtual_file_offset{0} {}
 
 std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
 ShaderDiskCacheOpenGL::LoadTransferable() {
@@ -177,6 +178,7 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
             return {};
         }
     }
+
     return {{raws, usages}};
 }
 
@@ -208,59 +210,64 @@ ShaderDiskCacheOpenGL::LoadPrecompiled() {
 std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>,
                         std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>>
 ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
+    // Read compressed file from disk and decompress to virtual precompiled cache file
+    std::vector<u8> compressed(file.GetSize());
+    file.ReadBytes(compressed.data(), compressed.size());
+    const std::vector<u8> decompressed = Common::Compression::DecompressDataZSTD(compressed);
+    SaveArrayToPrecompiled(decompressed.data(), decompressed.size());
+    precompiled_cache_virtual_file_offset = 0;
+
     ShaderCacheVersionHash file_hash{};
-    if (file.ReadArray(file_hash.data(), file_hash.size()) != file_hash.size()) {
+    if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) {
+        precompiled_cache_virtual_file_offset = 0;
         return {};
     }
     if (GetShaderCacheVersionHash() != file_hash) {
         LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator");
+        precompiled_cache_virtual_file_offset = 0;
         return {};
     }
 
     std::unordered_map<u64, ShaderDiskCacheDecompiled> decompiled;
     std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> dumps;
-    while (file.Tell() < file.GetSize()) {
+    while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) {
         PrecompiledEntryKind kind{};
-        if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) {
+        if (!LoadObjectFromPrecompiled(kind)) {
             return {};
         }
 
         switch (kind) {
         case PrecompiledEntryKind::Decompiled: {
             u64 unique_identifier{};
-            if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64))
+            if (!LoadObjectFromPrecompiled(unique_identifier)) {
                 return {};
+            }
 
-            const auto entry = LoadDecompiledEntry(file);
-            if (!entry)
+            const auto entry = LoadDecompiledEntry();
+            if (!entry) {
                 return {};
+            }
             decompiled.insert({unique_identifier, std::move(*entry)});
             break;
         }
         case PrecompiledEntryKind::Dump: {
             ShaderDiskCacheUsage usage;
-            if (file.ReadBytes(&usage, sizeof(usage)) != sizeof(usage))
+            if (!LoadObjectFromPrecompiled(usage)) {
                 return {};
+            }
 
             ShaderDiskCacheDump dump;
-            if (file.ReadBytes(&dump.binary_format, sizeof(u32)) != sizeof(u32))
-                return {};
-
-            u32 binary_length{};
-            u32 compressed_size{};
-            if (file.ReadBytes(&binary_length, sizeof(u32)) != sizeof(u32) ||
-                file.ReadBytes(&compressed_size, sizeof(u32)) != sizeof(u32)) {
+            if (!LoadObjectFromPrecompiled(dump.binary_format)) {
                 return {};
             }
 
-            std::vector<u8> compressed_binary(compressed_size);
-            if (file.ReadArray(compressed_binary.data(), compressed_binary.size()) !=
-                compressed_binary.size()) {
+            u32 binary_length{};
+            if (!LoadObjectFromPrecompiled(binary_length)) {
                 return {};
             }
 
-            dump.binary = Common::Compression::DecompressDataZSTD(compressed_binary);
-            if (dump.binary.empty()) {
+            dump.binary.resize(binary_length);
+            if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) {
                 return {};
             }
 
@@ -274,45 +281,41 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
     return {{decompiled, dumps}};
 }
 
-std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEntry(
-    FileUtil::IOFile& file) {
+std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEntry() {
     u32 code_size{};
-    u32 compressed_code_size{};
-    if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) ||
-        file.ReadBytes(&compressed_code_size, sizeof(u32)) != sizeof(u32)) {
+    if (!LoadObjectFromPrecompiled(code_size)) {
         return {};
     }
 
-    std::vector<u8> compressed_code(compressed_code_size);
-    if (file.ReadArray(compressed_code.data(), compressed_code.size()) != compressed_code.size()) {
+    std::vector<u8> code(code_size);
+    if (!LoadArrayFromPrecompiled(code.data(), code.size())) {
         return {};
     }
 
-    const std::vector<u8> code = Common::Compression::DecompressDataZSTD(compressed_code);
-    if (code.empty()) {
-        return {};
-    }
     ShaderDiskCacheDecompiled entry;
     entry.code = std::string(reinterpret_cast<const char*>(code.data()), code_size);
 
     u32 const_buffers_count{};
-    if (file.ReadBytes(&const_buffers_count, sizeof(u32)) != sizeof(u32))
+    if (!LoadObjectFromPrecompiled(const_buffers_count)) {
         return {};
+    }
+
     for (u32 i = 0; i < const_buffers_count; ++i) {
         u32 max_offset{};
         u32 index{};
         u8 is_indirect{};
-        if (file.ReadBytes(&max_offset, sizeof(u32)) != sizeof(u32) ||
-            file.ReadBytes(&index, sizeof(u32)) != sizeof(u32) ||
-            file.ReadBytes(&is_indirect, sizeof(u8)) != sizeof(u8)) {
+        if (!LoadObjectFromPrecompiled(max_offset) || !LoadObjectFromPrecompiled(index) ||
+            !LoadObjectFromPrecompiled(is_indirect)) {
             return {};
         }
         entry.entries.const_buffers.emplace_back(max_offset, is_indirect != 0, index);
     }
 
     u32 samplers_count{};
-    if (file.ReadBytes(&samplers_count, sizeof(u32)) != sizeof(u32))
+    if (!LoadObjectFromPrecompiled(samplers_count)) {
         return {};
+    }
+
     for (u32 i = 0; i < samplers_count; ++i) {
         u64 offset{};
         u64 index{};
@@ -320,12 +323,9 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
         u8 is_array{};
         u8 is_shadow{};
         u8 is_bindless{};
-        if (file.ReadBytes(&offset, sizeof(u64)) != sizeof(u64) ||
-            file.ReadBytes(&index, sizeof(u64)) != sizeof(u64) ||
-            file.ReadBytes(&type, sizeof(u32)) != sizeof(u32) ||
-            file.ReadBytes(&is_array, sizeof(u8)) != sizeof(u8) ||
-            file.ReadBytes(&is_shadow, sizeof(u8)) != sizeof(u8) ||
-            file.ReadBytes(&is_bindless, sizeof(u8)) != sizeof(u8)) {
+        if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) ||
+            !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_array) ||
+            !LoadObjectFromPrecompiled(is_shadow) || !LoadObjectFromPrecompiled(is_bindless)) {
             return {};
         }
         entry.entries.samplers.emplace_back(static_cast<std::size_t>(offset),
@@ -335,17 +335,17 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
     }
 
     u32 global_memory_count{};
-    if (file.ReadBytes(&global_memory_count, sizeof(u32)) != sizeof(u32))
+    if (!LoadObjectFromPrecompiled(global_memory_count)) {
         return {};
+    }
+
     for (u32 i = 0; i < global_memory_count; ++i) {
         u32 cbuf_index{};
         u32 cbuf_offset{};
         u8 is_read{};
         u8 is_written{};
-        if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) ||
-            file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32) ||
-            file.ReadBytes(&is_read, sizeof(u8)) != sizeof(u8) ||
-            file.ReadBytes(&is_written, sizeof(u8)) != sizeof(u8)) {
+        if (!LoadObjectFromPrecompiled(cbuf_index) || !LoadObjectFromPrecompiled(cbuf_offset) ||
+            !LoadObjectFromPrecompiled(is_read) || !LoadObjectFromPrecompiled(is_written)) {
             return {};
         }
         entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0,
@@ -354,74 +354,81 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
 
     for (auto& clip_distance : entry.entries.clip_distances) {
         u8 clip_distance_raw{};
-        if (file.ReadBytes(&clip_distance_raw, sizeof(u8)) != sizeof(u8))
+        if (!LoadObjectFromPrecompiled(clip_distance_raw))
             return {};
         clip_distance = clip_distance_raw != 0;
     }
 
     u64 shader_length{};
-    if (file.ReadBytes(&shader_length, sizeof(u64)) != sizeof(u64))
+    if (!LoadObjectFromPrecompiled(shader_length)) {
         return {};
+    }
+
     entry.entries.shader_length = static_cast<std::size_t>(shader_length);
 
     return entry;
 }
 
-bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 unique_identifier,
-                                               const std::string& code,
-                                               const std::vector<u8>& compressed_code,
+bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std::string& code,
                                                const GLShader::ShaderEntries& entries) {
-    if (file.WriteObject(static_cast<u32>(PrecompiledEntryKind::Decompiled)) != 1 ||
-        file.WriteObject(unique_identifier) != 1 ||
-        file.WriteObject(static_cast<u32>(code.size())) != 1 ||
-        file.WriteObject(static_cast<u32>(compressed_code.size())) != 1 ||
-        file.WriteArray(compressed_code.data(), compressed_code.size()) != compressed_code.size()) {
+    if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Decompiled)) ||
+        !SaveObjectToPrecompiled(unique_identifier) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(code.size())) ||
+        !SaveArrayToPrecompiled(code.data(), code.size())) {
         return false;
     }
 
-    if (file.WriteObject(static_cast<u32>(entries.const_buffers.size())) != 1)
+    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.const_buffers.size()))) {
         return false;
+    }
     for (const auto& cbuf : entries.const_buffers) {
-        if (file.WriteObject(static_cast<u32>(cbuf.GetMaxOffset())) != 1 ||
-            file.WriteObject(static_cast<u32>(cbuf.GetIndex())) != 1 ||
-            file.WriteObject(static_cast<u8>(cbuf.IsIndirect() ? 1 : 0)) != 1) {
+        if (!SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetMaxOffset())) ||
+            !SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetIndex())) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(cbuf.IsIndirect() ? 1 : 0))) {
             return false;
         }
     }
 
-    if (file.WriteObject(static_cast<u32>(entries.samplers.size())) != 1)
+    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.samplers.size()))) {
         return false;
+    }
     for (const auto& sampler : entries.samplers) {
-        if (file.WriteObject(static_cast<u64>(sampler.GetOffset())) != 1 ||
-            file.WriteObject(static_cast<u64>(sampler.GetIndex())) != 1 ||
-            file.WriteObject(static_cast<u32>(sampler.GetType())) != 1 ||
-            file.WriteObject(static_cast<u8>(sampler.IsArray() ? 1 : 0)) != 1 ||
-            file.WriteObject(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) != 1 ||
-            file.WriteObject(static_cast<u8>(sampler.IsBindless() ? 1 : 0)) != 1) {
+        if (!SaveObjectToPrecompiled(static_cast<u64>(sampler.GetOffset())) ||
+            !SaveObjectToPrecompiled(static_cast<u64>(sampler.GetIndex())) ||
+            !SaveObjectToPrecompiled(static_cast<u32>(sampler.GetType())) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsArray() ? 1 : 0)) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsBindless() ? 1 : 0))) {
             return false;
         }
     }
 
-    if (file.WriteObject(static_cast<u32>(entries.global_memory_entries.size())) != 1)
+    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.global_memory_entries.size()))) {
         return false;
+    }
     for (const auto& gmem : entries.global_memory_entries) {
-        if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 ||
-            file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1 ||
-            file.WriteObject(static_cast<u8>(gmem.IsRead() ? 1 : 0)) != 1 ||
-            file.WriteObject(static_cast<u8>(gmem.IsWritten() ? 1 : 0)) != 1) {
+        if (!SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufIndex())) ||
+            !SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufOffset())) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsRead() ? 1 : 0)) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsWritten() ? 1 : 0))) {
             return false;
         }
     }
 
     for (const bool clip_distance : entries.clip_distances) {
-        if (file.WriteObject(static_cast<u8>(clip_distance ? 1 : 0)) != 1)
+        if (!SaveObjectToPrecompiled(static_cast<u8>(clip_distance ? 1 : 0))) {
             return false;
+        }
     }
 
-    return file.WriteObject(static_cast<u64>(entries.shader_length)) == 1;
+    if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) {
+        return false;
+    }
+
+    return true;
 }
 
-void ShaderDiskCacheOpenGL::InvalidateTransferable() const {
+void ShaderDiskCacheOpenGL::InvalidateTransferable() {
     if (!FileUtil::Delete(GetTransferablePath())) {
         LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}",
                   GetTransferablePath());
@@ -429,7 +436,10 @@ void ShaderDiskCacheOpenGL::InvalidateTransferable() const {
     InvalidatePrecompiled();
 }
 
-void ShaderDiskCacheOpenGL::InvalidatePrecompiled() const {
+void ShaderDiskCacheOpenGL::InvalidatePrecompiled() {
+    // Clear virtaul precompiled cache file
+    precompiled_cache_virtual_file.Resize(0);
+
     if (!FileUtil::Delete(GetPrecompiledPath())) {
         LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath());
     }
@@ -465,7 +475,10 @@ void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) {
     ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously");
 
     auto& usages{it->second};
-    ASSERT(usages.find(usage) == usages.end());
+    if (usages.find(usage) != usages.end()) {
+        // Skip this variant since the shader is already stored.
+        return;
+    }
     usages.insert(usage);
 
     FileUtil::IOFile file = AppendTransferableFile();
@@ -485,22 +498,13 @@ void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::str
     if (!IsUsable())
         return;
 
-    const std::vector<u8> compressed_code{Common::Compression::CompressDataZSTDDefault(
-        reinterpret_cast<const u8*>(code.data()), code.size())};
-    if (compressed_code.empty()) {
-        LOG_ERROR(Render_OpenGL, "Failed to compress GLSL code - skipping shader {:016x}",
-                  unique_identifier);
-        return;
+    if (precompiled_cache_virtual_file.GetSize() == 0) {
+        SavePrecompiledHeaderToVirtualPrecompiledCache();
     }
 
-    FileUtil::IOFile file = AppendPrecompiledFile();
-    if (!file.IsOpen())
-        return;
-
-    if (!SaveDecompiledFile(file, unique_identifier, code, compressed_code, entries)) {
+    if (!SaveDecompiledFile(unique_identifier, code, entries)) {
         LOG_ERROR(Render_OpenGL,
                   "Failed to save decompiled entry to the precompiled file - removing");
-        file.Close();
         InvalidatePrecompiled();
     }
 }
@@ -516,28 +520,13 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
     std::vector<u8> binary(binary_length);
     glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());
 
-    const std::vector<u8> compressed_binary =
-        Common::Compression::CompressDataZSTDDefault(binary.data(), binary.size());
-
-    if (compressed_binary.empty()) {
-        LOG_ERROR(Render_OpenGL, "Failed to compress binary program in shader={:016x}",
-                  usage.unique_identifier);
-        return;
-    }
-
-    FileUtil::IOFile file = AppendPrecompiledFile();
-    if (!file.IsOpen())
-        return;
-
-    if (file.WriteObject(static_cast<u32>(PrecompiledEntryKind::Dump)) != 1 ||
-        file.WriteObject(usage) != 1 || file.WriteObject(static_cast<u32>(binary_format)) != 1 ||
-        file.WriteObject(static_cast<u32>(binary_length)) != 1 ||
-        file.WriteObject(static_cast<u32>(compressed_binary.size())) != 1 ||
-        file.WriteArray(compressed_binary.data(), compressed_binary.size()) !=
-            compressed_binary.size()) {
+    if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Dump)) ||
+        !SaveObjectToPrecompiled(usage) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(binary_format)) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) ||
+        !SaveArrayToPrecompiled(binary.data(), binary.size())) {
         LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016x} - removing",
                   usage.unique_identifier);
-        file.Close();
         InvalidatePrecompiled();
         return;
     }
@@ -570,28 +559,33 @@ FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const {
     return file;
 }
 
-FileUtil::IOFile ShaderDiskCacheOpenGL::AppendPrecompiledFile() const {
-    if (!EnsureDirectories())
-        return {};
+void ShaderDiskCacheOpenGL::SavePrecompiledHeaderToVirtualPrecompiledCache() {
+    const auto hash{GetShaderCacheVersionHash()};
+    if (!SaveArrayToPrecompiled(hash.data(), hash.size())) {
+        LOG_ERROR(
+            Render_OpenGL,
+            "Failed to write precompiled cache version hash to virtual precompiled cache file");
+    }
+}
+
+void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() {
+    precompiled_cache_virtual_file_offset = 0;
+    const std::vector<u8>& uncompressed = precompiled_cache_virtual_file.ReadAllBytes();
+    const std::vector<u8>& compressed =
+        Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size());
 
     const auto precompiled_path{GetPrecompiledPath()};
-    const bool existed = FileUtil::Exists(precompiled_path);
+    FileUtil::IOFile file(precompiled_path, "wb");
 
-    FileUtil::IOFile file(precompiled_path, "ab");
     if (!file.IsOpen()) {
         LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path);
-        return {};
+        return;
     }
-
-    if (!existed || file.GetSize() == 0) {
-        const auto hash{GetShaderCacheVersionHash()};
-        if (file.WriteArray(hash.data(), hash.size()) != hash.size()) {
-            LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version hash in path={}",
-                      precompiled_path);
-            return {};
-        }
+    if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) {
+        LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}",
+                  precompiled_path);
+        return;
     }
-    return file;
 }
 
 bool ShaderDiskCacheOpenGL::EnsureDirectories() const {
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index 6be0c0547..0142b2e3b 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -16,6 +16,7 @@
 
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "core/file_sys/vfs_vector.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
 
@@ -172,10 +173,10 @@ public:
     LoadPrecompiled();
 
     /// Removes the transferable (and precompiled) cache file.
-    void InvalidateTransferable() const;
+    void InvalidateTransferable();
 
-    /// Removes the precompiled cache file.
-    void InvalidatePrecompiled() const;
+    /// Removes the precompiled cache file and clears virtual precompiled cache file.
+    void InvalidatePrecompiled();
 
     /// Saves a raw dump to the transferable file. Checks for collisions.
     void SaveRaw(const ShaderDiskCacheRaw& entry);
@@ -190,18 +191,21 @@ public:
     /// Saves a dump entry to the precompiled file. Does not check for collisions.
     void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program);
 
+    /// Serializes virtual precompiled shader cache file to real file
+    void SaveVirtualPrecompiledFile();
+
 private:
     /// Loads the transferable cache. Returns empty on failure.
     std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>,
                             std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>>
     LoadPrecompiledFile(FileUtil::IOFile& file);
 
-    /// Loads a decompiled cache entry from the passed file. Returns empty on failure.
-    std::optional<ShaderDiskCacheDecompiled> LoadDecompiledEntry(FileUtil::IOFile& file);
+    /// Loads a decompiled cache entry from m_precompiled_cache_virtual_file. Returns empty on
+    /// failure.
+    std::optional<ShaderDiskCacheDecompiled> LoadDecompiledEntry();
 
     /// Saves a decompiled entry to the passed file. Returns true on success.
-    bool SaveDecompiledFile(FileUtil::IOFile& file, u64 unique_identifier, const std::string& code,
-                            const std::vector<u8>& compressed_code,
+    bool SaveDecompiledFile(u64 unique_identifier, const std::string& code,
                             const GLShader::ShaderEntries& entries);
 
     /// Returns if the cache can be used
@@ -210,8 +214,8 @@ private:
     /// Opens current game's transferable file and write it's header if it doesn't exist
     FileUtil::IOFile AppendTransferableFile() const;
 
-    /// Opens current game's precompiled file and write it's header if it doesn't exist
-    FileUtil::IOFile AppendPrecompiledFile() const;
+    /// Save precompiled header to precompiled_cache_in_memory
+    void SavePrecompiledHeaderToVirtualPrecompiledCache();
 
     /// Create shader disk cache directories. Returns true on success.
     bool EnsureDirectories() const;
@@ -234,10 +238,42 @@ private:
     /// Get current game's title id
     std::string GetTitleID() const;
 
+    template <typename T>
+    bool SaveArrayToPrecompiled(const T* data, std::size_t length) {
+        const std::size_t write_length = precompiled_cache_virtual_file.WriteArray(
+            data, length, precompiled_cache_virtual_file_offset);
+        precompiled_cache_virtual_file_offset += write_length;
+        return write_length == sizeof(T) * length;
+    }
+
+    template <typename T>
+    bool LoadArrayFromPrecompiled(T* data, std::size_t length) {
+        const std::size_t read_length = precompiled_cache_virtual_file.ReadArray(
+            data, length, precompiled_cache_virtual_file_offset);
+        precompiled_cache_virtual_file_offset += read_length;
+        return read_length == sizeof(T) * length;
+    }
+
+    template <typename T>
+    bool SaveObjectToPrecompiled(const T& object) {
+        return SaveArrayToPrecompiled(&object, 1);
+    }
+
+    template <typename T>
+    bool LoadObjectFromPrecompiled(T& object) {
+        return LoadArrayFromPrecompiled(&object, 1);
+    }
+
     // Copre system
     Core::System& system;
     // Stored transferable shaders
     std::map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable;
+    // Stores whole precompiled cache which will be read from or saved to the precompiled chache
+    // file
+    FileSys::VectorVfsFile precompiled_cache_virtual_file;
+    // Stores the current offset of the precompiled cache file for IO purposes
+    std::size_t precompiled_cache_virtual_file_offset;
+
     // The cache has been loaded at boot
     bool tried_to_load{};
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 8763d9c71..6abf948f8 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -16,7 +16,7 @@ using VideoCommon::Shader::ShaderIR;
 
 static constexpr u32 PROGRAM_OFFSET{10};
 
-ProgramResult GenerateVertexShader(const ShaderSetup& setup) {
+ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -34,14 +34,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 
 )";
     ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
-    ProgramResult program = Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
+    ProgramResult program =
+        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
 
     out += program.first;
 
     if (setup.IsDualProgram()) {
         ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
         ProgramResult program_b =
-            Decompile(program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
+            Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
 
         out += program_b.first;
     }
@@ -57,6 +58,9 @@ void main() {
     }
 
     out += R"(
+
+    // Set Position Y direction
+    position.y *= utof(config_pack[2]);
     // Check if the flip stage is VertexB
     // Config pack's second value is flip_stage
     if (config_pack[1] == 1) {
@@ -75,7 +79,7 @@ void main() {
     return {out, program.second};
 }
 
-ProgramResult GenerateGeometryShader(const ShaderSetup& setup) {
+ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -95,7 +99,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 )";
     ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
-        Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
+        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
     out += program.first;
 
     out += R"(
@@ -106,7 +110,7 @@ void main() {
     return {out, program.second};
 }
 
-ProgramResult GenerateFragmentShader(const ShaderSetup& setup) {
+ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -158,7 +162,7 @@ bool AlphaFunc(in float value) {
 )";
     ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
-        Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
+        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
 
     out += program.first;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index fad346b48..0536c8a03 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -10,6 +10,10 @@
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/shader/shader_ir.h"
 
+namespace OpenGL {
+class Device;
+}
+
 namespace OpenGL::GLShader {
 
 using VideoCommon::Shader::ProgramCode;
@@ -39,22 +43,13 @@ private:
     bool has_program_b{};
 };
 
-/**
- * Generates the GLSL vertex shader program source code for the given VS program
- * @returns String of the shader source code
- */
-ProgramResult GenerateVertexShader(const ShaderSetup& setup);
-
-/**
- * Generates the GLSL geometry shader program source code for the given GS program
- * @returns String of the shader source code
- */
-ProgramResult GenerateGeometryShader(const ShaderSetup& setup);
-
-/**
- * Generates the GLSL fragment shader program source code for the given FS program
- * @returns String of the shader source code
- */
-ProgramResult GenerateFragmentShader(const ShaderSetup& setup);
+/// Generates the GLSL vertex shader program source code for the given VS program
+ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup);
+
+/// Generates the GLSL geometry shader program source code for the given GS program
+ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup);
+
+/// Generates the GLSL fragment shader program source code for the given FS program
+ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);
 
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 52d569a1b..7425fbe5d 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -471,8 +471,9 @@ void OpenGLState::ApplyTextures() const {
         const auto& texture_unit = texture_units[i];
         auto& cur_state_texture_unit = cur_state.texture_units[i];
         textures[i] = texture_unit.texture;
-        if (cur_state_texture_unit.texture == textures[i])
+        if (cur_state_texture_unit.texture == textures[i]) {
             continue;
+        }
         cur_state_texture_unit.texture = textures[i];
         if (!has_delta) {
             first = i;
@@ -493,10 +494,11 @@ void OpenGLState::ApplySamplers() const {
     std::array<GLuint, Maxwell::NumTextureSamplers> samplers;
 
     for (std::size_t i = 0; i < std::size(samplers); ++i) {
-        if (cur_state.texture_units[i].sampler == texture_units[i].sampler)
+        samplers[i] = texture_units[i].sampler;
+        if (cur_state.texture_units[i].sampler == texture_units[i].sampler) {
             continue;
+        }
         cur_state.texture_units[i].sampler = texture_units[i].sampler;
-        samplers[i] = texture_units[i].sampler;
         if (!has_delta) {
             first = i;
             has_delta = true;
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index a8833c06e..95b773135 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -27,8 +27,7 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
     case Maxwell::VertexAttribute::Type::UnsignedInt:
-    case Maxwell::VertexAttribute::Type::UnsignedNorm: {
-
+    case Maxwell::VertexAttribute::Type::UnsignedNorm:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -47,16 +46,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
+        default:
+            LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+            UNREACHABLE();
+            return {};
         }
-
-        LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-        UNREACHABLE();
-        return {};
-    }
-
     case Maxwell::VertexAttribute::Type::SignedInt:
-    case Maxwell::VertexAttribute::Type::SignedNorm: {
-
+    case Maxwell::VertexAttribute::Type::SignedNorm:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -75,14 +71,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_INT_2_10_10_10_REV;
+        default:
+            LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+            UNREACHABLE();
+            return {};
         }
-
-        LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-        UNREACHABLE();
-        return {};
-    }
-
-    case Maxwell::VertexAttribute::Type::Float: {
+    case Maxwell::VertexAttribute::Type::Float:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_16:
         case Maxwell::VertexAttribute::Size::Size_16_16:
@@ -94,13 +88,16 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_32_32_32:
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return GL_FLOAT;
+        default:
+            LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+            UNREACHABLE();
+            return {};
         }
+    default:
+        LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
+        UNREACHABLE();
+        return {};
     }
-    }
-
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
-    UNREACHABLE();
-    return {};
 }
 
 inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -129,10 +126,11 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
         return GL_TRIANGLES;
     case Maxwell::PrimitiveTopology::TriangleStrip:
         return GL_TRIANGLE_STRIP;
+    default:
+        LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
+        UNREACHABLE();
+        return {};
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
-    UNREACHABLE();
-    return {};
 }
 
 inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
@@ -186,9 +184,10 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         } else {
             return GL_MIRROR_CLAMP_TO_EDGE;
         }
+    default:
+        LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+        return GL_REPEAT;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
-    return GL_REPEAT;
 }
 
 inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 34bf26ff2..9fe1e3280 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -62,9 +62,10 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode) {
     case Tegra::Texture::WrapMode::MirrorOnceBorder:
         UNIMPLEMENTED();
         return vk::SamplerAddressMode::eMirrorClampToEdge;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
+        return {};
     }
-    UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
-    return {};
 }
 
 vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) {
@@ -225,9 +226,10 @@ vk::PrimitiveTopology PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
         return vk::PrimitiveTopology::eTriangleList;
     case Maxwell::PrimitiveTopology::TriangleStrip:
         return vk::PrimitiveTopology::eTriangleStrip;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
+        return {};
     }
-    UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
-    return {};
 }
 
 vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 25500f9a3..23d9b10db 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -76,14 +76,10 @@ constexpr u32 GetGenericAttributeLocation(Attribute::Index attribute) {
 
 /// Returns true if an object has to be treated as precise
 bool IsPrecise(Operation operand) {
-    const auto& meta = operand.GetMeta();
-
+    const auto& meta{operand.GetMeta()};
     if (std::holds_alternative<MetaArithmetic>(meta)) {
         return std::get<MetaArithmetic>(meta).precise;
     }
-    if (std::holds_alternative<MetaHalfArithmetic>(meta)) {
-        return std::get<MetaHalfArithmetic>(meta).precise;
-    }
     return false;
 }
 
@@ -746,6 +742,16 @@ private:
         return {};
     }
 
+    Id HClamp(Operation operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id HUnpack(Operation operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id HMergeF32(Operation operation) {
         UNIMPLEMENTED();
         return {};
@@ -1218,6 +1224,8 @@ private:
         &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>,
         &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>,
         &SPIRVDecompiler::HNegate,
+        &SPIRVDecompiler::HClamp,
+        &SPIRVDecompiler::HUnpack,
         &SPIRVDecompiler::HMergeF32,
         &SPIRVDecompiler::HMergeH0,
         &SPIRVDecompiler::HMergeH1,
@@ -1260,6 +1268,13 @@ private:
         &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>,
         &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>,
         &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>,
+        // TODO(Rodrigo): Should these use the OpFUnord* variants?
+        &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>,
 
         &SPIRVDecompiler::Texture,
         &SPIRVDecompiler::TextureLod,
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index e4c438792..2da595c0d 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -116,6 +116,8 @@ ExitMethod ShaderIR::Scan(u32 begin, u32 end, std::set<u32>& labels) {
             // Continue scanning for an exit method.
             break;
         }
+        default:
+            break;
         }
     }
     return exit_method = ExitMethod::AlwaysReturn;
@@ -206,4 +208,4 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
     return pc + 1;
 }
 
-} // namespace VideoCommon::Shader
-\ No newline at end of file
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp
index baee89107..2098c1170 100644
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -9,6 +9,7 @@
 
 namespace VideoCommon::Shader {
 
+using Tegra::Shader::HalfType;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 
@@ -18,48 +19,50 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
 
     if (opcode->get().GetId() == OpCode::Id::HADD2_C ||
         opcode->get().GetId() == OpCode::Id::HADD2_R) {
-        UNIMPLEMENTED_IF(instr.alu_half.ftz != 0);
+        if (instr.alu_half.ftz != 0) {
+            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        }
     }
-    UNIMPLEMENTED_IF_MSG(instr.alu_half.saturate != 0, "Half float saturation not implemented");
 
     const bool negate_a =
         opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0;
     const bool negate_b =
         opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0;
 
-    const Node op_a = GetOperandAbsNegHalf(GetRegister(instr.gpr8), instr.alu_half.abs_a, negate_a);
-
-    // instr.alu_half.type_a
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a);
 
-    Node op_b = [&]() {
+    auto [type_b, op_b] = [&]() -> std::tuple<HalfType, Node> {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_C:
         case OpCode::Id::HMUL2_C:
-            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            return {HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())};
         case OpCode::Id::HADD2_R:
         case OpCode::Id::HMUL2_R:
-            return GetRegister(instr.gpr20);
+            return {instr.alu_half.type_b, GetRegister(instr.gpr20)};
         default:
             UNREACHABLE();
-            return Immediate(0);
+            return {HalfType::F32, Immediate(0)};
         }
     }();
-    op_b = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b);
+    op_b = UnpackHalfFloat(op_b, type_b);
+    // redeclaration to avoid a bug in clang with reusing local bindings in lambdas
+    Node op_b_alt = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b);
 
     Node value = [&]() {
-        MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a, instr.alu_half.type_b}};
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_C:
         case OpCode::Id::HADD2_R:
-            return Operation(OperationCode::HAdd, meta, op_a, op_b);
+            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b_alt);
         case OpCode::Id::HMUL2_C:
         case OpCode::Id::HMUL2_R:
-            return Operation(OperationCode::HMul, meta, op_a, op_b);
+            return Operation(OperationCode::HMul, PRECISE, op_a, op_b_alt);
         default:
             UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName());
             return Immediate(0);
         }
     }();
+    value = GetSaturatedHalfFloat(value, instr.alu_half.saturate);
     value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half.merge);
 
     SetRegister(bb, instr.gpr0, value);
@@ -67,4 +70,4 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
     return pc;
 }
 
-} // namespace VideoCommon::Shader
-\ No newline at end of file
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index c2164ba50..fbcd35b18 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -17,34 +17,33 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) {
-        UNIMPLEMENTED_IF(instr.alu_half_imm.ftz != 0);
+        if (instr.alu_half_imm.ftz != 0) {
+            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        }
     } else {
         UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
     }
-    UNIMPLEMENTED_IF_MSG(instr.alu_half_imm.saturate != 0,
-                         "Half float immediate saturation not implemented");
 
-    Node op_a = GetRegister(instr.gpr8);
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);
     op_a = GetOperandAbsNegHalf(op_a, instr.alu_half_imm.abs_a, instr.alu_half_imm.negate_a);
 
     const Node op_b = UnpackHalfImmediate(instr, true);
 
     Node value = [&]() {
-        MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a}};
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_IMM:
-            return Operation(OperationCode::HAdd, meta, op_a, op_b);
+            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b);
         case OpCode::Id::HMUL2_IMM:
-            return Operation(OperationCode::HMul, meta, op_a, op_b);
+            return Operation(OperationCode::HMul, PRECISE, op_a, op_b);
         default:
             UNREACHABLE();
             return Immediate(0);
         }
     }();
-    value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge);
 
+    value = GetSaturatedHalfFloat(value, instr.alu_half_imm.saturate);
+    value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge);
     SetRegister(bb, instr.gpr0, value);
-
     return pc;
 }
 
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 55a6fbbf2..b5ec9a6f5 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -18,13 +18,29 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     switch (opcode->get().GetId()) {
-    case OpCode::Id::I2I_R: {
+    case OpCode::Id::I2I_R:
+    case OpCode::Id::I2I_C:
+    case OpCode::Id::I2I_IMM: {
         UNIMPLEMENTED_IF(instr.conversion.selector);
+        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.alu.saturate_d);
 
         const bool input_signed = instr.conversion.is_input_signed;
         const bool output_signed = instr.conversion.is_output_signed;
 
-        Node value = GetRegister(instr.gpr20);
+        Node value = [&]() {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::I2I_R:
+                return GetRegister(instr.gpr20);
+            case OpCode::Id::I2I_C:
+                return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::I2I_IMM:
+                return Immediate(instr.alu.GetSignedImm20_20());
+            default:
+                UNREACHABLE();
+                return Immediate(0);
+            }
+        }();
         value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
 
         value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a,
@@ -38,17 +54,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::I2F_R:
-    case OpCode::Id::I2F_C: {
-        UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word);
+    case OpCode::Id::I2F_C:
+    case OpCode::Id::I2F_IMM: {
+        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
         UNIMPLEMENTED_IF(instr.conversion.selector);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in I2F is not implemented");
 
         Node value = [&]() {
-            if (instr.is_b_gpr) {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::I2F_R:
                 return GetRegister(instr.gpr20);
-            } else {
+            case OpCode::Id::I2F_C:
                 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::I2F_IMM:
+                return Immediate(instr.alu.GetSignedImm20_20());
+            default:
+                UNREACHABLE();
+                return Immediate(0);
             }
         }();
         const bool input_signed = instr.conversion.is_input_signed;
@@ -62,24 +85,31 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::F2F_R:
-    case OpCode::Id::F2F_C: {
-        UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word);
-        UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
+    case OpCode::Id::F2F_C:
+    case OpCode::Id::F2F_IMM: {
+        UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2F is not implemented");
 
         Node value = [&]() {
-            if (instr.is_b_gpr) {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::F2F_R:
                 return GetRegister(instr.gpr20);
-            } else {
+            case OpCode::Id::F2F_C:
                 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::F2F_IMM:
+                return GetImmediate19(instr);
+            default:
+                UNREACHABLE();
+                return Immediate(0);
             }
         }();
 
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
 
         value = [&]() {
-            switch (instr.conversion.f2f.rounding) {
+            switch (instr.conversion.f2f.GetRoundingMode()) {
             case Tegra::Shader::F2fRoundingOp::None:
                 return value;
             case Tegra::Shader::F2fRoundingOp::Round:
@@ -90,10 +120,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
                 return Operation(OperationCode::FCeil, PRECISE, value);
             case Tegra::Shader::F2fRoundingOp::Trunc:
                 return Operation(OperationCode::FTrunc, PRECISE, value);
+            default:
+                UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}",
+                                  static_cast<u32>(instr.conversion.f2f.rounding.Value()));
+                return Immediate(0);
             }
-            UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}",
-                              static_cast<u32>(instr.conversion.f2f.rounding.Value()));
-            return Immediate(0);
         }();
         value = GetSaturatedFloat(value, instr.alu.saturate_d);
 
@@ -102,15 +133,22 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::F2I_R:
-    case OpCode::Id::F2I_C: {
+    case OpCode::Id::F2I_C:
+    case OpCode::Id::F2I_IMM: {
         UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2I is not implemented");
         Node value = [&]() {
-            if (instr.is_b_gpr) {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::F2I_R:
                 return GetRegister(instr.gpr20);
-            } else {
+            case OpCode::Id::F2I_C:
                 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::F2I_IMM:
+                return GetImmediate19(instr);
+            default:
+                UNREACHABLE();
+                return Immediate(0);
             }
         }();
 
@@ -134,7 +172,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         }();
         const bool is_signed = instr.conversion.is_output_signed;
         value = SignedOperation(OperationCode::ICastFloat, is_signed, PRECISE, value);
-        value = ConvertIntegerSize(value, instr.conversion.dest_size, is_signed);
+        value = ConvertIntegerSize(value, instr.conversion.dst_size, is_signed);
 
         SetRegister(bb, instr.gpr0, value);
         break;
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 748368555..1dd94bf9d 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -18,11 +18,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    UNIMPLEMENTED_IF(instr.hset2.ftz != 0);
+    if (instr.hset2.ftz != 0) {
+        LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+    }
+
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
 
-    // instr.hset2.type_a
-    // instr.hset2.type_b
-    Node op_a = GetRegister(instr.gpr8);
     Node op_b = [&]() {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HSET2_R:
@@ -32,14 +34,12 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
             return Immediate(0);
         }
     }();
-
-    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
+    op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
     op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
 
     const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
 
-    MetaHalfArithmetic meta{false, {instr.hset2.type_a, instr.hset2.type_b}};
-    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, meta, op_a, op_b);
+    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
 
     const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
 
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index e68512692..6e59eb650 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -19,10 +19,10 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
 
     UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
 
-    Node op_a = GetRegister(instr.gpr8);
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
     op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
 
-    const Node op_b = [&]() {
+    Node op_b = [&]() {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HSETP2_R:
             return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
@@ -32,6 +32,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
             return Immediate(0);
         }
     }();
+    op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
 
     // We can't use the constant predicate as destination.
     ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
@@ -42,8 +43,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
     const OperationCode pair_combiner =
         instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
 
-    MetaHalfArithmetic meta = {false, {instr.hsetp2.type_a, instr.hsetp2.type_b}};
-    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, meta, op_a, op_b);
+    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
     const Node first_pred = Operation(pair_combiner, comparison);
 
     // Set the primary predicate to the result of Predicate OP SecondPredicate
diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp
index 7a07c5ec6..a425f9eb7 100644
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -27,10 +27,6 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
     }
 
     constexpr auto identity = HalfType::H0_H1;
-
-    const HalfType type_a = instr.hfma2.type_a;
-    const Node op_a = GetRegister(instr.gpr8);
-
     bool neg_b{}, neg_c{};
     auto [saturate, type_b, op_b, type_c,
           op_c] = [&]() -> std::tuple<bool, HalfType, Node, HalfType, Node> {
@@ -38,15 +34,14 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
         case OpCode::Id::HFMA2_CR:
             neg_b = instr.hfma2.negate_b;
             neg_c = instr.hfma2.negate_c;
-            return {instr.hfma2.saturate, instr.hfma2.type_b,
+            return {instr.hfma2.saturate, HalfType::F32,
                     GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()),
                     instr.hfma2.type_reg39, GetRegister(instr.gpr39)};
         case OpCode::Id::HFMA2_RC:
             neg_b = instr.hfma2.negate_b;
             neg_c = instr.hfma2.negate_c;
             return {instr.hfma2.saturate, instr.hfma2.type_reg39, GetRegister(instr.gpr39),
-                    instr.hfma2.type_b,
-                    GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())};
+                    HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())};
         case OpCode::Id::HFMA2_RR:
             neg_b = instr.hfma2.rr.negate_b;
             neg_c = instr.hfma2.rr.negate_c;
@@ -60,13 +55,13 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
             return {false, identity, Immediate(0), identity, Immediate(0)};
         }
     }();
-    UNIMPLEMENTED_IF_MSG(saturate, "HFMA2 saturation is not implemented");
 
-    op_b = GetOperandAbsNegHalf(op_b, false, neg_b);
-    op_c = GetOperandAbsNegHalf(op_c, false, neg_c);
+    const Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hfma2.type_a);
+    op_b = GetOperandAbsNegHalf(UnpackHalfFloat(op_b, type_b), false, neg_b);
+    op_c = GetOperandAbsNegHalf(UnpackHalfFloat(op_c, type_c), false, neg_c);
 
-    MetaHalfArithmetic meta{true, {type_a, type_b, type_c}};
-    Node value = Operation(OperationCode::HFma, meta, op_a, op_b, op_c);
+    Node value = Operation(OperationCode::HFma, PRECISE, op_a, op_b, op_c);
+    value = GetSaturatedHalfFloat(value, saturate);
     value = HalfMerge(GetRegister(instr.gpr0), value, instr.hfma2.merge);
 
     SetRegister(bb, instr.gpr0, value);
@@ -74,4 +69,4 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
     return pc;
 }
 
-} // namespace VideoCommon::Shader
-\ No newline at end of file
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index fa65ac9a9..8b574d4e5 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -296,7 +296,7 @@ const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg,
     ASSERT(cbuf_offset_imm != nullptr);
     const auto cbuf_offset = cbuf_offset_imm->GetValue();
     const auto cbuf_index = cbuf->GetIndex();
-    const u64 cbuf_key = (cbuf_index << 32) | cbuf_offset;
+    const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset);
 
     // If this sampler has already been used, return the existing mapping.
     const auto itr =
@@ -541,7 +541,6 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
                             bool is_array, bool is_aoffi) {
     const std::size_t coord_count = GetCoordCount(texture_type);
     const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0);
-    const std::size_t total_reg_count = total_coord_count + (depth_compare ? 1 : 0);
 
     // If enabled arrays index is always stored in the gpr8 field
     const u64 array_register = instr.gpr8.Value();
diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp
index db15c0718..04a776398 100644
--- a/src/video_core/shader/decode/xmad.cpp
+++ b/src/video_core/shader/decode/xmad.cpp
@@ -56,9 +56,10 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
                     instr.xmad.mode,
                     Immediate(static_cast<u32>(instr.xmad.imm20_16)),
                     GetRegister(instr.gpr39)};
+        default:
+            UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName());
+            return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)};
         }
-        UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName());
-        return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)};
     }();
 
     op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16);
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index ac5112d78..e4eb0dfd9 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -189,7 +189,11 @@ Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) {
     const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
     const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);
 
-    return Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, first_negate, second_negate);
+    return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate);
+}
+
+Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) {
+    return Operation(OperationCode::HUnpack, type, value);
 }
 
 Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
@@ -209,17 +213,26 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
 
 Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) {
     if (absolute) {
-        value = Operation(OperationCode::HAbsolute, HALF_NO_PRECISE, value);
+        value = Operation(OperationCode::HAbsolute, NO_PRECISE, value);
     }
     if (negate) {
-        value = Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, GetPredicate(true),
+        value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true),
                           GetPredicate(true));
     }
     return value;
 }
 
+Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) {
+    if (!saturate) {
+        return value;
+    }
+    const Node positive_zero = Immediate(std::copysignf(0, 1));
+    const Node positive_one = Immediate(1.0f);
+    return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one);
+}
+
 Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) {
-    static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
+    const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
         {PredCondition::LessThan, OperationCode::LogicalFLessThan},
         {PredCondition::Equal, OperationCode::LogicalFEqual},
         {PredCondition::LessEqual, OperationCode::LogicalFLessEqual},
@@ -255,7 +268,7 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N
 
 Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a,
                                              Node op_b) {
-    static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
+    const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
         {PredCondition::LessThan, OperationCode::LogicalILessThan},
         {PredCondition::Equal, OperationCode::LogicalIEqual},
         {PredCondition::LessEqual, OperationCode::LogicalILessEqual},
@@ -283,40 +296,32 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si
     return predicate;
 }
 
-Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition,
-                                          const MetaHalfArithmetic& meta, Node op_a, Node op_b) {
-
-    UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan ||
-                             condition == PredCondition::NotEqualWithNan ||
-                             condition == PredCondition::LessEqualWithNan ||
-                             condition == PredCondition::GreaterThanWithNan ||
-                             condition == PredCondition::GreaterEqualWithNan,
-                         "Unimplemented NaN comparison for half floats");
-
-    static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
+Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a,
+                                          Node op_b) {
+    const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
         {PredCondition::LessThan, OperationCode::Logical2HLessThan},
         {PredCondition::Equal, OperationCode::Logical2HEqual},
         {PredCondition::LessEqual, OperationCode::Logical2HLessEqual},
         {PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan},
         {PredCondition::NotEqual, OperationCode::Logical2HNotEqual},
         {PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual},
-        {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThan},
-        {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqual},
-        {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqual},
-        {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThan},
-        {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqual}};
+        {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan},
+        {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan},
+        {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan},
+        {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan},
+        {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}};
 
     const auto comparison{PredicateComparisonTable.find(condition)};
     UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
                          "Unknown predicate comparison operation");
 
-    const Node predicate = Operation(comparison->second, meta, op_a, op_b);
+    const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b);
 
     return predicate;
 }
 
 OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) {
-    static const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = {
+    const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = {
         {PredOperation::And, OperationCode::LogicalAnd},
         {PredOperation::Or, OperationCode::LogicalOr},
         {PredOperation::Xor, OperationCode::LogicalXor},
@@ -434,11 +439,14 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
         return OperationCode::LogicalUGreaterEqual;
     case OperationCode::INegate:
         UNREACHABLE_MSG("Can't negate an unsigned integer");
+        return {};
     case OperationCode::IAbsolute:
         UNREACHABLE_MSG("Can't apply absolute to an unsigned integer");
+        return {};
+    default:
+        UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code));
+        return {};
     }
-    UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code));
-    return {};
 }
 
-} // namespace VideoCommon::Shader
-\ No newline at end of file
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 57af8b10f..65f1e1de9 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -109,11 +109,13 @@ enum class OperationCode {
     UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
     UBitCount,        /// (MetaArithmetic, uint) -> uint
 
-    HAdd,      /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HMul,      /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HFma,      /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
+    HAdd,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HMul,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HFma,      /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
     HAbsolute, /// (f16vec2 a) -> f16vec2
     HNegate,   /// (f16vec2 a, bool first, bool second) -> f16vec2
+    HClamp,    /// (f16vec2 src, float min, float max) -> f16vec2
+    HUnpack,   /// (Tegra::Shader::HalfType, T value) -> f16vec2
     HMergeF32, /// (f16vec2 src) -> float
     HMergeH0,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
     HMergeH1,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
@@ -150,12 +152,18 @@ enum class OperationCode {
     LogicalUNotEqual,     /// (uint a, uint b) -> bool
     LogicalUGreaterEqual, /// (uint a, uint b) -> bool
 
-    Logical2HLessThan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HEqual,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HLessEqual,    /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterThan,  /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HNotEqual,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessThan,            /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HEqual,               /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessEqual,           /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterThan,         /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HNotEqual,            /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterEqual,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessThanWithNan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HEqualWithNan,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessEqualWithNan,    /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterThanWithNan,  /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HNotEqualWithNan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
 
     Texture,                /// (MetaTexture, float[N] coords) -> float4
     TextureLod,             /// (MetaTexture, float[N] coords) -> float4
@@ -243,8 +251,9 @@ public:
     }
 
     bool operator<(const Sampler& rhs) const {
-        return std::tie(offset, index, type, is_array, is_shadow) <
-               std::tie(rhs.offset, rhs.index, rhs.type, rhs.is_array, rhs.is_shadow);
+        return std::tie(index, offset, type, is_array, is_shadow, is_bindless) <
+               std::tie(rhs.index, rhs.offset, rhs.type, rhs.is_array, rhs.is_shadow,
+                        rhs.is_bindless);
     }
 
 private:
@@ -308,13 +317,6 @@ struct MetaArithmetic {
     bool precise{};
 };
 
-struct MetaHalfArithmetic {
-    bool precise{};
-    std::array<Tegra::Shader::HalfType, 3> types = {Tegra::Shader::HalfType::H0_H1,
-                                                    Tegra::Shader::HalfType::H0_H1,
-                                                    Tegra::Shader::HalfType::H0_H1};
-};
-
 struct MetaTexture {
     const Sampler& sampler;
     Node array{};
@@ -326,11 +328,10 @@ struct MetaTexture {
     u32 element{};
 };
 
-constexpr MetaArithmetic PRECISE = {true};
-constexpr MetaArithmetic NO_PRECISE = {false};
-constexpr MetaHalfArithmetic HALF_NO_PRECISE = {false};
+inline constexpr MetaArithmetic PRECISE = {true};
+inline constexpr MetaArithmetic NO_PRECISE = {false};
 
-using Meta = std::variant<MetaArithmetic, MetaHalfArithmetic, MetaTexture>;
+using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
 
 /// Holds any kind of operation that can be done in the IR
 class OperationNode final {
@@ -734,10 +735,14 @@ private:
 
     /// Unpacks a half immediate from an instruction
     Node UnpackHalfImmediate(Tegra::Shader::Instruction instr, bool has_negation);
+    /// Unpacks a binary value into a half float pair with a type format
+    Node UnpackHalfFloat(Node value, Tegra::Shader::HalfType type);
     /// Merges a half pair into another value
     Node HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge);
     /// Conditionally absolute/negated half float pair. Absolute is applied first
     Node GetOperandAbsNegHalf(Node value, bool absolute, bool negate);
+    /// Conditionally saturates a half float pair
+    Node GetSaturatedHalfFloat(Node value, bool saturate = true);
 
     /// Returns a predicate comparing two floats
     Node GetPredicateComparisonFloat(Tegra::Shader::PredCondition condition, Node op_a, Node op_b);
@@ -745,8 +750,7 @@ private:
     Node GetPredicateComparisonInteger(Tegra::Shader::PredCondition condition, bool is_signed,
                                        Node op_a, Node op_b);
     /// Returns a predicate comparing two half floats. meta consumes how both pairs will be compared
-    Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition,
-                                    const MetaHalfArithmetic& meta, Node op_a, Node op_b);
+    Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b);
 
     /// Returns a predicate combiner operation
     OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index 3b022a456..6384fa8d2 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -178,39 +178,44 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format,
             return PixelFormat::ABGR8S;
         case Tegra::Texture::ComponentType::UINT:
             return PixelFormat::ABGR8UI;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::B5G6R5:
         switch (component_type) {
         case Tegra::Texture::ComponentType::UNORM:
             return PixelFormat::B5G6R5U;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::A2B10G10R10:
         switch (component_type) {
         case Tegra::Texture::ComponentType::UNORM:
             return PixelFormat::A2B10G10R10U;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::A1B5G5R5:
         switch (component_type) {
         case Tegra::Texture::ComponentType::UNORM:
             return PixelFormat::A1B5G5R5U;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::R8:
         switch (component_type) {
         case Tegra::Texture::ComponentType::UNORM:
             return PixelFormat::R8U;
         case Tegra::Texture::ComponentType::UINT:
             return PixelFormat::R8UI;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::G8R8:
         // TextureFormat::G8R8 is actually ordered red then green, as such we can use
         // PixelFormat::RG8U and PixelFormat::RG8S. This was tested with The Legend of Zelda: Breath
@@ -220,50 +225,55 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format,
             return PixelFormat::RG8U;
         case Tegra::Texture::ComponentType::SNORM:
             return PixelFormat::RG8S;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::R16_G16_B16_A16:
         switch (component_type) {
         case Tegra::Texture::ComponentType::UNORM:
             return PixelFormat::RGBA16U;
         case Tegra::Texture::ComponentType::FLOAT:
             return PixelFormat::RGBA16F;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::BF10GF11RF11:
         switch (component_type) {
         case Tegra::Texture::ComponentType::FLOAT:
             return PixelFormat::R11FG11FB10F;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
     case Tegra::Texture::TextureFormat::R32_G32_B32_A32:
         switch (component_type) {
         case Tegra::Texture::ComponentType::FLOAT:
             return PixelFormat::RGBA32F;
         case Tegra::Texture::ComponentType::UINT:
             return PixelFormat::RGBA32UI;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::R32_G32:
         switch (component_type) {
         case Tegra::Texture::ComponentType::FLOAT:
             return PixelFormat::RG32F;
         case Tegra::Texture::ComponentType::UINT:
             return PixelFormat::RG32UI;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::R32_G32_B32:
         switch (component_type) {
         case Tegra::Texture::ComponentType::FLOAT:
             return PixelFormat::RGB32F;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::R16:
         switch (component_type) {
         case Tegra::Texture::ComponentType::FLOAT:
@@ -276,18 +286,20 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format,
             return PixelFormat::R16UI;
         case Tegra::Texture::ComponentType::SINT:
             return PixelFormat::R16I;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::R32:
         switch (component_type) {
         case Tegra::Texture::ComponentType::FLOAT:
             return PixelFormat::R32F;
         case Tegra::Texture::ComponentType::UINT:
             return PixelFormat::R32UI;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::ZF32:
         return PixelFormat::Z32F;
     case Tegra::Texture::TextureFormat::Z16:
@@ -310,9 +322,10 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format,
             return PixelFormat::DXN2UNORM;
         case Tegra::Texture::ComponentType::SNORM:
             return PixelFormat::DXN2SNORM;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     case Tegra::Texture::TextureFormat::BC7U:
         return is_srgb ? PixelFormat::BC7U_SRGB : PixelFormat::BC7U;
     case Tegra::Texture::TextureFormat::BC6H_UF16:
@@ -343,15 +356,17 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format,
             return PixelFormat::RG16UI;
         case Tegra::Texture::ComponentType::SINT:
             return PixelFormat::RG16I;
+        default:
+            break;
         }
-        LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type));
-        UNREACHABLE();
+        break;
     default:
-        LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", static_cast<u32>(format),
-                     static_cast<u32>(component_type));
-        UNREACHABLE();
-        return PixelFormat::ABGR8U;
+        break;
     }
+    LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", static_cast<u32>(format),
+                 static_cast<u32>(component_type));
+    UNREACHABLE();
+    return PixelFormat::ABGR8U;
 }
 
 ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type) {
@@ -513,8 +528,9 @@ bool IsFormatBCn(PixelFormat format) {
     case PixelFormat::DXT45_SRGB:
     case PixelFormat::BC7U_SRGB:
         return true;
+    default:
+        return false;
     }
-    return false;
 }
 
 } // namespace VideoCore::Surface
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index b508d64e9..eafb6b73a 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -1616,6 +1616,7 @@ namespace Tegra::Texture::ASTC {
 std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height,
                                 uint32_t depth, uint32_t block_width, uint32_t block_height) {
     uint32_t blockIdx = 0;
+    std::size_t depth_offset = 0;
     std::vector<uint8_t> outData(height * width * depth * 4);
     for (uint32_t k = 0; k < depth; k++) {
         for (uint32_t j = 0; j < height; j += block_height) {
@@ -1630,7 +1631,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he
                 uint32_t decompWidth = std::min(block_width, width - i);
                 uint32_t decompHeight = std::min(block_height, height - j);
 
-                uint8_t* outRow = outData.data() + (j * width + i) * 4;
+                uint8_t* outRow = depth_offset + outData.data() + (j * width + i) * 4;
                 for (uint32_t jj = 0; jj < decompHeight; jj++) {
                     memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4);
                 }
@@ -1638,6 +1639,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he
                 blockIdx++;
             }
         }
+        depth_offset += height * width * 4;
     }
 
     return outData;
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 995d0e068..217805386 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -288,6 +288,29 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
     }
 }
 
+void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
+                   const u32 block_height, const std::size_t copy_size, const u8* source_data,
+                   u8* swizzle_data) {
+    const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x};
+    std::size_t count = 0;
+    for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
+        const std::size_t gob_address_y =
+            (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
+            ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
+        const auto& table = legacy_swizzle_table[y % gob_size_y];
+        for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {
+            const std::size_t gob_address =
+                gob_address_y + (x / gob_size_x) * gob_size * block_height;
+            const std::size_t swizzled_offset = gob_address + table[x % gob_size_x];
+            const u8* source_line = source_data + count;
+            u8* dest_addr = swizzle_data + swizzled_offset;
+            count++;
+
+            std::memcpy(dest_addr, source_line, 1);
+        }
+    }
+}
+
 std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width,
                               u32 height) {
     std::vector<u8> rgba_data;
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index e078fa274..e072d8401 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -51,4 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
                       u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
                       u32 offset_x, u32 offset_y);
 
+void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
+                   const u32 block_height, const std::size_t copy_size, const u8* source_data,
+                   u8* swizzle_data);
+
 } // namespace Tegra::Texture