39 files changed, 652 insertions, 282 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 1e31a2900..6821f275d 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -36,6 +36,8 @@ add_library(video_core STATIC
     renderer_base.h
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
+    renderer_opengl/gl_device.cpp
+    renderer_opengl/gl_device.h
     renderer_opengl/gl_global_cache.cpp
     renderer_opengl/gl_global_cache.h
     renderer_opengl/gl_primitive_assembler.cpp
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 046d047cb..6674d9405 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -57,8 +57,8 @@ bool DmaPusher::Step() {
 
     // Push buffer non-empty, read a word
     command_headers.resize(command_list_header.size);
-    gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(),
-                                  command_list_header.size * sizeof(u32));
+    gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
+                                        command_list_header.size * sizeof(u32));
 
     for (const CommandHeader& command_header : command_headers) {
 
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index cd51a31d7..7387886a3 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -10,6 +10,7 @@
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
@@ -27,30 +28,46 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
 
     switch (method_call.method) {
     case KEPLERMEMORY_REG_INDEX(exec): {
-        state.write_offset = 0;
+        ProcessExec();
         break;
     }
     case KEPLERMEMORY_REG_INDEX(data): {
-        ProcessData(method_call.argument);
+        ProcessData(method_call.argument, method_call.IsLastCall());
         break;
     }
     }
 }
 
-void KeplerMemory::ProcessData(u32 data) {
-    ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported");
-    ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0);
-
-    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
-    // We do this before actually writing the new data because the destination address might
-    // contain a dirty surface that will have to be written back to memory.
-    const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)};
-    rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32));
-    memory_manager.Write<u32>(address, data);
-
-    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+void KeplerMemory::ProcessExec() {
+    state.write_offset = 0;
+    state.copy_size = regs.line_length_in * regs.line_count;
+    state.inner_buffer.resize(state.copy_size);
+}
 
-    state.write_offset++;
+void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
+    const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset);
+    std::memcpy(&state.inner_buffer[state.write_offset], &regs.data, sub_copy_size);
+    state.write_offset += sub_copy_size;
+    if (is_last_call) {
+        const GPUVAddr address{regs.dest.Address()};
+        if (regs.exec.linear != 0) {
+            memory_manager.WriteBlock(address, state.inner_buffer.data(), state.copy_size);
+        } else {
+            UNIMPLEMENTED_IF(regs.dest.z != 0);
+            UNIMPLEMENTED_IF(regs.dest.depth != 1);
+            UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
+            UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
+            const std::size_t dst_size = Tegra::Texture::CalculateSize(
+                true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
+            std::vector<u8> tmp_buffer(dst_size);
+            memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
+            Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x,
+                                          regs.dest.y, regs.dest.BlockHeight(), state.copy_size,
+                                          state.inner_buffer.data(), tmp_buffer.data());
+            memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+        }
+        system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+    }
 }
 
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 78b6c3e45..5f892ddad 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -51,7 +52,11 @@ public:
                     u32 address_high;
                     u32 address_low;
                     u32 pitch;
-                    u32 block_dimensions;
+                    union {
+                        BitField<0, 4, u32> block_width;
+                        BitField<4, 4, u32> block_height;
+                        BitField<8, 4, u32> block_depth;
+                    };
                     u32 width;
                     u32 height;
                     u32 depth;
@@ -63,6 +68,18 @@ public:
                         return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                                      address_low);
                     }
+
+                    u32 BlockWidth() const {
+                        return 1U << block_width.Value();
+                    }
+
+                    u32 BlockHeight() const {
+                        return 1U << block_height.Value();
+                    }
+
+                    u32 BlockDepth() const {
+                        return 1U << block_depth.Value();
+                    }
                 } dest;
 
                 struct {
@@ -81,6 +98,8 @@ public:
 
     struct {
         u32 write_offset = 0;
+        u32 copy_size = 0;
+        std::vector<u8> inner_buffer;
     } state{};
 
 private:
@@ -88,7 +107,8 @@ private:
     VideoCore::RasterizerInterface& rasterizer;
     MemoryManager& memory_manager;
 
-    void ProcessData(u32 data);
+    void ProcessExec();
+    void ProcessData(u32 data, bool is_last_call);
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index b198793bc..9780417f2 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -418,7 +418,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
     const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)};
 
     Texture::TICEntry tic_entry;
-    memory_manager.ReadBlock(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
+    memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
 
     ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear ||
                    tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
@@ -439,7 +439,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
     const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)};
 
     Texture::TSCEntry tsc_entry;
-    memory_manager.ReadBlock(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
+    memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
     return tsc_entry;
 }
 
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index fce9733b9..e5b4eadea 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -937,21 +937,34 @@ union Instruction {
     } iset;
 
     union {
-        BitField<8, 2, Register::Size> dest_size;
-        BitField<10, 2, Register::Size> src_size;
-        BitField<12, 1, u64> is_output_signed;
-        BitField<13, 1, u64> is_input_signed;
-        BitField<41, 2, u64> selector;
+        BitField<41, 2, u64> selector; // i2i and i2f only
         BitField<45, 1, u64> negate_a;
         BitField<49, 1, u64> abs_a;
+        BitField<10, 2, Register::Size> src_size;
+        BitField<13, 1, u64> is_input_signed;
+        BitField<8, 2, Register::Size> dst_size;
+        BitField<12, 1, u64> is_output_signed;
+
+        union {
+            BitField<39, 2, u64> tab5cb8_2;
+        } i2f;
 
         union {
             BitField<39, 2, F2iRoundingOp> rounding;
         } f2i;
 
         union {
-            BitField<39, 4, F2fRoundingOp> rounding;
+            BitField<8, 2, Register::Size> src_size;
+            BitField<10, 2, Register::Size> dst_size;
+            BitField<39, 4, u64> rounding;
+            // H0, H1 extract for F16 missing
+            BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
+            F2fRoundingOp GetRoundingMode() const {
+                constexpr u64 rounding_mask = 0x0B;
+                return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask);
+            }
         } f2f;
+
     } conversion;
 
     union {
@@ -1734,7 +1747,7 @@ private:
             INST("0011100-00101---", Id::SHR_IMM, Type::Shift, "SHR_IMM"),
             INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"),
             INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"),
-            INST("01110001-1000---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
+            INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
             INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"),
             INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"),
             INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"),
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index de30ea354..fe6628923 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -207,6 +207,11 @@ public:
         };
     } regs{};
 
+    /// Performs any additional setup necessary in order to begin GPU emulation.
+    /// This can be used to launch any necessary threads and register any necessary
+    /// core timing events.
+    virtual void Start() = 0;
+
     /// Push GPU command entries to be processed
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index db507cf04..d4e2553a9 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -9,10 +9,14 @@
 namespace VideoCommon {
 
 GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : Tegra::GPU(system, renderer), gpu_thread{system, renderer, *dma_pusher} {}
+    : GPU(system, renderer), gpu_thread{system} {}
 
 GPUAsynch::~GPUAsynch() = default;
 
+void GPUAsynch::Start() {
+    gpu_thread.StartThread(renderer, *dma_pusher);
+}
+
 void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
     gpu_thread.SubmitList(std::move(entries));
 }
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 1dcc61a6c..30be74cba 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -13,16 +13,13 @@ class RendererBase;
 
 namespace VideoCommon {
 
-namespace GPUThread {
-class ThreadManager;
-} // namespace GPUThread
-
 /// Implementation of GPU interface that runs the GPU asynchronously
 class GPUAsynch : public Tegra::GPU {
 public:
     explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer);
     ~GPUAsynch() override;
 
+    void Start() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(
         std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 2cfc900ed..45e43b1dc 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -8,10 +8,12 @@
 namespace VideoCommon {
 
 GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : Tegra::GPU(system, renderer) {}
+    : GPU(system, renderer) {}
 
 GPUSynch::~GPUSynch() = default;
 
+void GPUSynch::Start() {}
+
 void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
     dma_pusher->Push(std::move(entries));
     dma_pusher->DispatchCalls();
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 766b5631c..3031fcf72 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -18,6 +18,7 @@ public:
     explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer);
     ~GPUSynch() override;
 
+    void Start() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(
         std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index cc56cf467..c9a2077de 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -55,19 +55,24 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
     }
 }
 
-ThreadManager::ThreadManager(Core::System& system, VideoCore::RendererBase& renderer,
-                             Tegra::DmaPusher& dma_pusher)
-    : system{system}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)} {
-    synchronization_event = system.CoreTiming().RegisterEvent(
-        "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
-}
+ThreadManager::ThreadManager(Core::System& system) : system{system} {}
 
 ThreadManager::~ThreadManager() {
+    if (!thread.joinable()) {
+        return;
+    }
+
     // Notify GPU thread that a shutdown is pending
     PushCommand(EndProcessingCommand());
     thread.join();
 }
 
+void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) {
+    thread = std::thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)};
+    synchronization_event = system.CoreTiming().RegisterEvent(
+        "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
+}
+
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
     const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))};
     const s64 synchronization_ticks{Core::Timing::usToCycles(9000)};
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 62bcea5bb..cc14527c7 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -138,10 +138,12 @@ struct SynchState final {
 /// Class used to manage the GPU thread
 class ThreadManager final {
 public:
-    explicit ThreadManager(Core::System& system, VideoCore::RendererBase& renderer,
-                           Tegra::DmaPusher& dma_pusher);
+    explicit ThreadManager(Core::System& system);
     ~ThreadManager();
 
+    /// Creates and starts the GPU thread.
+    void StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
+
     /// Push GPU command entries to be processed
     void SubmitList(Tegra::CommandList&& entries);
 
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 0f4e820aa..6c98c6701 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -199,7 +199,15 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const {
     return {};
 }
 
-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const {
+bool MemoryManager::IsBlockContinous(const GPUVAddr start, const std::size_t size) {
+    const GPUVAddr end = start + size;
+    const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start));
+    const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end));
+    const std::size_t range = static_cast<std::size_t>(host_ptr_end - host_ptr_start);
+    return range == size;
+}
+
+void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
     std::size_t remaining_size{size};
     std::size_t page_index{src_addr >> page_bits};
     std::size_t page_offset{src_addr & page_mask};
@@ -226,7 +234,30 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t
     }
 }
 
-void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) {
+void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
+                                    const std::size_t size) const {
+    std::size_t remaining_size{size};
+    std::size_t page_index{src_addr >> page_bits};
+    std::size_t page_offset{src_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+        const u8* page_pointer = page_table.pointers[page_index];
+        if (page_pointer) {
+            const u8* src_ptr{page_pointer + page_offset};
+            std::memcpy(dest_buffer, src_ptr, copy_amount);
+        } else {
+            std::memset(dest_buffer, 0, copy_amount);
+        }
+        page_index++;
+        page_offset = 0;
+        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
+}
+
+void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
     std::size_t remaining_size{size};
     std::size_t page_index{dest_addr >> page_bits};
     std::size_t page_offset{dest_addr & page_mask};
@@ -253,7 +284,28 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::
     }
 }
 
-void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) {
+void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
+                                     const std::size_t size) {
+    std::size_t remaining_size{size};
+    std::size_t page_index{dest_addr >> page_bits};
+    std::size_t page_offset{dest_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+        u8* page_pointer = page_table.pointers[page_index];
+        if (page_pointer) {
+            u8* dest_ptr{page_pointer + page_offset};
+            std::memcpy(dest_ptr, src_buffer, copy_amount);
+        }
+        page_index++;
+        page_offset = 0;
+        src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
+}
+
+void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
     std::size_t remaining_size{size};
     std::size_t page_index{src_addr >> page_bits};
     std::size_t page_offset{src_addr & page_mask};
@@ -281,6 +333,12 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t
     }
 }
 
+void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+    std::vector<u8> tmp_buffer(size);
+    ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
+    WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
+}
+
 void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type,
                              VAddr backing_addr) {
     LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size,
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 647cbf93a..e4f0c4bd6 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -65,9 +65,32 @@ public:
     u8* GetPointer(GPUVAddr addr);
     const u8* GetPointer(GPUVAddr addr) const;
 
-    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    // Returns true if the block is contiguous in host memory, false otherwise
+    bool IsBlockContinous(const GPUVAddr start, const std::size_t size);
+
+    /**
+     * ReadBlock and WriteBlock are full read and write operations over virtual
+     * GPU Memory. It's important to use these when GPU memory may not be contiguous
+     * in the Host Memory counterpart. Note: These functions cause Host GPU Memory
+     * Flushes and Invalidations, respectively, for each operation.
+     */
+    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
+    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
+    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
+
+    /**
+     * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
+     * WriteBlock respectively. In these versions, no flushing or invalidation is actually
+     * done and their performance is similar to a memcpy. These functions can be used
+     * in either of these 2 scenarios instead of their safe counterparts:
+     * - Memory which is sure to never be represented in the Host GPU.
+     * - Memory Managed by a Cache Manager. Example: Texture Flushing should use
+     * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
+     * being flushed.
+     */
+    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
 
 private:
     using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>;
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
new file mode 100644
index 000000000..b6d9e0ddb
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -0,0 +1,45 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstddef>
+#include <glad/glad.h>
+
+#include "common/logging/log.h"
+#include "video_core/renderer_opengl/gl_device.h"
+
+namespace OpenGL {
+
+namespace {
+template <typename T>
+T GetInteger(GLenum pname) {
+    GLint temporary;
+    glGetIntegerv(pname, &temporary);
+    return static_cast<T>(temporary);
+}
+} // Anonymous namespace
+
+Device::Device() {
+    uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
+    has_variable_aoffi = TestVariableAoffi();
+}
+
+bool Device::TestVariableAoffi() {
+    const GLchar* AOFFI_TEST = R"(#version 430 core
+uniform sampler2D tex;
+uniform ivec2 variable_offset;
+void main() {
+    gl_Position = textureOffset(tex, vec2(0), variable_offset);
+}
+)";
+    const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)};
+    GLint link_status{};
+    glGetProgramiv(shader, GL_LINK_STATUS, &link_status);
+    glDeleteProgram(shader);
+
+    const bool supported{link_status == GL_TRUE};
+    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported);
+    return supported;
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
new file mode 100644
index 000000000..78ff5ee58
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -0,0 +1,30 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+
+namespace OpenGL {
+
+class Device {
+public:
+    Device();
+
+    std::size_t GetUniformBufferAlignment() const {
+        return uniform_buffer_alignment;
+    }
+
+    bool HasVariableAoffi() const {
+        return has_variable_aoffi;
+    }
+
+private:
+    static bool TestVariableAoffi();
+
+    std::size_t uniform_buffer_alignment{};
+    bool has_variable_aoffi{};
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 6034dc489..9a088a503 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -99,7 +99,7 @@ struct FramebufferCacheKey {
 };
 
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
-    : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system},
+    : res_cache{*this}, shader_cache{*this, system, device}, global_cache{*this}, system{system},
       screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
     OpenGLState::ApplyDefaultState();
 
@@ -107,8 +107,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
     state.draw.shader_program = 0;
     state.Apply();
 
-    glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
-
     LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");
     CheckExtensions();
 }
@@ -315,8 +313,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu, stage);
-        const GLintptr offset = buffer_cache.UploadHostMemory(
-            &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment));
+        const GLintptr offset =
+            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
 
         // Bind the emulation info buffer
         bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset,
@@ -700,23 +698,24 @@ void RasterizerOpenGL::DrawArrays() {
     // Add space for index buffer (keeping in mind non-core primitives)
     switch (regs.draw.topology) {
     case Maxwell::PrimitiveTopology::Quads:
-        buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
+        buffer_size = Common::AlignUp(buffer_size, 4) +
                       primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count);
         break;
     default:
         if (is_indexed) {
-            buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + CalculateIndexBufferSize();
+            buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
         }
         break;
     }
 
     // Uniform space for the 5 shader stages
-    buffer_size =
-        Common::AlignUp<std::size_t>(buffer_size, 4) +
-        (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage;
+    buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
+                  (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
+                      Maxwell::MaxShaderStage;
 
     // Add space for at least 18 constant buffers
-    buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);
+    buffer_size +=
+        Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
 
     const bool invalidate = buffer_cache.Map(buffer_size);
     if (invalidate) {
@@ -848,8 +847,8 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader
         size = Common::AlignUp(size, sizeof(GLvec4));
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
-        const GLintptr const_buffer_offset = buffer_cache.UploadMemory(
-            buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment));
+        const GLintptr const_buffer_offset =
+            buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
 
         bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size);
     }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index a0e056142..71b9c5ead 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -21,6 +21,7 @@
 #include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_global_cache.h"
 #include "video_core/renderer_opengl/gl_primitive_assembler.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
@@ -172,6 +173,7 @@ private:
     /// but are needed for correct emulation
     void CheckExtensions();
 
+    const Device device;
     OpenGLState state;
 
     RasterizerCacheOpenGL res_cache;
@@ -180,7 +182,6 @@ private:
     SamplerCacheOpenGL sampler_cache;
 
     Core::System& system;
-
     ScreenInfo& screen_info;
 
     std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
@@ -196,7 +197,6 @@ private:
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
     PrimitiveAssembler primitive_assembler{buffer_cache};
-    GLint uniform_buffer_alignment;
 
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 7a68b8738..5a25f5b37 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -640,13 +640,16 @@ void CachedSurface::LoadGLBuffer() {
             SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i);
     } else {
         const u32 bpp = params.GetFormatBpp() / 8;
-        const u32 copy_size = params.width * bpp;
+        const u32 copy_size = (params.width * bpp + GetDefaultBlockWidth(params.pixel_format) - 1) /
+                              GetDefaultBlockWidth(params.pixel_format);
         if (params.pitch == copy_size) {
             std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl);
         } else {
+            const u32 height = (params.height + GetDefaultBlockHeight(params.pixel_format) - 1) /
+                               GetDefaultBlockHeight(params.pixel_format);
             const u8* start{params.host_ptr};
             u8* write_to = gl_buffer[0].data();
-            for (u32 h = params.height; h > 0; h--) {
+            for (u32 h = height; h > 0; h--) {
                 std::memcpy(write_to, start, copy_size);
                 start += params.pitch;
                 write_to += copy_size;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 99f67494c..2a81b1169 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -38,13 +38,15 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
 }
 
 /// Gets the shader program code from memory for the specified address
-ProgramCode GetShaderCode(const u8* host_ptr) {
+ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr,
+                          const u8* host_ptr) {
     ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
     ASSERT_OR_EXECUTE(host_ptr != nullptr, {
         std::fill(program_code.begin(), program_code.end(), 0);
         return program_code;
     });
-    std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));
+    memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(),
+                                   program_code.size() * sizeof(u64));
     return program_code;
 }
 
@@ -134,8 +136,8 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
 }
 
 /// Creates an unspecialized program from code streams
-GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, ProgramCode program_code,
-                                      ProgramCode program_code_b) {
+GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
+                                      ProgramCode program_code, ProgramCode program_code_b) {
     GLShader::ShaderSetup setup(program_code);
     if (program_type == Maxwell::ShaderProgram::VertexA) {
         // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
@@ -149,11 +151,11 @@ GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, Progr
     switch (program_type) {
     case Maxwell::ShaderProgram::VertexA:
     case Maxwell::ShaderProgram::VertexB:
-        return GLShader::GenerateVertexShader(setup);
+        return GLShader::GenerateVertexShader(device, setup);
     case Maxwell::ShaderProgram::Geometry:
-        return GLShader::GenerateGeometryShader(setup);
+        return GLShader::GenerateGeometryShader(device, setup);
     case Maxwell::ShaderProgram::Fragment:
-        return GLShader::GenerateFragmentShader(setup);
+        return GLShader::GenerateFragmentShader(device, setup);
     default:
         LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
         UNREACHABLE();
@@ -212,22 +214,20 @@ std::set<GLenum> GetSupportedFormats() {
     return supported_formats;
 }
 
-} // namespace
+} // Anonymous namespace
 
-CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
+CachedShader::CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier,
                            Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                            const PrecompiledPrograms& precompiled_programs,
                            ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr)
     : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr},
       unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache},
       precompiled_programs{precompiled_programs} {
-
-    const std::size_t code_size = CalculateProgramSize(program_code);
-    const std::size_t code_size_b =
-        program_code_b.empty() ? 0 : CalculateProgramSize(program_code_b);
-
-    GLShader::ProgramResult program_result =
-        CreateProgram(program_type, program_code, program_code_b);
+    const std::size_t code_size{CalculateProgramSize(program_code)};
+    const std::size_t code_size_b{program_code_b.empty() ? 0
+                                                         : CalculateProgramSize(program_code_b)};
+    GLShader::ProgramResult program_result{
+        CreateProgram(device, program_type, program_code, program_code_b)};
     if (program_result.first.empty()) {
         // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
         return;
@@ -251,7 +251,6 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
     : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier},
       program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{
                                                               precompiled_programs} {
-
     code = std::move(result.first);
     entries = result.second;
     shader_length = entries.shader_length;
@@ -344,8 +343,9 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode,
     return {unique_identifier, base_bindings, primitive_mode};
 }
 
-ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system)
-    : RasterizerCache{rasterizer}, disk_cache{system} {}
+ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
+                                     const Device& device)
+    : RasterizerCache{rasterizer}, disk_cache{system}, device{device} {}
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -439,17 +439,18 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
     const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) {
     std::unordered_map<u64, UnspecializedShader> unspecialized;
 
-    if (callback)
+    if (callback) {
         callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size());
+    }
 
     for (std::size_t i = 0; i < raws.size(); ++i) {
-        if (stop_loading)
+        if (stop_loading) {
             return {};
-
+        }
         const auto& raw{raws[i]};
-        const u64 unique_identifier = raw.GetUniqueIdentifier();
-        const u64 calculated_hash =
-            GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB());
+        const u64 unique_identifier{raw.GetUniqueIdentifier()};
+        const u64 calculated_hash{
+            GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB())};
         if (unique_identifier != calculated_hash) {
             LOG_ERROR(
                 Render_OpenGL,
@@ -466,8 +467,8 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
             result = {stored_decompiled.code, stored_decompiled.entries};
         } else {
             // Otherwise decompile the shader at boot and save the result to the decompiled file
-            result =
-                CreateProgram(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB());
+            result = CreateProgram(device, raw.GetProgramType(), raw.GetProgramCode(),
+                                   raw.GetProgramCodeB());
             disk_cache.SaveDecompiled(unique_identifier, result.first, result.second);
         }
 
@@ -477,8 +478,9 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
             {raw.GetUniqueIdentifier(),
              {std::move(result.first), std::move(result.second), raw.GetProgramType()}});
 
-        if (callback)
+        if (callback) {
             callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size());
+        }
     }
     return unspecialized;
 }
@@ -497,11 +499,12 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     if (!shader) {
         // No shader found - create a new one
-        ProgramCode program_code{GetShaderCode(host_ptr)};
+        ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
         ProgramCode program_code_b;
         if (program == Maxwell::ShaderProgram::VertexA) {
-            program_code_b = GetShaderCode(
-                memory_manager.GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB)));
+            const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)};
+            program_code_b = GetShaderCode(memory_manager, program_addr_b,
+                                           memory_manager.GetPointer(program_addr_b));
         }
         const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
         const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
@@ -512,7 +515,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
                                                precompiled_programs, found->second, host_ptr);
         } else {
             shader = std::make_shared<CachedShader>(
-                cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
+                device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
                 std::move(program_code), std::move(program_code_b), host_ptr);
         }
         Register(shader);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 0cf8e0b3d..a332087f8 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -27,6 +27,7 @@ class System;
 namespace OpenGL {
 
 class CachedShader;
+class Device;
 class RasterizerOpenGL;
 struct UnspecializedShader;
 
@@ -38,7 +39,7 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;
 
 class CachedShader final : public RasterizerCacheObject {
 public:
-    explicit CachedShader(VAddr cpu_addr, u64 unique_identifier,
+    explicit CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier,
                           Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
                           const PrecompiledPrograms& precompiled_programs,
                           ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr);
@@ -112,7 +113,8 @@ private:
 
 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
 public:
-    explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system);
+    explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
+                               const Device& device);
 
     /// Loads disk cache for the current game
     void LoadDiskCache(const std::atomic_bool& stop_loading,
@@ -130,6 +132,8 @@ private:
     CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
                                              const std::set<GLenum>& supported_formats);
 
+    const Device& device;
+
     std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
 
     ShaderDiskCacheOpenGL disk_cache;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 445048daf..ef1a1995f 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -15,6 +15,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/shader/shader_ir.h"
@@ -119,14 +120,10 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
 
 /// Returns true if an object has to be treated as precise
 bool IsPrecise(Operation operand) {
-    const auto& meta = operand.GetMeta();
-
+    const auto& meta{operand.GetMeta()};
     if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) {
         return arithmetic->precise;
     }
-    if (const auto half_arithmetic = std::get_if<MetaHalfArithmetic>(&meta)) {
-        return half_arithmetic->precise;
-    }
     return false;
 }
 
@@ -139,8 +136,9 @@ bool IsPrecise(Node node) {
 
 class GLSLDecompiler final {
 public:
-    explicit GLSLDecompiler(const ShaderIR& ir, ShaderStage stage, std::string suffix)
-        : ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
+    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
+                            std::string suffix)
+        : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
 
     void Decompile() {
         DeclareVertex();
@@ -627,28 +625,7 @@ private:
     }
 
     std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) {
-        std::string value = VisitOperand(operation, operand_index);
-        switch (type) {
-        case Type::HalfFloat: {
-            const auto half_meta = std::get_if<MetaHalfArithmetic>(&operation.GetMeta());
-            if (!half_meta) {
-                value = "toHalf2(" + value + ')';
-            }
-
-            switch (half_meta->types.at(operand_index)) {
-            case Tegra::Shader::HalfType::H0_H1:
-                return "toHalf2(" + value + ')';
-            case Tegra::Shader::HalfType::F32:
-                return "vec2(" + value + ')';
-            case Tegra::Shader::HalfType::H0_H0:
-                return "vec2(toHalf2(" + value + ")[0])";
-            case Tegra::Shader::HalfType::H1_H1:
-                return "vec2(toHalf2(" + value + ")[1])";
-            }
-        }
-        default:
-            return CastOperand(value, type);
-        }
+        return CastOperand(VisitOperand(operation, operand_index), type);
     }
 
     std::string CastOperand(const std::string& value, Type type) const {
@@ -662,9 +639,7 @@ private:
         case Type::Uint:
             return "ftou(" + value + ')';
         case Type::HalfFloat:
-            // Can't be handled as a stand-alone value
-            UNREACHABLE();
-            return value;
+            return "toHalf2(" + value + ')';
         }
         UNREACHABLE();
         return value;
@@ -829,8 +804,12 @@ private:
                 // Inline the string as an immediate integer in GLSL (AOFFI arguments are required
                 // to be constant by the standard).
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
-            } else {
+            } else if (device.HasVariableAoffi()) {
+                // The device supports variable AOFFI: pass the offset as a runtime integer.
                 expr += "ftoi(" + Visit(operand) + ')';
+            } else {
+                // Insert 0 on devices not supporting variable AOFFI.
+                expr += '0';
             }
             if (index + 1 < aoffi.size()) {
                 expr += ", ";
@@ -1083,13 +1062,40 @@ private:
         return BitwiseCastResult(value, Type::HalfFloat);
     }
 
+    std::string HClamp(Operation operation) {
+        const std::string value = VisitOperand(operation, 0, Type::HalfFloat);
+        const std::string min = VisitOperand(operation, 1, Type::Float);
+        const std::string max = VisitOperand(operation, 2, Type::Float);
+        const std::string clamped = "clamp(" + value + ", vec2(" + min + "), vec2(" + max + "))";
+        return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
+    }
+
+    std::string HUnpack(Operation operation) {
+        const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)};
+        const auto value = [&]() -> std::string {
+            switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
+            case Tegra::Shader::HalfType::H0_H1:
+                return operand;
+            case Tegra::Shader::HalfType::F32:
+                return "vec2(fromHalf2(" + operand + "))";
+            case Tegra::Shader::HalfType::H0_H0:
+                return "vec2(" + operand + "[0])";
+            case Tegra::Shader::HalfType::H1_H1:
+                return "vec2(" + operand + "[1])";
+            }
+            UNREACHABLE();
+            return "0";
+        }();
+        return "fromHalf2(" + value + ')';
+    }
+
     std::string HMergeF32(Operation operation) {
         return "float(toHalf2(" + Visit(operation[0]) + ")[0])";
     }
 
     std::string HMergeH0(Operation operation) {
-        return "fromHalf2(vec2(toHalf2(" + Visit(operation[0]) + ")[1], toHalf2(" +
-               Visit(operation[1]) + ")[0]))";
+        return "fromHalf2(vec2(toHalf2(" + Visit(operation[1]) + ")[0], toHalf2(" +
+               Visit(operation[0]) + ")[1]))";
     }
 
     std::string HMergeH1(Operation operation) {
@@ -1189,34 +1195,46 @@ private:
         return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
     }
 
+    template <bool with_nan>
+    std::string GenerateHalfComparison(Operation operation, std::string compare_op) {
+        std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
+                                                  Type::HalfFloat, Type::HalfFloat)};
+        if constexpr (!with_nan) {
+            return comparison;
+        }
+        return "halfFloatNanComparison(" + comparison + ", " +
+               VisitOperand(operation, 0, Type::HalfFloat) + ", " +
+               VisitOperand(operation, 1, Type::HalfFloat) + ')';
+    }
+
+    template <bool with_nan>
     std::string Logical2HLessThan(Operation operation) {
-        return GenerateBinaryCall(operation, "lessThan", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "lessThan");
     }
 
+    template <bool with_nan>
     std::string Logical2HEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "equal", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "equal");
     }
 
+    template <bool with_nan>
     std::string Logical2HLessEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "lessThanEqual", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "lessThanEqual");
     }
 
+    template <bool with_nan>
     std::string Logical2HGreaterThan(Operation operation) {
-        return GenerateBinaryCall(operation, "greaterThan", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "greaterThan");
     }
 
+    template <bool with_nan>
     std::string Logical2HNotEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "notEqual", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "notEqual");
     }
 
+    template <bool with_nan>
     std::string Logical2HGreaterEqual(Operation operation) {
-        return GenerateBinaryCall(operation, "greaterThanEqual", Type::Bool2, Type::HalfFloat,
-                                  Type::HalfFloat);
+        return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual");
     }
 
     std::string Texture(Operation operation) {
@@ -1505,6 +1523,8 @@ private:
         &GLSLDecompiler::Fma<Type::HalfFloat>,
         &GLSLDecompiler::Absolute<Type::HalfFloat>,
         &GLSLDecompiler::HNegate,
+        &GLSLDecompiler::HClamp,
+        &GLSLDecompiler::HUnpack,
         &GLSLDecompiler::HMergeF32,
         &GLSLDecompiler::HMergeH0,
         &GLSLDecompiler::HMergeH1,
@@ -1541,12 +1561,18 @@ private:
         &GLSLDecompiler::LogicalNotEqual<Type::Uint>,
         &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>,
 
-        &GLSLDecompiler::Logical2HLessThan,
-        &GLSLDecompiler::Logical2HEqual,
-        &GLSLDecompiler::Logical2HLessEqual,
-        &GLSLDecompiler::Logical2HGreaterThan,
-        &GLSLDecompiler::Logical2HNotEqual,
-        &GLSLDecompiler::Logical2HGreaterEqual,
+        &GLSLDecompiler::Logical2HLessThan<false>,
+        &GLSLDecompiler::Logical2HEqual<false>,
+        &GLSLDecompiler::Logical2HLessEqual<false>,
+        &GLSLDecompiler::Logical2HGreaterThan<false>,
+        &GLSLDecompiler::Logical2HNotEqual<false>,
+        &GLSLDecompiler::Logical2HGreaterEqual<false>,
+        &GLSLDecompiler::Logical2HLessThan<true>,
+        &GLSLDecompiler::Logical2HEqual<true>,
+        &GLSLDecompiler::Logical2HLessEqual<true>,
+        &GLSLDecompiler::Logical2HGreaterThan<true>,
+        &GLSLDecompiler::Logical2HNotEqual<true>,
+        &GLSLDecompiler::Logical2HGreaterEqual<true>,
 
         &GLSLDecompiler::Texture,
         &GLSLDecompiler::TextureLod,
@@ -1625,6 +1651,7 @@ private:
         return name + '_' + std::to_string(index) + '_' + suffix;
     }
 
+    const Device& device;
     const ShaderIR& ir;
     const ShaderStage stage;
     const std::string suffix;
@@ -1647,11 +1674,18 @@ std::string GetCommonDeclarations() {
            "}\n\n"
            "vec2 toHalf2(float value) {\n"
            "    return unpackHalf2x16(ftou(value));\n"
+           "}\n\n"
+           "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {\n"
+           "    bvec2 is_nan1 = isnan(pair1);\n"
+           "    bvec2 is_nan2 = isnan(pair2);\n"
+           "    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || "
+           "is_nan2.y);\n"
            "}\n";
 }
 
-ProgramResult Decompile(const ShaderIR& ir, Maxwell::ShaderStage stage, const std::string& suffix) {
-    GLSLDecompiler decompiler(ir, stage, suffix);
+ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
+                        const std::string& suffix) {
+    GLSLDecompiler decompiler(device, ir, stage, suffix);
     decompiler.Decompile();
     return {decompiler.GetResult(), decompiler.GetShaderEntries()};
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 74032d237..c1569e737 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -12,6 +12,10 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/shader/shader_ir.h"
 
+namespace OpenGL {
+class Device;
+}
+
 namespace VideoCommon::Shader {
 class ShaderIR;
 }
@@ -77,7 +81,7 @@ struct ShaderEntries {
 
 std::string GetCommonDeclarations();
 
-ProgramResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage,
-                        const std::string& suffix);
+ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                        Maxwell::ShaderStage stage, const std::string& suffix);
 
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 8763d9c71..6abf948f8 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -16,7 +16,7 @@ using VideoCommon::Shader::ShaderIR;
 
 static constexpr u32 PROGRAM_OFFSET{10};
 
-ProgramResult GenerateVertexShader(const ShaderSetup& setup) {
+ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -34,14 +34,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 
 )";
     ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
-    ProgramResult program = Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
+    ProgramResult program =
+        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
 
     out += program.first;
 
     if (setup.IsDualProgram()) {
         ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
         ProgramResult program_b =
-            Decompile(program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
+            Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
 
         out += program_b.first;
     }
@@ -57,6 +58,9 @@ void main() {
     }
 
     out += R"(
+
+    // Set Position Y direction
+    position.y *= utof(config_pack[2]);
     // Check if the flip stage is VertexB
     // Config pack's second value is flip_stage
     if (config_pack[1] == 1) {
@@ -75,7 +79,7 @@ void main() {
     return {out, program.second};
 }
 
-ProgramResult GenerateGeometryShader(const ShaderSetup& setup) {
+ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -95,7 +99,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 )";
     ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
-        Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
+        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
     out += program.first;
 
     out += R"(
@@ -106,7 +110,7 @@ void main() {
     return {out, program.second};
 }
 
-ProgramResult GenerateFragmentShader(const ShaderSetup& setup) {
+ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -158,7 +162,7 @@ bool AlphaFunc(in float value) {
 )";
     ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
-        Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
+        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
 
     out += program.first;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index fad346b48..0536c8a03 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -10,6 +10,10 @@
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/shader/shader_ir.h"
 
+namespace OpenGL {
+class Device;
+}
+
 namespace OpenGL::GLShader {
 
 using VideoCommon::Shader::ProgramCode;
@@ -39,22 +43,13 @@ private:
     bool has_program_b{};
 };
 
-/**
- * Generates the GLSL vertex shader program source code for the given VS program
- * @returns String of the shader source code
- */
-ProgramResult GenerateVertexShader(const ShaderSetup& setup);
-
-/**
- * Generates the GLSL geometry shader program source code for the given GS program
- * @returns String of the shader source code
- */
-ProgramResult GenerateGeometryShader(const ShaderSetup& setup);
-
-/**
- * Generates the GLSL fragment shader program source code for the given FS program
- * @returns String of the shader source code
- */
-ProgramResult GenerateFragmentShader(const ShaderSetup& setup);
+/// Generates the GLSL vertex shader program source code for the given VS program
+ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup);
+
+/// Generates the GLSL geometry shader program source code for the given GS program
+ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup);
+
+/// Generates the GLSL fragment shader program source code for the given FS program
+ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);
 
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 25500f9a3..23d9b10db 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -76,14 +76,10 @@ constexpr u32 GetGenericAttributeLocation(Attribute::Index attribute) {
 
 /// Returns true if an object has to be treated as precise
 bool IsPrecise(Operation operand) {
-    const auto& meta = operand.GetMeta();
-
+    const auto& meta{operand.GetMeta()};
     if (std::holds_alternative<MetaArithmetic>(meta)) {
         return std::get<MetaArithmetic>(meta).precise;
     }
-    if (std::holds_alternative<MetaHalfArithmetic>(meta)) {
-        return std::get<MetaHalfArithmetic>(meta).precise;
-    }
     return false;
 }
 
@@ -746,6 +742,16 @@ private:
         return {};
     }
 
+    Id HClamp(Operation operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id HUnpack(Operation operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id HMergeF32(Operation operation) {
         UNIMPLEMENTED();
         return {};
@@ -1218,6 +1224,8 @@ private:
         &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>,
         &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>,
         &SPIRVDecompiler::HNegate,
+        &SPIRVDecompiler::HClamp,
+        &SPIRVDecompiler::HUnpack,
         &SPIRVDecompiler::HMergeF32,
         &SPIRVDecompiler::HMergeH0,
         &SPIRVDecompiler::HMergeH1,
@@ -1260,6 +1268,13 @@ private:
         &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>,
         &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>,
         &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>,
+        // TODO(Rodrigo): Should these use the OpFUnord* variants?
+        &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>,
+        &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>,
 
         &SPIRVDecompiler::Texture,
         &SPIRVDecompiler::TextureLod,
diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp
index baee89107..9467f9417 100644
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -18,7 +18,9 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
 
     if (opcode->get().GetId() == OpCode::Id::HADD2_C ||
         opcode->get().GetId() == OpCode::Id::HADD2_R) {
-        UNIMPLEMENTED_IF(instr.alu_half.ftz != 0);
+        if (instr.alu_half.ftz != 0) {
+            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        }
     }
     UNIMPLEMENTED_IF_MSG(instr.alu_half.saturate != 0, "Half float saturation not implemented");
 
@@ -27,9 +29,8 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
     const bool negate_b =
         opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0;
 
-    const Node op_a = GetOperandAbsNegHalf(GetRegister(instr.gpr8), instr.alu_half.abs_a, negate_a);
-
-    // instr.alu_half.type_a
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a);
 
     Node op_b = [&]() {
         switch (opcode->get().GetId()) {
@@ -44,17 +45,17 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
             return Immediate(0);
         }
     }();
+    op_b = UnpackHalfFloat(op_b, instr.alu_half.type_b);
     op_b = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b);
 
     Node value = [&]() {
-        MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a, instr.alu_half.type_b}};
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_C:
         case OpCode::Id::HADD2_R:
-            return Operation(OperationCode::HAdd, meta, op_a, op_b);
+            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b);
         case OpCode::Id::HMUL2_C:
         case OpCode::Id::HMUL2_R:
-            return Operation(OperationCode::HMul, meta, op_a, op_b);
+            return Operation(OperationCode::HMul, PRECISE, op_a, op_b);
         default:
             UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName());
             return Immediate(0);
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index c2164ba50..fbcd35b18 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -17,34 +17,33 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) {
-        UNIMPLEMENTED_IF(instr.alu_half_imm.ftz != 0);
+        if (instr.alu_half_imm.ftz != 0) {
+            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        }
     } else {
         UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
     }
-    UNIMPLEMENTED_IF_MSG(instr.alu_half_imm.saturate != 0,
-                         "Half float immediate saturation not implemented");
 
-    Node op_a = GetRegister(instr.gpr8);
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);
     op_a = GetOperandAbsNegHalf(op_a, instr.alu_half_imm.abs_a, instr.alu_half_imm.negate_a);
 
     const Node op_b = UnpackHalfImmediate(instr, true);
 
     Node value = [&]() {
-        MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a}};
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_IMM:
-            return Operation(OperationCode::HAdd, meta, op_a, op_b);
+            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b);
         case OpCode::Id::HMUL2_IMM:
-            return Operation(OperationCode::HMul, meta, op_a, op_b);
+            return Operation(OperationCode::HMul, PRECISE, op_a, op_b);
         default:
             UNREACHABLE();
             return Immediate(0);
         }
     }();
-    value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge);
 
+    value = GetSaturatedHalfFloat(value, instr.alu_half_imm.saturate);
+    value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge);
     SetRegister(bb, instr.gpr0, value);
-
     return pc;
 }
 
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 55a6fbbf2..ba15b1115 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -18,13 +18,29 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     switch (opcode->get().GetId()) {
-    case OpCode::Id::I2I_R: {
+    case OpCode::Id::I2I_R:
+    case OpCode::Id::I2I_C:
+    case OpCode::Id::I2I_IMM: {
         UNIMPLEMENTED_IF(instr.conversion.selector);
+        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.alu.saturate_d);
 
         const bool input_signed = instr.conversion.is_input_signed;
         const bool output_signed = instr.conversion.is_output_signed;
 
-        Node value = GetRegister(instr.gpr20);
+        Node value = [&]() {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::I2I_R:
+                return GetRegister(instr.gpr20);
+            case OpCode::Id::I2I_C:
+                return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::I2I_IMM:
+                return Immediate(instr.alu.GetSignedImm20_20());
+            default:
+                UNREACHABLE();
+                return Immediate(0);
+            }
+        }();
         value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
 
         value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a,
@@ -38,17 +54,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::I2F_R:
-    case OpCode::Id::I2F_C: {
-        UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word);
+    case OpCode::Id::I2F_C:
+    case OpCode::Id::I2F_IMM: {
+        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
         UNIMPLEMENTED_IF(instr.conversion.selector);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in I2F is not implemented");
 
         Node value = [&]() {
-            if (instr.is_b_gpr) {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::I2F_R:
                 return GetRegister(instr.gpr20);
-            } else {
+            case OpCode::Id::I2F_C:
                 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::I2F_IMM:
+                return Immediate(instr.alu.GetSignedImm20_20());
+            default:
+                UNREACHABLE();
+                return Immediate(0);
             }
         }();
         const bool input_signed = instr.conversion.is_input_signed;
@@ -62,24 +85,31 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::F2F_R:
-    case OpCode::Id::F2F_C: {
-        UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word);
-        UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
+    case OpCode::Id::F2F_C:
+    case OpCode::Id::F2F_IMM: {
+        UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2F is not implemented");
 
         Node value = [&]() {
-            if (instr.is_b_gpr) {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::F2F_R:
                 return GetRegister(instr.gpr20);
-            } else {
+            case OpCode::Id::F2F_C:
                 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::F2F_IMM:
+                return GetImmediate19(instr);
+            default:
+                UNREACHABLE();
+                return Immediate(0);
             }
         }();
 
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
 
         value = [&]() {
-            switch (instr.conversion.f2f.rounding) {
+            switch (instr.conversion.f2f.GetRoundingMode()) {
             case Tegra::Shader::F2fRoundingOp::None:
                 return value;
             case Tegra::Shader::F2fRoundingOp::Round:
@@ -102,15 +132,22 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::F2I_R:
-    case OpCode::Id::F2I_C: {
+    case OpCode::Id::F2I_C:
+    case OpCode::Id::F2I_IMM: {
         UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2I is not implemented");
         Node value = [&]() {
-            if (instr.is_b_gpr) {
+            switch (opcode->get().GetId()) {
+            case OpCode::Id::F2I_R:
                 return GetRegister(instr.gpr20);
-            } else {
+            case OpCode::Id::F2I_C:
                 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+            case OpCode::Id::F2I_IMM:
+                return GetImmediate19(instr);
+            default:
+                UNREACHABLE();
+                return Immediate(0);
             }
         }();
 
@@ -134,7 +171,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         }();
         const bool is_signed = instr.conversion.is_output_signed;
         value = SignedOperation(OperationCode::ICastFloat, is_signed, PRECISE, value);
-        value = ConvertIntegerSize(value, instr.conversion.dest_size, is_signed);
+        value = ConvertIntegerSize(value, instr.conversion.dst_size, is_signed);
 
         SetRegister(bb, instr.gpr0, value);
         break;
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 748368555..1dd94bf9d 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -18,11 +18,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    UNIMPLEMENTED_IF(instr.hset2.ftz != 0);
+    if (instr.hset2.ftz != 0) {
+        LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+    }
+
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
 
-    // instr.hset2.type_a
-    // instr.hset2.type_b
-    Node op_a = GetRegister(instr.gpr8);
     Node op_b = [&]() {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HSET2_R:
@@ -32,14 +34,12 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
             return Immediate(0);
         }
     }();
-
-    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
+    op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
     op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
 
     const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
 
-    MetaHalfArithmetic meta{false, {instr.hset2.type_a, instr.hset2.type_b}};
-    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, meta, op_a, op_b);
+    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
 
     const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
 
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index e68512692..6e59eb650 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -19,10 +19,10 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
 
     UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
 
-    Node op_a = GetRegister(instr.gpr8);
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
     op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
 
-    const Node op_b = [&]() {
+    Node op_b = [&]() {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HSETP2_R:
             return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
@@ -32,6 +32,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
             return Immediate(0);
         }
     }();
+    op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
 
     // We can't use the constant predicate as destination.
     ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
@@ -42,8 +43,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
     const OperationCode pair_combiner =
         instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
 
-    MetaHalfArithmetic meta = {false, {instr.hsetp2.type_a, instr.hsetp2.type_b}};
-    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, meta, op_a, op_b);
+    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
     const Node first_pred = Operation(pair_combiner, comparison);
 
     // Set the primary predicate to the result of Predicate OP SecondPredicate
diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp
index 7a07c5ec6..5c1becce5 100644
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -27,10 +27,6 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
     }
 
     constexpr auto identity = HalfType::H0_H1;
-
-    const HalfType type_a = instr.hfma2.type_a;
-    const Node op_a = GetRegister(instr.gpr8);
-
     bool neg_b{}, neg_c{};
     auto [saturate, type_b, op_b, type_c,
           op_c] = [&]() -> std::tuple<bool, HalfType, Node, HalfType, Node> {
@@ -62,11 +58,11 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
     }();
     UNIMPLEMENTED_IF_MSG(saturate, "HFMA2 saturation is not implemented");
 
-    op_b = GetOperandAbsNegHalf(op_b, false, neg_b);
-    op_c = GetOperandAbsNegHalf(op_c, false, neg_c);
+    const Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hfma2.type_a);
+    op_b = GetOperandAbsNegHalf(UnpackHalfFloat(op_b, type_b), false, neg_b);
+    op_c = GetOperandAbsNegHalf(UnpackHalfFloat(op_c, type_c), false, neg_c);
 
-    MetaHalfArithmetic meta{true, {type_a, type_b, type_c}};
-    Node value = Operation(OperationCode::HFma, meta, op_a, op_b, op_c);
+    Node value = Operation(OperationCode::HFma, PRECISE, op_a, op_b, op_c);
     value = HalfMerge(GetRegister(instr.gpr0), value, instr.hfma2.merge);
 
     SetRegister(bb, instr.gpr0, value);
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index ac5112d78..17f2f711c 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -189,7 +189,11 @@ Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) {
     const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
     const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);
 
-    return Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, first_negate, second_negate);
+    return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate);
+}
+
+Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) {
+    return Operation(OperationCode::HUnpack, type, value); // type is the operation's meta
 }
 
 Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
@@ -209,17 +213,26 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
 
 Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) {
     if (absolute) {
-        value = Operation(OperationCode::HAbsolute, HALF_NO_PRECISE, value);
+        value = Operation(OperationCode::HAbsolute, NO_PRECISE, value);
     }
     if (negate) {
-        value = Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, GetPredicate(true),
+        value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true),
                           GetPredicate(true));
     }
     return value;
 }
 
+Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) {
+    if (!saturate) { // Saturation not requested: hand the node back unchanged
+        return value;
+    }
+    const Node positive_zero = Immediate(std::copysignf(0, 1)); // +0.0f, passed as HClamp's min
+    const Node positive_one = Immediate(1.0f);                  // 1.0f, passed as HClamp's max
+    return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one);
+}
+
 Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) {
-    static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
+    const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
         {PredCondition::LessThan, OperationCode::LogicalFLessThan},
         {PredCondition::Equal, OperationCode::LogicalFEqual},
         {PredCondition::LessEqual, OperationCode::LogicalFLessEqual},
@@ -255,7 +268,7 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N
 
 Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a,
                                              Node op_b) {
-    static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
+    const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
         {PredCondition::LessThan, OperationCode::LogicalILessThan},
         {PredCondition::Equal, OperationCode::LogicalIEqual},
         {PredCondition::LessEqual, OperationCode::LogicalILessEqual},
@@ -283,40 +296,32 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si
     return predicate;
 }
 
-Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition,
-                                          const MetaHalfArithmetic& meta, Node op_a, Node op_b) {
-
-    UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan ||
-                             condition == PredCondition::NotEqualWithNan ||
-                             condition == PredCondition::LessEqualWithNan ||
-                             condition == PredCondition::GreaterThanWithNan ||
-                             condition == PredCondition::GreaterEqualWithNan,
-                         "Unimplemented NaN comparison for half floats");
-
-    static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
+Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a,
+                                          Node op_b) {
+    const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
         {PredCondition::LessThan, OperationCode::Logical2HLessThan},
         {PredCondition::Equal, OperationCode::Logical2HEqual},
         {PredCondition::LessEqual, OperationCode::Logical2HLessEqual},
         {PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan},
         {PredCondition::NotEqual, OperationCode::Logical2HNotEqual},
         {PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual},
-        {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThan},
-        {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqual},
-        {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqual},
-        {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThan},
-        {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqual}};
+        {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan},
+        {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan},
+        {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan},
+        {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan},
+        {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}};
 
     const auto comparison{PredicateComparisonTable.find(condition)};
     UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
                          "Unknown predicate comparison operation");
 
-    const Node predicate = Operation(comparison->second, meta, op_a, op_b);
+    const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b);
 
     return predicate;
 }
 
 OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) {
-    static const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = {
+    const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = {
         {PredOperation::And, OperationCode::LogicalAnd},
         {PredOperation::Or, OperationCode::LogicalOr},
         {PredOperation::Xor, OperationCode::LogicalXor},
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 57af8b10f..81278fb33 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -109,11 +109,13 @@ enum class OperationCode {
     UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
     UBitCount,        /// (MetaArithmetic, uint) -> uint
 
-    HAdd,      /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HMul,      /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HFma,      /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
+    HAdd,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HMul,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HFma,      /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
     HAbsolute, /// (f16vec2 a) -> f16vec2
     HNegate,   /// (f16vec2 a, bool first, bool second) -> f16vec2
+    HClamp,    /// (f16vec2 src, float min, float max) -> f16vec2
+    HUnpack,   /// (Tegra::Shader::HalfType, T value) -> f16vec2
     HMergeF32, /// (f16vec2 src) -> float
     HMergeH0,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
     HMergeH1,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
@@ -150,12 +152,18 @@ enum class OperationCode {
     LogicalUNotEqual,     /// (uint a, uint b) -> bool
     LogicalUGreaterEqual, /// (uint a, uint b) -> bool
 
-    Logical2HLessThan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HEqual,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HLessEqual,    /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterThan,  /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HNotEqual,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessThan,            /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HEqual,               /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HLessEqual,           /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HGreaterThan,         /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HNotEqual,            /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HGreaterEqual,        /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HLessThanWithNan,     /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HEqualWithNan,        /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HLessEqualWithNan,    /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HGreaterThanWithNan,  /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HNotEqualWithNan,     /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
+    Logical2HGreaterEqualWithNan, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> bool2
 
     Texture,                /// (MetaTexture, float[N] coords) -> float4
     TextureLod,             /// (MetaTexture, float[N] coords) -> float4
@@ -308,13 +316,6 @@ struct MetaArithmetic {
     bool precise{};
 };
 
-struct MetaHalfArithmetic {
-    bool precise{};
-    std::array<Tegra::Shader::HalfType, 3> types = {Tegra::Shader::HalfType::H0_H1,
-                                                    Tegra::Shader::HalfType::H0_H1,
-                                                    Tegra::Shader::HalfType::H0_H1};
-};
-
 struct MetaTexture {
     const Sampler& sampler;
     Node array{};
@@ -326,11 +327,10 @@ struct MetaTexture {
     u32 element{};
 };
 
-constexpr MetaArithmetic PRECISE = {true};
-constexpr MetaArithmetic NO_PRECISE = {false};
-constexpr MetaHalfArithmetic HALF_NO_PRECISE = {false};
+inline constexpr MetaArithmetic PRECISE = {true};
+inline constexpr MetaArithmetic NO_PRECISE = {false};
 
-using Meta = std::variant<MetaArithmetic, MetaHalfArithmetic, MetaTexture>;
+using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
 
 /// Holds any kind of operation that can be done in the IR
 class OperationNode final {
@@ -734,10 +734,14 @@ private:
 
     /// Unpacks a half immediate from an instruction
     Node UnpackHalfImmediate(Tegra::Shader::Instruction instr, bool has_negation);
+    /// Unpacks a binary value into a half float pair with a type format
+    Node UnpackHalfFloat(Node value, Tegra::Shader::HalfType type);
     /// Merges a half pair into another value
     Node HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge);
     /// Conditionally absolute/negated half float pair. Absolute is applied first
     Node GetOperandAbsNegHalf(Node value, bool absolute, bool negate);
+    /// Conditionally saturates a half float pair
+    Node GetSaturatedHalfFloat(Node value, bool saturate = true);
 
     /// Returns a predicate comparing two floats
     Node GetPredicateComparisonFloat(Tegra::Shader::PredCondition condition, Node op_a, Node op_b);
@@ -745,8 +749,7 @@ private:
     Node GetPredicateComparisonInteger(Tegra::Shader::PredCondition condition, bool is_signed,
                                        Node op_a, Node op_b);
     /// Returns a predicate comparing two half floats. meta consumes how both pairs will be compared
-    Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition,
-                                    const MetaHalfArithmetic& meta, Node op_a, Node op_b);
+    Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b);
 
     /// Returns a predicate combiner operation
     OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 995d0e068..217805386 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -288,6 +288,29 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
     }
 }
 
+void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
+                   const u32 block_height, const std::size_t copy_size, const u8* source_data,
+                   u8* swizzle_data) { // Writes at most copy_size bytes into swizzle_data
+    const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; // Rounded up
+    std::size_t count = 0; // Bytes copied so far; doubles as the read offset into source_data
+    for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
+        const std::size_t gob_address_y =
+            (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
+            ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
+        const auto& table = legacy_swizzle_table[y % gob_size_y]; // Offsets indexed by x below
+        for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {
+            const std::size_t gob_address =
+                gob_address_y + (x / gob_size_x) * gob_size * block_height;
+            const std::size_t swizzled_offset = gob_address + table[x % gob_size_x];
+            const u8* source_line = source_data + count;
+            u8* dest_addr = swizzle_data + swizzled_offset;
+            count++; // source_line already points at the pre-increment offset
+
+            std::memcpy(dest_addr, source_line, 1); // One byte per (x, y) position
+        }
+    }
+}
+
 std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width,
                               u32 height) {
     std::vector<u8> rgba_data;
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index e078fa274..e072d8401 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -51,4 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
                       u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
                       u32 offset_x, u32 offset_y);
 
+void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
+                   const u32 block_height, const std::size_t copy_size, const u8* source_data,
+                   u8* swizzle_data);
+
 } // namespace Tegra::Texture
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index cb82ecf3f..60cda0ca3 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -5,6 +5,8 @@
 #include <memory>
 #include "core/core.h"
 #include "core/settings.h"
+#include "video_core/gpu_asynch.h"
+#include "video_core/gpu_synch.h"
 #include "video_core/renderer_base.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
 #include "video_core/video_core.h"
@@ -16,6 +18,14 @@ std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_wind
     return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system);
 }
 
+std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system) {
+    if (Settings::values.use_asynchronous_gpu_emulation) { // Setting picks GPUAsynch over GPUSynch
+        return std::make_unique<VideoCommon::GPUAsynch>(system, system.Renderer());
+    }
+
+    return std::make_unique<VideoCommon::GPUSynch>(system, system.Renderer());
+}
+
 u16 GetResolutionScaleFactor(const RendererBase& renderer) {
     return static_cast<u16>(
         Settings::values.resolution_factor
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index 3c583f195..b8e0ac372 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -14,6 +14,10 @@ namespace Core::Frontend {
 class EmuWindow;
 }
 
+namespace Tegra {
+class GPU;
+}
+
 namespace VideoCore {
 
 class RendererBase;
@@ -27,6 +31,9 @@ class RendererBase;
 std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window,
                                              Core::System& system);
 
+/// Creates an emulated GPU instance using the given system context.
+std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system);
+
 u16 GetResolutionScaleFactor(const RendererBase& renderer);
 
 } // namespace VideoCore