46 files changed, 1034 insertions, 394 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 099bb446e..21c46a567 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(video_core STATIC
     buffer_cache/buffer_cache.h
     buffer_cache/map_interval.cpp
     buffer_cache/map_interval.h
+    compatible_formats.cpp
+    compatible_formats.h
     dirty_flags.cpp
     dirty_flags.h
     dma_pusher.cpp
@@ -27,6 +29,8 @@ add_library(video_core STATIC
     engines/shader_type.h
     macro/macro.cpp
     macro/macro.h
+    macro/macro_hle.cpp
+    macro/macro_hle.h
     macro/macro_interpreter.cpp
     macro/macro_interpreter.h
     macro/macro_jit_x64.cpp
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index bae1d527c..cf8bdd021 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -41,7 +41,11 @@ class BufferCache {
     static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
 
 public:
-    using BufferInfo = std::pair<BufferType, u64>;
+    struct BufferInfo {
+        BufferType handle; ///< Handle() of the block or stream buffer holding the data
+        u64 offset;        ///< Offset of the data inside that buffer
+        u64 address;       ///< Address() of that same block or stream buffer
+    };
 
     BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                             bool is_written = false, bool use_fast_cbuf = false) {
@@ -50,7 +54,7 @@ public:
         auto& memory_manager = system.GPU().MemoryManager();
         const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr_opt) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         const VAddr cpu_addr = *cpu_addr_opt;
 
@@ -88,7 +92,7 @@ public:
         Buffer* const block = GetBlock(cpu_addr, size);
         MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
         if (!map) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
         }
         if (is_written) {
             map->MarkAsModified(true, GetModifiedTicks());
@@ -101,7 +105,7 @@ public:
             }
         }
 
-        return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))};
+        return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()};
     }
 
     /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
@@ -254,27 +258,17 @@ public:
         committed_flushes.pop_front();
     }
 
-    virtual BufferType GetEmptyBuffer(std::size_t size) = 0;
+    virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
 
 protected:
     explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                         std::unique_ptr<StreamBuffer> stream_buffer_)
-        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)},
-          stream_buffer_handle{stream_buffer->Handle()} {}
+                         std::unique_ptr<StreamBuffer> stream_buffer)
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {}
 
     ~BufferCache() = default;
 
     virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
 
-    virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                 const u8* data) = 0;
-
-    virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                   u8* data) = 0;
-
-    virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                           std::size_t dst_offset, std::size_t size) = 0;
-
     virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
         return {};
     }
@@ -336,11 +330,11 @@ private:
             const VAddr cpu_addr_end = cpu_addr + size;
             if (memory_manager.IsGranularRange(gpu_addr, size)) {
                 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
-                UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr);
+                block->Upload(block->Offset(cpu_addr), size, host_ptr);
             } else {
                 staging_buffer.resize(size);
                 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data());
+                block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
             }
             return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
         }
@@ -399,7 +393,7 @@ private:
             }
             staging_buffer.resize(size);
             system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
-            UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data());
+            block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
         }
     }
 
@@ -436,7 +430,7 @@ private:
 
         const std::size_t size = map->end - map->start;
         staging_buffer.resize(size);
-        DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data());
+        block->Download(block->Offset(map->start), size, staging_buffer.data());
         system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
         map->MarkAsModified(false, 0);
     }
@@ -449,7 +443,7 @@ private:
 
         buffer_ptr += size;
         buffer_offset += size;
-        return {stream_buffer_handle, uploaded_offset};
+        return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
     }
 
     void AlignBuffer(std::size_t alignment) {
@@ -464,7 +458,7 @@ private:
         const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
         const VAddr cpu_addr = buffer->CpuAddr();
         std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
-        CopyBlock(*buffer, *new_buffer, 0, 0, old_size);
+        new_buffer->CopyFrom(*buffer, 0, 0, old_size);
         QueueDestruction(std::move(buffer));
 
         const VAddr cpu_addr_end = cpu_addr + new_size - 1;
@@ -486,8 +480,8 @@ private:
         const std::size_t new_size = size_1 + size_2;
 
         std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
-        CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1);
-        CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2);
+        new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
+        new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
         QueueDestruction(std::move(first));
         QueueDestruction(std::move(second));
 
diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp
new file mode 100644
index 000000000..6c426b035
--- /dev/null
+++ b/src/video_core/compatible_formats.cpp
@@ -0,0 +1,162 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/compatible_formats.h"
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+namespace {
+
+// Compatibility table taken from Table 3.X.2 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
+
+constexpr std::array VIEW_CLASS_128_BITS = {
+    PixelFormat::RGBA32F,
+    PixelFormat::RGBA32UI,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+
+constexpr std::array VIEW_CLASS_96_BITS = {
+    PixelFormat::RGB32F,
+};
+// Missing formats:
+// PixelFormat::RGB32UI,
+// PixelFormat::RGB32I,
+
+constexpr std::array VIEW_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI, PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16F, PixelFormat::RGBA16S,
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I
+
+// TODO: How should we handle 48 bits?
+
+constexpr std::array VIEW_CLASS_32_BITS = {
+    PixelFormat::RG16F,        PixelFormat::R11FG11FB10F, PixelFormat::R32F,
+    PixelFormat::A2B10G10R10U, PixelFormat::RG16UI,       PixelFormat::R32UI,
+    PixelFormat::RG16I,        PixelFormat::R32I,         PixelFormat::ABGR8U,
+    PixelFormat::RG16,         PixelFormat::ABGR8S,       PixelFormat::RG16S,
+    PixelFormat::RGBA8_SRGB,   PixelFormat::E5B9G9R9F,    PixelFormat::BGRA8,
+    PixelFormat::BGRA8_SRGB,
+};
+// Missing formats:
+// PixelFormat::RGBA8UI
+// PixelFormat::RGBA8I
+// PixelFormat::RGB10_A2_UI
+
+// TODO: How should we handle 24 bits?
+
+constexpr std::array VIEW_CLASS_16_BITS = {
+    PixelFormat::R16F, PixelFormat::RG8UI, PixelFormat::R16UI, PixelFormat::R16I,
+    PixelFormat::RG8U, PixelFormat::R16U,  PixelFormat::RG8S,  PixelFormat::R16S,
+};
+// Missing formats:
+// PixelFormat::RG8I
+
+constexpr std::array VIEW_CLASS_8_BITS = {
+    PixelFormat::R8UI,
+    PixelFormat::R8U,
+};
+// Missing formats:
+// PixelFormat::R8I
+// PixelFormat::R8S
+
+constexpr std::array VIEW_CLASS_RGTC1_RED = {
+    PixelFormat::DXN1,
+};
+// Missing formats:
+// COMPRESSED_SIGNED_RED_RGTC1
+
+constexpr std::array VIEW_CLASS_RGTC2_RG = {
+    PixelFormat::DXN2UNORM,
+    PixelFormat::DXN2SNORM,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_UNORM = {
+    PixelFormat::BC7U,
+    PixelFormat::BC7U_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
+    PixelFormat::BC6H_SF16,
+    PixelFormat::BC6H_UF16,
+};
+
+// Compatibility table taken from Table 4.X.1 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
+
+constexpr std::array COPY_CLASS_128_BITS = {
+    PixelFormat::RGBA32UI,   PixelFormat::RGBA32F,   PixelFormat::DXT23,
+    PixelFormat::DXT23_SRGB, PixelFormat::DXT45,     PixelFormat::DXT45_SRGB,
+    PixelFormat::DXN2SNORM,  PixelFormat::BC7U,      PixelFormat::BC7U_SRGB,
+    PixelFormat::BC6H_SF16,  PixelFormat::BC6H_UF16,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+// COMPRESSED_RG_RGTC2
+
+constexpr std::array COPY_CLASS_64_BITS = {
+    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI,  PixelFormat::RG32UI,
+    PixelFormat::RGBA16U, PixelFormat::RGBA16S, PixelFormat::DXT1_SRGB, PixelFormat::DXT1,
+
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I,
+// COMPRESSED_RGB_S3TC_DXT1_EXT
+// COMPRESSED_SRGB_S3TC_DXT1_EXT
+// COMPRESSED_RGBA_S3TC_DXT1_EXT
+// COMPRESSED_SIGNED_RED_RGTC1
+
+void Enable(FormatCompatibility::Table& compatibility, size_t format_a, size_t format_b) {
+    compatibility[format_a][format_b] = true; // Symmetric relation: set both directions
+    compatibility[format_b][format_a] = true;
+}
+
+void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
+    Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
+}
+
+template <typename Range>
+void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
+    for (auto it_a = range.begin(); it_a != range.end(); ++it_a) { // Enable every pair in range
+        for (auto it_b = it_a; it_b != range.end(); ++it_b) {      // Starts at it_a: (a, a) too
+            Enable(compatibility, *it_a, *it_b);
+        }
+    }
+}
+
+} // Anonymous namespace
+
+FormatCompatibility::FormatCompatibility() {
+    for (size_t i = 0; i < MaxPixelFormat; ++i) {
+        // Identity is allowed: every format is view compatible with itself
+        Enable(view, i, i);
+    }
+
+    EnableRange(view, VIEW_CLASS_128_BITS);
+    EnableRange(view, VIEW_CLASS_96_BITS);
+    EnableRange(view, VIEW_CLASS_64_BITS);
+    EnableRange(view, VIEW_CLASS_32_BITS);
+    EnableRange(view, VIEW_CLASS_16_BITS);
+    EnableRange(view, VIEW_CLASS_8_BITS);
+    EnableRange(view, VIEW_CLASS_RGTC1_RED);
+    EnableRange(view, VIEW_CLASS_RGTC2_RG);
+    EnableRange(view, VIEW_CLASS_BPTC_UNORM);
+    EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
+
+    copy = view; // Start from the view table; the copy classes below extend it
+    EnableRange(copy, COPY_CLASS_128_BITS);
+    EnableRange(copy, COPY_CLASS_64_BITS);
+}
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h
new file mode 100644
index 000000000..d1082566d
--- /dev/null
+++ b/src/video_core/compatible_formats.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+/// Lookup tables telling which pairs of pixel formats are marked as view or copy compatible.
+class FormatCompatibility {
+public:
+    /// Square matrix of flags indexed as [format_a][format_b], both cast to size_t.
+    using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
+
+    /// Builds the view and copy tables.
+    explicit FormatCompatibility();
+
+    /// Returns true when the view table marks format_a and format_b as compatible.
+    bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+    /// Returns true when the copy table marks format_a and format_b as compatible.
+    bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
+        return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+    }
+
+private:
+    Table view; ///< Pairs enabled from the texture view classes
+    Table copy; ///< Everything in view plus the pairs from the copy classes
+};
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index ea3c8a963..c01436295 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters)
         ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
 
     // Execute the current macro.
-    macro_engine->Execute(macro_positions[entry], parameters);
+    macro_engine->Execute(*this, macro_positions[entry], parameters);
     if (mme_draw.current_mode != MMEDrawMode::Undefined) {
         FlushMMEInlineDraw();
     }
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index d5fe25065..ef1618990 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1418,6 +1418,14 @@ public:
         return execute_on;
     }
 
+    VideoCore::RasterizerInterface& GetRasterizer() {
+        return rasterizer;
+    }
+
+    const VideoCore::RasterizerInterface& GetRasterizer() const {
+        return rasterizer;
+    }
+
     /// Notify a memory write has happened.
     void OnMemoryWrite() {
         dirty.flags |= dirty.on_write_stores;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index e7cb87589..d374b73cf 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -661,6 +661,10 @@ union Instruction {
     constexpr Instruction(u64 value) : value{value} {}
     constexpr Instruction(const Instruction& instr) : value(instr.value) {}
 
+    constexpr bool Bit(u64 offset) const {
+        return ((value >> offset) & 1) != 0;
+    }
+
     BitField<0, 8, Register> gpr0;
     BitField<8, 8, Register> gpr8;
     union {
@@ -1874,7 +1878,9 @@ public:
         HSETP2_C,
         HSETP2_R,
         HSETP2_IMM,
+        HSET2_C,
         HSET2_R,
+        HSET2_IMM,
         POPC_C,
         POPC_R,
         POPC_IMM,
@@ -2194,7 +2200,9 @@ private:
             INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
             INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
             INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
+            INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"),
             INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
+            INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),
             INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
             INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 8eb017f65..482e49711 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <chrono>
+
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
@@ -154,8 +156,7 @@ u64 GPU::GetTicks() const {
     constexpr u64 gpu_ticks_num = 384;
     constexpr u64 gpu_ticks_den = 625;
 
-    const u64 cpu_ticks = system.CoreTiming().GetTicks();
-    u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count();
+    u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
     if (Settings::values.use_fast_gpu_time) {
         nanoseconds /= 256;
     }
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a1b4c305c..2c42483bd 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -284,6 +284,12 @@ public:
     /// core timing events.
     virtual void Start() = 0;
 
+    /// Obtain the CPU Context
+    virtual void ObtainContext() = 0;
+
+    /// Release the CPU Context
+    virtual void ReleaseContext() = 0;
+
     /// Push GPU command entries to be processed
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 53305ab43..7b855f63e 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -19,10 +19,17 @@ GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBa
 GPUAsynch::~GPUAsynch() = default;
 
 void GPUAsynch::Start() {
-    cpu_context->MakeCurrent();
     gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher);
 }
 
+void GPUAsynch::ObtainContext() {
+    cpu_context->MakeCurrent();
+}
+
+void GPUAsynch::ReleaseContext() {
+    cpu_context->DoneCurrent();
+}
+
 void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
     gpu_thread.SubmitList(std::move(entries));
 }
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 517658612..15e9f1d38 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -25,6 +25,8 @@ public:
     ~GPUAsynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 6f38a672a..aaeb9811d 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -13,10 +13,16 @@ GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase
 
 GPUSynch::~GPUSynch() = default;
 
-void GPUSynch::Start() {
+void GPUSynch::Start() {}
+
+void GPUSynch::ObtainContext() {
     context->MakeCurrent();
 }
 
+void GPUSynch::ReleaseContext() {
+    context->DoneCurrent();
+}
+
 void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
     dma_pusher->Push(std::move(entries));
     dma_pusher->DispatchCalls();
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 4a6e9a01d..762c20aa5 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -24,6 +24,8 @@ public:
     ~GPUSynch() override;
 
     void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c3bb4fe06..738c6f0c1 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/core.h"
 #include "core/frontend/emu_window.h"
 #include "core/settings.h"
@@ -18,7 +19,11 @@ namespace VideoCommon::GPUThread {
 static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
                       Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
                       SynchState& state) {
-    MicroProfileOnThreadCreate("GpuThread");
+    std::string name = "yuzu:GPU";
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+    system.RegisterHostThread();
 
     // Wait for first GPU command before acquiring the window context
     while (state.queue.Empty())
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
index 89077a2d8..a50e7b4e0 100644
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -2,32 +2,78 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <optional>
+#include <boost/container_hash/hash.hpp>
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro/macro.h"
+#include "video_core/macro/macro_hle.h"
 #include "video_core/macro/macro_interpreter.h"
 #include "video_core/macro/macro_jit_x64.h"
 
 namespace Tegra {
 
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
+    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
+
+MacroEngine::~MacroEngine() = default;
+
 void MacroEngine::AddCode(u32 method, u32 data) {
     uploaded_macro_code[method].push_back(data);
 }
 
-void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
+                          const std::vector<u32>& parameters) {
     auto compiled_macro = macro_cache.find(method);
     if (compiled_macro != macro_cache.end()) {
-        compiled_macro->second->Execute(parameters, method);
+        const auto& cache_info = compiled_macro->second;
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
     } else {
         // Macro not compiled, check if it's uploaded and if so, compile it
-        auto macro_code = uploaded_macro_code.find(method);
+        std::optional<u32> mid_method = std::nullopt;
+        const auto macro_code = uploaded_macro_code.find(method);
         if (macro_code == uploaded_macro_code.end()) {
-            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
-            return;
+            for (const auto& [method_base, code] : uploaded_macro_code) {
+                if (method >= method_base && (method - method_base) < code.size()) {
+                    mid_method = method_base;
+                    break;
+                }
+            }
+            if (!mid_method.has_value()) {
+                UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+                return;
+            }
+        }
+        auto& cache_info = macro_cache[method];
+
+        if (!mid_method.has_value()) {
+            cache_info.lle_program = Compile(macro_code->second);
+            cache_info.hash = boost::hash_value(macro_code->second);
+        } else {
+            const auto& macro_cached = uploaded_macro_code[mid_method.value()];
+            const auto rebased_method = method - mid_method.value();
+            auto& code = uploaded_macro_code[method];
+            // The macro was called at an offset inside a larger uploaded program: keep a copy
+            // of that program's tail, starting at the offset, as this method's own code.
+            code.assign(macro_cached.begin() + rebased_method, macro_cached.end());
+            cache_info.hash = boost::hash_value(code);
+            cache_info.lle_program = Compile(code);
+        }
+
+        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+        if (hle_program.has_value()) {
+            cache_info.has_hle_program = true;
+            cache_info.hle_program = std::move(hle_program.value());
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
         }
-        macro_cache[method] = Compile(macro_code->second);
-        macro_cache[method]->Execute(parameters, method);
     }
 }
 
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
index b76ed891f..4d00b84b0 100644
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -11,9 +11,11 @@
 #include "common/common_types.h"
 
 namespace Tegra {
+
 namespace Engines {
 class Maxwell3D;
 }
+
 namespace Macro {
 constexpr std::size_t NUM_MACRO_REGISTERS = 8;
 enum class Operation : u32 {
@@ -94,6 +96,8 @@ union MethodAddress {
 
 } // namespace Macro
 
+class HLEMacro;
+
 class CachedMacro {
 public:
     virtual ~CachedMacro() = default;
@@ -107,20 +111,29 @@ public:
 
 class MacroEngine {
 public:
-    virtual ~MacroEngine() = default;
+    explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
+    virtual ~MacroEngine();
 
     // Store the uploaded macro code to compile them when they're called.
     void AddCode(u32 method, u32 data);
 
     // Compiles the macro if its not in the cache, and executes the compiled macro
-    void Execute(u32 method, const std::vector<u32>& parameters);
+    void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
 
 protected:
     virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
 
 private:
-    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+    struct CacheInfo {
+        std::unique_ptr<CachedMacro> lle_program{};
+        std::unique_ptr<CachedMacro> hle_program{};
+        u64 hash{};
+        bool has_hle_program{};
+    };
+
+    std::unordered_map<u32, CacheInfo> macro_cache;
     std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+    std::unique_ptr<HLEMacro> hle_macros;
 };
 
 std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
new file mode 100644
index 000000000..410f99018
--- /dev/null
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,134 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <utility>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra {
+
+namespace {
+// HLE'd functions. Each one is named after the hash of the macro code it replaces and is
+// registered under that hash in hle_funcs below.
+
+/// Programs the index array registers from the macro parameters and issues Draw(true, true).
+/// Parameters: [0] topology, [1] index count, [2] instance count (ANDed with register 0xD1B),
+/// [3] element base, [4] first index, [5] base instance.
+static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+
+    // Keep only the low 26 bits of the topology word. The mask used to be written as
+    // ~(0x3ffffff << 26), which shifts a signed int beyond its width (undefined behaviour).
+    constexpr u32 topology_mask = 0x3ffffff;
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] &
+                                                                        topology_mask));
+    maxwell3d.regs.vb_base_instance = parameters[5];
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.regs.vb_element_base = parameters[3];
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.index_array.first = parameters[4];
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    // Reset the draw state that was set up above
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+/// Programs the vertex buffer range from the macro parameters and issues Draw(false, true).
+/// Parameters: [0] topology, [1] vertex count, [2] instance count (ANDed with register 0xD1B),
+/// [3] first vertex, [4] base instance.
+static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+
+    maxwell3d.regs.vertex_buffer.first = parameters[3];
+    maxwell3d.regs.vertex_buffer.count = parameters[1];
+    maxwell3d.regs.vb_base_instance = parameters[4];
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    maxwell3d.mme_draw.instance_count = count;
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(false, true);
+    }
+    maxwell3d.regs.vertex_buffer.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+/// Programs the index array registers, writes register 0x446 and calls methods 0x8e3-0x8e5
+/// with the element base and base instance, issues Draw(true, true), then repeats those
+/// writes with the base values zeroed.
+/// Parameters: [0] topology, [1] index count, [2] instance count (ANDed with register 0xD1B),
+/// [3] first index, [4] element base, [5] base instance.
+static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+    const u32 element_base = parameters[4];
+    const u32 base_instance = parameters[5];
+    maxwell3d.regs.index_array.first = parameters[3];
+    maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.vb_element_base = element_base;
+    maxwell3d.regs.vb_base_instance = base_instance;
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, element_base);
+    maxwell3d.CallMethodFromMME(0x8e5, base_instance);
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.regs.vb_element_base = 0x0;
+    maxwell3d.regs.vb_base_instance = 0x0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, 0x0);
+    maxwell3d.CallMethodFromMME(0x8e5, 0x0);
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+} // namespace
+
+/// Pairs a macro code hash with the HLE function that replaces it; searched by GetHLEProgram.
+constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
+    std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
+    std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
+    std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
+}};
+
+HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+HLEMacro::~HLEMacro() = default;
+
+/// Returns an HLEMacroImpl for the entry of hle_funcs matching hash, or std::nullopt if none.
+std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
+    const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
+                                 [hash](const auto& pair) { return pair.first == hash; });
+    if (it == hle_funcs.end()) {
+        return std::nullopt;
+    }
+    return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
+}
+
+HLEMacroImpl::~HLEMacroImpl() = default;
+
+HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
+    : maxwell3d(maxwell3d), func(func) {}
+
+/// Forwards to the wrapped HLE function; method is not used.
+void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
+    func(maxwell3d, parameters);
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
new file mode 100644
index 000000000..37af875a0
--- /dev/null
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
+
+class HLEMacro { // Looks up HLE (native C++) implementations of macros by hash.
+public:
+    explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
+    ~HLEMacro();
+
+    std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const; // nullopt if none
+
+private:
+    Engines::Maxwell3D& maxwell3d; // handed to every HLEMacroImpl this object creates
+};
+
+class HLEMacroImpl : public CachedMacro { // CachedMacro that runs a single HLEFunction.
+public:
+    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
+    ~HLEMacroImpl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override; // `method` is unused
+
+private:
+    Engines::Maxwell3D& maxwell3d; // passed as the first argument of every func call
+    HLEFunction func;              // invoked by Execute()
+};
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 5edff27aa..aa5256419 100644
--- a/src/video_core/macro/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -11,7 +11,8 @@
 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
 
 namespace Tegra {
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
 
 std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
     return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
index 30abb66e5..07292702f 100644
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -28,7 +28,8 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
     BRANCH_HOLDER,
 });
 
-MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
 
 std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
     return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index ad0577a4f..d9f7b4cc6 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,21 +22,46 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
-Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
+    : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
 }
 
 Buffer::~Buffer() = default;
 
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+    const auto gl_offset = static_cast<GLintptr>(offset); // std::size_t -> GLintptr for GL
+    glNamedBufferSubData(gl_buffer.handle, gl_offset, static_cast<GLsizeiptr>(size), data);
+}
+
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); // issued before every readback
+    const auto gl_offset = static_cast<GLintptr>(offset);
+    glGetNamedBufferSubData(gl_buffer.handle, gl_offset, static_cast<GLsizeiptr>(size), data);
+}
+
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const { // `src` is read from, this buffer is written to
+    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}
+
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               const Device& device, std::size_t stream_size)
-    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+                               const Device& device_, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system,
+                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+      device{device_} {
     if (!device.HasFastBufferSubData()) {
         return;
     }
 
-    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
     glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
     for (const GLuint cbuf : cbufs) {
         glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -48,39 +73,20 @@ OGLBufferCache::~OGLBufferCache() {
 }
 
 std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<Buffer>(cpu_addr, size);
+    return std::make_shared<Buffer>(device, cpu_addr, size);
 }
 
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return 0;
-}
-
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                     const u8* data) {
-    glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                       u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
-                            static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0};
 }
 
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
     const GLuint cbuf = cbufs[cbuf_cursor++];
+
     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0};
+    return {cbuf, 0, 0};
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a49aaf9c4..59d95adbc 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -25,15 +25,27 @@ class RasterizerOpenGL;
 
 class Buffer : public VideoCommon::BufferBlock {
 public:
-    explicit Buffer(VAddr cpu_addr, const std::size_t size);
+    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
-    GLuint Handle() const {
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+    void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const;
+
+    GLuint Handle() const noexcept {
         return gl_buffer.handle;
     }
 
+    u64 Address() const noexcept {
+        return gpu_address;
+    }
+
 private:
     OGLBuffer gl_buffer;
+    u64 gpu_address = 0;
 };
 
 using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
@@ -43,7 +55,7 @@ public:
                             const Device& device, std::size_t stream_size);
     ~OGLBufferCache();
 
-    GLuint GetEmptyBuffer(std::size_t) override;
+    BufferInfo GetEmptyBuffer(std::size_t) override;
 
     void Acquire() noexcept {
         cbuf_cursor = 0;
@@ -52,22 +64,16 @@ public:
 protected:
     std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
-
     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
 
 private:
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device;
+
     std::size_t cbuf_cursor = 0;
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
-        cbufs;
+    std::array<GLuint, NUM_CBUFS> cbufs{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 1011c7738..208fc6167 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -178,7 +178,7 @@ bool IsASTCSupported() {
         for (const GLenum format : formats) {
             for (const GLenum support : required_support) {
                 GLint value;
-                glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
+                glGetInternalformativ(target, format, support, 1, &value);
                 if (value != GL_FULL_SUPPORT) {
                     return false;
                 }
@@ -193,6 +193,7 @@ bool IsASTCSupported() {
 Device::Device()
     : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
     const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
     const std::vector extensions = GetExtensions();
 
@@ -221,8 +222,13 @@ Device::Device()
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
-    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
     has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
+
+    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants"
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
     use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
                            GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
                            GLAD_GL_NV_transform_feedback2;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index c86e709b1..e1d811966 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -72,6 +72,10 @@ public:
         return has_texture_shadow_lod;
     }
 
+    bool HasVertexBufferUnifiedMemory() const { // GL_NV_vertex_buffer_unified_memory support
+        return has_vertex_buffer_unified_memory;
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -115,6 +119,7 @@ private:
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
     bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 2d6c11320..e960a0ef1 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -61,7 +61,8 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
 constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
     NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
 
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
@@ -193,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
     // avoid OpenGL errors.
     // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
     // assume every shader uses them all.
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexFormat0 + index]) {
             continue;
         }
@@ -212,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() {
         if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
             attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
             glVertexAttribIFormat(gl_index, attrib.ComponentCount(),
-                                  MaxwellToGL::VertexType(attrib), attrib.offset);
+                                  MaxwellToGL::VertexFormat(attrib), attrib.offset);
         } else {
-            glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
+            glVertexAttribFormat(gl_index, attrib.ComponentCount(),
+                                 MaxwellToGL::VertexFormat(attrib),
                                  attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
         }
         glVertexAttribBinding(gl_index, attrib.buffer);
@@ -231,9 +233,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
+    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
     // Upload all guest vertex arrays sequentially to our buffer
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
         }
@@ -246,16 +250,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
         const GPUVAddr start = vertex_array.StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
         ASSERT(end >= start);
+
+        const GLuint gl_index = static_cast<GLuint>(index);
         const u64 size = end - start;
         if (size == 0) {
-            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            if (use_unified_memory) {
+                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+            }
             continue;
         }
-        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
-                           vertex_array.stride);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        if (use_unified_memory) {
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+                                   info.address + info.offset, size);
+        } else {
+            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+        }
     }
 }
 
@@ -268,7 +281,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
     flags[Dirty::VertexInstances] = false;
 
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexInstance0 + index]) {
             continue;
         }
@@ -285,9 +298,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
     MICROPROFILE_SCOPE(OpenGL_Index);
     const auto& regs = system.GPU().Maxwell3D().regs;
     const std::size_t size = CalculateIndexBufferSize();
-    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
-    return offset;
+    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+    return info.offset;
 }
 
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
@@ -643,9 +656,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     if (!device.UseAssemblyShaders()) {
         MaxwellUniformData ubo;
         ubo.SetFromRegs(gpu);
-        const auto [buffer, offset] =
+        const auto info =
             buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
                           static_cast<GLsizeiptr>(sizeof(ubo)));
     }
 
@@ -956,8 +969,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
         if (device.UseAssemblyShaders()) {
             glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
         } else {
-            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
-                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
         }
         return;
     }
@@ -970,24 +982,25 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
 
     const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
     const GPUVAddr gpu_addr = buffer.address;
-    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
 
     if (device.UseAssemblyShaders()) {
         UNIMPLEMENTED_IF(use_unified);
-        if (offset != 0) {
+        if (info.offset != 0) {
             const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-            cbuf = staging_cbuf;
-            offset = 0;
+            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+            info.handle = staging_cbuf;
+            info.offset = 0;
         }
-        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
+        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
         return;
     }
 
     if (use_unified) {
-        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+                                 unified_offset, size);
     } else {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
     }
 }
 
@@ -1023,9 +1036,8 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
-    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
                       static_cast<GLsizeiptr>(size));
 }
 
@@ -1712,8 +1724,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
         const GLuint handle = transform_feedback_buffers[index].handle;
         const GPUVAddr gpu_addr = binding.Address();
         const std::size_t size = binding.buffer_size;
-        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+                                 static_cast<GLsizeiptr>(size));
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 46e780a06..c6a3bf3a1 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -460,8 +460,9 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
         const u8* host_ptr_b = memory_manager.GetPointer(address_b);
         code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
     }
+    const std::size_t code_size = code.size() * sizeof(u64);
 
-    const auto unique_identifier = GetUniqueIdentifier(
+    const u64 unique_identifier = GetUniqueIdentifier(
         GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
 
     const ShaderParameters params{system,    disk_cache, device,
@@ -477,7 +478,7 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     Shader* const result = shader.get();
     if (cpu_addr) {
-        Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64));
+        Register(std::move(shader), *cpu_addr, code_size);
     } else {
         null_shader = std::move(shader);
     }
@@ -495,8 +496,9 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
 
     const auto host_ptr{memory_manager.GetPointer(code_addr)};
     // No kernel found, create a new one
-    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+    const std::size_t code_size{code.size() * sizeof(u64)};
+    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
 
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
@@ -511,7 +513,7 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
 
     Shader* const result = kernel.get();
     if (cpu_addr) {
-        Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64));
+        Register(std::move(kernel), *cpu_addr, code_size);
     } else {
         null_kernel = std::move(kernel);
     }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 6848f1388..994aaeaf2 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -37,7 +37,6 @@ namespace OpenGL {
 
 class Device;
 class RasterizerOpenGL;
-struct UnspecializedShader;
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 932a2f69e..3655ff629 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <deque>
+#include <tuple>
 #include <vector>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
-                                 bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
     : buffer_size(size) {
     gl_buffer.Create();
 
@@ -29,23 +30,19 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
         allocate_size *= 2;
     }
 
-    if (use_persistent) {
-        persistent = true;
-        coherent = prefer_coherent;
-        const GLbitfield flags =
-            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
-    } else {
-        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+    mapped_ptr = static_cast<u8*>(
+        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
     }
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
-    if (persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
+    glUnmapNamedBuffer(gl_buffer.handle);
     gl_buffer.Release();
 }
 
@@ -60,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
 
     bool invalidate = false;
     if (buffer_pos + size > buffer_size) {
+        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+        glInvalidateBufferData(gl_buffer.handle);
+
         buffer_pos = 0;
         invalidate = true;
-
-        if (persistent) {
-            glUnmapNamedBuffer(gl_buffer.handle);
-        }
     }
 
-    if (invalidate || !persistent) {
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
-                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
-        mapped_ptr = static_cast<u8*>(
-            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
-        mapped_offset = buffer_pos;
-    }
-
-    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
 }
 
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
     ASSERT(size <= mapped_size);
 
-    if (!coherent && size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
-    }
-
-    if (!persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
+    if (size > 0) {
+        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
     }
 
     buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index 866da3594..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,10 +11,11 @@
 
 namespace OpenGL {
 
+class Device;
+
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
-                             bool use_persistent = true);
+    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
     ~OGLStreamBuffer();
 
     /*
@@ -33,19 +34,20 @@ public:
         return gl_buffer.handle;
     }
 
-    GLsizeiptr Size() const {
+    u64 Address() const noexcept { // address queried in the constructor; 0 if it was not queried
+        return gpu_address;
+    }
+
+    GLsizeiptr Size() const noexcept {
         return buffer_size;
     }
 
 private:
     OGLBuffer gl_buffer;
 
-    bool coherent = false;
-    bool persistent = false;
-
+    GLuint64EXT gpu_address = 0;
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
-    GLintptr mapped_offset = 0;
     GLsizeiptr mapped_size = 0;
     u8* mapped_ptr = nullptr;
 };
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 35e329240..774e70a5b 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,10 +24,11 @@ namespace MaxwellToGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
-inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
+inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
-    case Maxwell::VertexAttribute::Type::UnsignedInt:
     case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::UnsignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -48,8 +49,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT_2_10_10_10_REV;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedInt:
     case Maxwell::VertexAttribute::Type::SignedNorm:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -84,36 +86,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_FLOAT;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_UNSIGNED_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_UNSIGNED_SHORT;
-        }
-        break;
-    case Maxwell::VertexAttribute::Type::SignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_SHORT;
-        }
-        break;
     }
-    UNIMPLEMENTED_MSG("Unimplemented vertex type={} and size={}", attrib.TypeString(),
+    UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),
                       attrib.SizeString());
     return {};
 }
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6214fcbc3..c40adb6e7 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {
 
     // Clear screen to black
     LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+                                         &vertex_buffer_address);
+    }
 }
 
 void RendererOpenGL::AddTelemetryFields() {
@@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
                          offsetof(ScreenRectVertex, tex_coord));
     glVertexAttribBinding(PositionLocation, 0);
     glVertexAttribBinding(TexCoordLocation, 0);
-    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+                               sizeof(vertices));
+    } else {
+        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    }
 
     glBindTextureUnit(0, screen_info.display_texture);
     glBindSampler(0, 0);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 61bf507f4..8b18d32e6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -107,6 +107,9 @@ private:
     OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
+    /// GPU address of the vertex buffer, queried when unified vertex buffer memory is supported
+    GLuint64EXT vertex_buffer_address = 0;
+
     /// Display information for Switch screen
     ScreenInfo screen_info;
 
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 1f2b6734b..d7f1ae89f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -294,6 +294,28 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,
 
 VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
     switch (type) {
+    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+        switch (size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+            return VK_FORMAT_R8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8:
+            return VK_FORMAT_R8G8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+            return VK_FORMAT_R8G8B8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return VK_FORMAT_R8G8B8A8_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return VK_FORMAT_R16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return VK_FORMAT_R16G16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return VK_FORMAT_R16G16B16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return VK_FORMAT_R16G16B16A16_UNORM;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
+        }
+        break;
     case Maxwell::VertexAttribute::Type::SignedNorm:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -314,62 +336,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R16G16B16A16_SNORM;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return VK_FORMAT_A2B10G10R10_SNORM_PACK32;
-        default:
-            break;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_UNORM;
+            return VK_FORMAT_R8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_UNORM;
+            return VK_FORMAT_R8G8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_UNORM;
+            return VK_FORMAT_R8G8B8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_UNORM;
+            return VK_FORMAT_R8G8B8A8_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_UNORM;
+            return VK_FORMAT_R16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_UNORM;
+            return VK_FORMAT_R16G16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_UNORM;
+            return VK_FORMAT_R16G16B16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_UNORM;
+            return VK_FORMAT_R16G16B16A16_USCALED;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
-            return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
-        default:
-            break;
+            return VK_FORMAT_A2B10G10R10_USCALED_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedInt:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SINT;
+            return VK_FORMAT_R8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SINT;
+            return VK_FORMAT_R8G8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SINT;
+            return VK_FORMAT_R8G8B8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SINT;
+            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SINT;
+            return VK_FORMAT_R16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SINT;
+            return VK_FORMAT_R16G16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SINT;
+            return VK_FORMAT_R16G16B16_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32:
-            return VK_FORMAT_R32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32:
-            return VK_FORMAT_R32G32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32:
-            return VK_FORMAT_R32G32B32_SINT;
-        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
-            return VK_FORMAT_R32G32B32A32_SINT;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SSCALED;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SSCALED_PACK32;
         }
         break;
     case Maxwell::VertexAttribute::Type::UnsignedInt:
@@ -398,56 +408,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_UINT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_UINT;
-        default:
-            break;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_UINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_USCALED;
+            return VK_FORMAT_R8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_USCALED;
+            return VK_FORMAT_R8G8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_USCALED;
+            return VK_FORMAT_R8G8B8_SINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_USCALED;
+            return VK_FORMAT_R8G8B8A8_SINT;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_USCALED;
+            return VK_FORMAT_R16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_USCALED;
+            return VK_FORMAT_R16G16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_USCALED;
+            return VK_FORMAT_R16G16B16_SINT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_USCALED;
-        default:
-            break;
+            return VK_FORMAT_R16G16B16A16_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32:
+            return VK_FORMAT_R32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32:
+            return VK_FORMAT_R32G32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32:
+            return VK_FORMAT_R32G32B32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return VK_FORMAT_R32G32B32A32_SINT;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return VK_FORMAT_A2B10G10R10_SINT_PACK32;
         }
         break;
-    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::Float:
         switch (size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-            return VK_FORMAT_R8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-            return VK_FORMAT_R8G8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-            return VK_FORMAT_R8G8B8_SSCALED;
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return VK_FORMAT_R8G8B8A8_SSCALED;
         case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SSCALED;
+            return VK_FORMAT_R16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SSCALED;
+            return VK_FORMAT_R16G16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SSCALED;
+            return VK_FORMAT_R16G16B16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SSCALED;
-        default:
-            break;
-        }
-        break;
-    case Maxwell::VertexAttribute::Type::Float:
-        switch (size) {
+            return VK_FORMAT_R16G16B16A16_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32:
             return VK_FORMAT_R32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32:
@@ -456,16 +460,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R32G32B32_SFLOAT;
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return VK_FORMAT_R32G32B32A32_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16:
-            return VK_FORMAT_R16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-            return VK_FORMAT_R16G16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-            return VK_FORMAT_R16G16B16_SFLOAT;
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return VK_FORMAT_R16G16B16A16_SFLOAT;
-        default:
-            break;
         }
         break;
     }
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index cd9673d1f..2d9b18ed9 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -155,11 +155,31 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc
         }
     }
 
-    static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"};
-    vk::Span<const char*> layers = layers_data;
-    if (!enable_layers) {
-        layers = {};
+    std::vector<const char*> layers;
+    layers.reserve(1);
+    if (enable_layers) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+
+    const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld);
+    if (!layer_properties) {
+        LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers");
+        layers.clear();
+    }
+
+    for (auto layer_it = layers.begin(); layer_it != layers.end();) {
+        const char* const layer = *layer_it;
+        const auto it = std::find_if(
+            layer_properties->begin(), layer_properties->end(),
+            [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); });
+        if (it == layer_properties->end()) {
+            LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer);
+            layer_it = layers.erase(layer_it);
+        } else {
+            ++layer_it;
+        }
     }
+
     vk::Instance instance = vk::Instance::Create(layers, extensions, dld);
     if (!instance) {
         LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance");
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 1fde38328..f10f96cd8 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -37,9 +37,9 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
 
 } // Anonymous namespace
 
-Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
-               std::size_t size)
-    : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_,
+               VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size)
+    : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} {
     VkBufferCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
     ci.pNext = nullptr;
@@ -56,40 +56,15 @@ Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cp
 
 Buffer::~Buffer() = default;
 
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                             const VKDevice& device, VKMemoryManager& memory_manager,
-                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
-    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
-                                                                 CreateStreamBuffer(device,
-                                                                                    scheduler)},
-      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
-                                                                                staging_pool} {}
-
-VKBufferCache::~VKBufferCache() = default;
-
-std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);
-}
-
-VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
-    size = std::max(size, std::size_t(4));
-    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
-        cmdbuf.FillBuffer(buffer, 0, size, 0);
-    });
-    return *empty.handle;
-}
-
-void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                    const u8* data) {
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     std::memcpy(staging.commit->Map(size), data, size);
 
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
-        cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
+        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size});
 
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -98,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
         barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
@@ -106,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
     });
 }
 
-void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                      u8* data) {
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
-                      size](vk::CommandBuffer cmdbuf) {
+
+    const VkBuffer handle = Handle();
+    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
         barrier.pNext = nullptr;
@@ -119,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
         barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
         barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
+        barrier.buffer = handle;
         barrier.offset = offset;
         barrier.size = size;
 
@@ -127,17 +102,19 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
                                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
-        cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size});
+        cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size});
     });
     scheduler.Finish();
 
     std::memcpy(data, staging.commit->Map(size), size);
 }
 
-void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                              std::size_t dst_offset, std::size_t size) {
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const {
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset,
+
+    const VkBuffer dst_buffer = Handle();
+    scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
                       size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
 
@@ -165,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
     });
 }
 
+VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                             const VKDevice& device, VKMemoryManager& memory_manager,
+                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
+    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
+                                                                 CreateStreamBuffer(device,
+                                                                                    scheduler)},
+      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
+                                                                                staging_pool} {}
+
+VKBufferCache::~VKBufferCache() = default;
+
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr,
+                                    size);
+}
+
+VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
+    size = std::max(size, std::size_t(4));
+    const auto& empty = staging_pool.GetUnusedBuffer(size, false);
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
+        cmdbuf.FillBuffer(buffer, 0, size, 0);
+    });
+    return {*empty.handle, 0, 0};
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 9ebbef835..3630aca77 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -25,15 +25,29 @@ class VKScheduler;
 
 class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
-                    std::size_t size);
+    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler,
+                    VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+    void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const;
+
     VkBuffer Handle() const {
         return *buffer.handle;
     }
 
+    u64 Address() const {
+        return 0;
+    }
+
 private:
+    VKScheduler& scheduler;
+    VKStagingBufferPool& staging_pool;
+
     VKBuffer buffer;
 };
 
@@ -44,20 +58,11 @@ public:
                            VKScheduler& scheduler, VKStagingBufferPool& staging_pool);
     ~VKBufferCache();
 
-    VkBuffer GetEmptyBuffer(std::size_t size) override;
+    BufferInfo GetEmptyBuffer(std::size_t size) override;
 
 protected:
     std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
-
 private:
     const VKDevice& device;
     VKMemoryManager& memory_manager;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 29001953c..a8d94eac3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -143,6 +143,49 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
     }
 }
 
+/// @brief Determine if a color attachment to be updated has to preserve its contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) {
+    if (!is_clear) {
+        return true;
+    }
+    // First we have to make sure all clear masks are enabled.
+    if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B ||
+        !regs.clear_buffers.A) {
+        return true;
+    }
+    // If the clear scissor is disabled, the whole image is cleared
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Then we have to confirm scissor testing clears the whole image
+    const std::size_t index = regs.clear_buffers.RT;
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width ||
+           scissor.max_y < regs.rt[index].height;
+}
+
+/// @brief Determine if the depth stencil attachment to be updated has to preserve its contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) {
+    // If we are not clearing, the contents have to be preserved
+    if (!is_clear) {
+        return true;
+    }
+    // For depth stencil clears we only have to confirm scissor test covers the whole image
+    if (!regs.clear_flags.scissor) {
+        return false;
+    }
+    // Make sure the clear covers the whole image
+    const auto& scissor = regs.scissor_test[0];
+    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width ||
+           scissor.max_y < regs.zeta_height;
+}
+
 } // Anonymous namespace
 
 class BufferBindings final {
@@ -344,7 +387,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 
     buffer_cache.Unmap();
 
-    const Texceptions texceptions = UpdateAttachments();
+    const Texceptions texceptions = UpdateAttachments(false);
     SetupImageTransitions(texceptions, color_attachments, zeta_attachment);
 
     key.renderpass_params = GetRenderPassParams(texceptions);
@@ -400,7 +443,7 @@ void RasterizerVulkan::Clear() {
         return;
     }
 
-    [[maybe_unused]] const auto texceptions = UpdateAttachments();
+    [[maybe_unused]] const auto texceptions = UpdateAttachments(true);
     DEBUG_ASSERT(texceptions.none());
     SetupImageTransitions(0, color_attachments, zeta_attachment);
 
@@ -677,9 +720,12 @@ void RasterizerVulkan::FlushWork() {
     draw_counter = 0;
 }
 
-RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
+RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) {
     MICROPROFILE_SCOPE(Vulkan_RenderTargets);
-    auto& dirty = system.GPU().Maxwell3D().dirty.flags;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    auto& dirty = maxwell3d.dirty.flags;
+    auto& regs = maxwell3d.regs;
+
     const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets];
     dirty[VideoCommon::Dirty::RenderTargets] = false;
 
@@ -688,7 +734,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     Texceptions texceptions;
     for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
         if (update_rendertargets) {
-            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true);
+            const bool preserve_contents = HasToPreserveColorContents(is_clear, regs);
+            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents);
         }
         if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
             texceptions[rt] = true;
@@ -696,7 +743,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
     }
 
     if (update_rendertargets) {
-        zeta_attachment = texture_cache.GetDepthBufferSurface(true);
+        const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs);
+        zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents);
     }
     if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
         texceptions[ZETA_TEXCEPTION_INDEX] = true;
@@ -870,10 +918,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
     UNIMPLEMENTED_IF(binding.buffer_offset != 0);
 
     const GPUVAddr gpu_addr = binding.Address();
-    const auto size = static_cast<VkDeviceSize>(binding.buffer_size);
-    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+    const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
 
-    scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
         cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
         cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
     });
@@ -925,8 +973,8 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
             buffer_bindings.AddVertexBinding(DefaultBuffer(), 0);
             continue;
         }
-        const auto [buffer, offset] = buffer_cache.UploadMemory(start, size);
-        buffer_bindings.AddVertexBinding(buffer, offset);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        buffer_bindings.AddVertexBinding(info.handle, info.offset);
     }
 }
 
@@ -948,7 +996,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
         std::tie(buffer, offset) = quad_indexed_pass.Assemble(
             regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
 
@@ -962,7 +1012,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
             break;
         }
         const GPUVAddr gpu_addr = regs.index_array.IndexStart();
-        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+        VkBuffer buffer = info.handle;
+        u64 offset = info.offset;
 
         auto format = regs.index_array.format;
         const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
@@ -1109,10 +1161,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
         Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
     ASSERT(size <= MaxConstbufferSize);
 
-    const auto [buffer_handle, offset] =
+    const auto info =
         buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
-
-    update_descriptor_queue.AddBuffer(buffer_handle, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
 void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
@@ -1126,14 +1177,14 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
         // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
         // default buffer.
         static constexpr std::size_t dummy_size = 4;
-        const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size);
-        update_descriptor_queue.AddBuffer(buffer, 0, dummy_size);
+        const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
+        update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
         return;
     }
 
-    const auto [buffer, offset] = buffer_cache.UploadMemory(
+    const auto info = buffer_cache.UploadMemory(
         actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
-    update_descriptor_queue.AddBuffer(buffer, offset, size);
+    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
 }
 
 void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index c8c187606..83e00e7e9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -159,7 +159,10 @@ private:
 
     void FlushWork();
 
-    Texceptions UpdateAttachments();
+    /// @brief Updates the currently bound attachments
+    /// @param is_clear True when the framebuffer is updated as a clear
+    /// @return Bitfield of attachments being used as sampled textures
+    Texceptions UpdateAttachments(bool is_clear);
 
     std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass);
 
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 82ec9180e..56524e6f3 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -9,6 +9,7 @@
 #include <utility>
 
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
@@ -133,6 +134,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) {
 }
 
 void VKScheduler::WorkerThread() {
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     std::unique_lock lock{mutex};
     do {
         cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; });
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index c765c60a0..689f0d276 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,10 +35,14 @@ public:
     /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
     void Unmap(u64 size);
 
-    VkBuffer Handle() const {
+    VkBuffer Handle() const noexcept {
         return *buffer;
     }
 
+    u64 Address() const noexcept {
+        return 0; // NOTE(review): always 0; presumably no address is tracked here - confirm
+    }
+
 private:
     struct Watch final {
         VKFenceWatch fence;
diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 42eff85d3..0d485a662 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -153,7 +153,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
 
 bool Load(InstanceDispatch& dld) noexcept {
 #define X(name) Proc(dld.name, dld, #name)
-    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties);
+    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) &&
+           X(vkEnumerateInstanceLayerProperties); // true only if all three X(...) checks pass
 #undef X
 }
 
@@ -770,4 +771,23 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp
     return properties;
 }
 
+/// @brief Enumerates the instance layers reported through the given dispatch table
+/// @param dld Instance dispatch table used to call vkEnumerateInstanceLayerProperties
+/// @return The enumerated layers, or std::nullopt when either query does not return VK_SUCCESS
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld) {
+    u32 num = 0;
+    if (dld.vkEnumerateInstanceLayerProperties(&num, nullptr) != VK_SUCCESS) {
+        return std::nullopt;
+    }
+    std::vector<VkLayerProperties> properties(num);
+    if (dld.vkEnumerateInstanceLayerProperties(&num, properties.data()) != VK_SUCCESS) {
+        return std::nullopt;
+    }
+    // The layer list may change between the two calls; on success num holds the number of
+    // entries actually written, so drop any trailing value-initialized ones.
+    properties.resize(num);
+    return properties;
+}
+
 } // namespace Vulkan::vk
diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h
index da42ca88e..d56fdb3f9 100644
--- a/src/video_core/renderer_vulkan/wrapper.h
+++ b/src/video_core/renderer_vulkan/wrapper.h
@@ -141,6 +141,7 @@ struct InstanceDispatch {
     PFN_vkCreateInstance vkCreateInstance;
     PFN_vkDestroyInstance vkDestroyInstance;
     PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties;
+    PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties;
 
     PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT;
     PFN_vkCreateDevice vkCreateDevice;
@@ -996,4 +997,7 @@ private:
 std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(
     const InstanceDispatch& dld);
 
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+    const InstanceDispatch& dld);
+
 } // namespace Vulkan::vk
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 848e46874..b2e88fa20 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -13,55 +13,101 @@
 
 namespace VideoCommon::Shader {
 
+using std::move;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
+using Tegra::Shader::PredCondition;
 
 u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    if (instr.hset2.ftz == 0) {
-        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    PredCondition cond;
+    bool bf;
+    bool ftz;
+    bool neg_a;
+    bool abs_a;
+    bool neg_b;
+    bool abs_b;
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_C:
+    case OpCode::Id::HSET2_IMM:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        bf = instr.Bit(53);
+        ftz = instr.Bit(54);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(56);
+        abs_b = instr.Bit(54); // NOTE(review): same bit as ftz above - confirm the encoding
+        break;
+    case OpCode::Id::HSET2_R:
+        cond = instr.hsetp2.reg.cond;
+        bf = instr.Bit(49);
+        ftz = instr.Bit(50);
+        neg_a = instr.Bit(43);
+        abs_a = instr.Bit(44);
+        neg_b = instr.Bit(31);
+        abs_b = instr.Bit(30);
+        break;
+    default:
+        UNREACHABLE(); // NOTE(review): cond and the flags stay unset if this returns
     }
 
-    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
-    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
-
-    Node op_b = [&]() {
+    Node op_b = [this, instr, opcode] {
         switch (opcode->get().GetId()) {
+        case OpCode::Id::HSET2_C:
+            // Flag as unimplemented because this path is untested; the operand is still read.
+            UNIMPLEMENTED_MSG("HSET2_C is not implemented");
+            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
         case OpCode::Id::HSET2_R:
             return GetRegister(instr.gpr20);
+        case OpCode::Id::HSET2_IMM:
+            return UnpackHalfImmediate(instr, true);
         default:
             UNREACHABLE();
-            return Immediate(0);
+            return Node{};
         }
     }();
-    op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
-    op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
 
-    const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+    if (!ftz) {
+        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+    }
+
+    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
+    op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSET2_R:
+        op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b);
+        [[fallthrough]]; // The register operand is unpacked as well.
+    case OpCode::Id::HSET2_C:
+        op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b);
+        break;
+    default:
+        break; // Other forms (e.g. HSET2_IMM) keep op_b exactly as produced above.
+    }
 
-    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
+    Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+
+    Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b);
 
     const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
 
     // HSET2 operates on each half float in the pack.
     std::array<Node, 2> values;
     for (u32 i = 0; i < 2; ++i) {
-        const u32 raw_value = instr.hset2.bf ? 0x3c00 : 0xffff;
-        const Node true_value = Immediate(raw_value << (i * 16));
-        const Node false_value = Immediate(0);
-
-        const Node comparison =
-            Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
-        const Node predicate = Operation(combiner, comparison, second_pred);
+        const u32 raw_value = bf ? 0x3c00 : 0xffff; // 0x3c00 is 1.0 as a half float
+        Node true_value = Immediate(raw_value << (i * 16));
+        Node false_value = Immediate(0);
 
+        Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
+        Node predicate = Operation(combiner, comparison, second_pred);
         values[i] =
-            Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value);
+            Operation(OperationCode::Select, predicate, move(true_value), move(false_value));
     }
 
-    const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]);
-    SetRegister(bb, instr.gpr0, value);
+    Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]);
+    SetRegister(bb, instr.gpr0, move(value));
 
     return pc;
 }
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index 60b6ad72a..07778dc3e 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         break;
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         if (component == 0) {
             return descriptor.b_type;
         }
@@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
         }
         break;
     }
-    UNIMPLEMENTED_MSG("texture format not implement={}", format);
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
     return ComponentType::FLOAT;
 }
 
@@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
             return 6;
         }
         return 0;
+    case TextureFormat::BF10GF11RF11:
+        if (component == 1 || component == 2) {
+            return 11;
+        }
+        if (component == 0) {
+            return 10;
+        }
+        return 0;
     case TextureFormat::G8R24:
         if (component == 0) {
             return 8;
@@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
         return (component == 0 || component == 1) ? 8 : 0;
     case TextureFormat::G4R4:
         return (component == 0 || component == 1) ? 4 : 0;
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return 0;
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return 0;
 }
 
 std::size_t GetImageComponentMask(TextureFormat format) {
@@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R32_B24G8:
     case TextureFormat::B5G6R5:
     case TextureFormat::B6G5R5:
+    case TextureFormat::BF10GF11RF11:
         return std::size_t{R | G | B};
     case TextureFormat::R32_G32:
     case TextureFormat::R16_G16:
@@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) {
     case TextureFormat::R8:
     case TextureFormat::R1:
         return std::size_t{R};
-    default:
-        UNIMPLEMENTED_MSG("texture format not implement={}", format);
-        return std::size_t{R | G | B | A};
     }
+    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+    return std::size_t{R | G | B | A};
 }
 
 std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) {
@@ -299,7 +307,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type,
             return {std::move(original_value), true};
         }
     default:
-        UNIMPLEMENTED_MSG("Unimplement component type={}", component_type);
+        UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type);
         return {std::move(original_value), true};
     }
 }
@@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
             default:
                 break;
             }
-            UNIMPLEMENTED_MSG("Unimplemented operation={} type={}",
+            UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}",
                               static_cast<u64>(instr.suatom_d.operation.Value()),
                               static_cast<u64>(instr.suatom_d.operation_type.Value()));
             return OperationCode::AtomicImageAdd;
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 94d3a6ae5..0caf3b4f0 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -120,6 +120,9 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap(
     }
     const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)};
     const auto layer{static_cast<u32>(relative_address / layer_size)};
+    if (layer >= params.depth) {
+        return {};
+    }
     const GPUVAddr mipmap_address = relative_address - layer_size * layer;
     const auto mipmap_it =
         Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address);
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 85075e868..6207d8dfe 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -24,6 +24,7 @@
 #include "core/core.h"
 #include "core/memory.h"
 #include "core/settings.h"
+#include "video_core/compatible_formats.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -47,8 +48,8 @@ class RasterizerInterface;
 
 namespace VideoCommon {
 
+using VideoCore::Surface::FormatCompatibility;
 using VideoCore::Surface::PixelFormat;
-
 using VideoCore::Surface::SurfaceTarget;
 using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
 
@@ -595,7 +596,7 @@ private:
         } else {
             new_surface = GetUncachedSurface(gpu_addr, params);
         }
-        const auto& final_params = new_surface->GetSurfaceParams();
+        const SurfaceParams& final_params = new_surface->GetSurfaceParams();
         if (cr_params.type != final_params.type) {
             if (Settings::IsGPULevelExtreme()) {
                 BufferCopy(current_surface, new_surface);
@@ -603,7 +604,7 @@ private:
         } else {
             std::vector<CopyParams> bricks = current_surface->BreakDown(final_params);
             for (auto& brick : bricks) {
-                ImageCopy(current_surface, new_surface, brick);
+                TryCopyImage(current_surface, new_surface, brick);
             }
         }
         Unregister(current_surface);
@@ -694,7 +695,7 @@ private:
                 }
                 const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
                                              src_params.depth);
-                ImageCopy(surface, new_surface, copy_params);
+                TryCopyImage(surface, new_surface, copy_params);
             }
         }
         if (passed_tests == 0) {
@@ -791,7 +792,7 @@ private:
             const u32 width = params.width;
             const u32 height = params.height;
             const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1);
-            ImageCopy(surface, new_surface, copy_params);
+            TryCopyImage(surface, new_surface, copy_params);
         }
         for (const auto& surface : overlaps) {
             Unregister(surface);
@@ -1192,6 +1193,19 @@ private:
         return {};
     }
 
+    /// Performs an image copy when the source and destination formats are copy-compatible;
+    /// otherwise logs an error and skips the copy.
+    void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) {
+        const auto src_format = src->GetSurfaceParams().pixel_format;
+        const auto dst_format = dst->GetSurfaceParams().pixel_format;
+        if (format_compatibility.TestCopy(src_format, dst_format)) {
+            ImageCopy(src, dst, copy);
+            return;
+        }
+        LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}",
+                  static_cast<int>(dst_format), static_cast<int>(src_format));
+    }
+
     constexpr PixelFormat GetSiblingFormat(PixelFormat format) const {
         return siblings_table[static_cast<std::size_t>(format)];
     }
@@ -1241,6 +1255,7 @@ private:
     VideoCore::RasterizerInterface& rasterizer;
 
     FormatLookupTable format_lookup_table;
+    FormatCompatibility format_compatibility;
 
     u64 ticks{};